In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

In [None]:
# Load the dataset
# Every later cell reads the global `data`, so a missing file must stop the
# notebook here instead of surfacing later as an unrelated NameError.
file_path = '../src/combined_solar_data.csv'

try:
    data = pd.read_csv(file_path)
except FileNotFoundError:
    # Keep the friendly hint, then re-raise so "Run All" halts at the real cause.
    print(f"File not found at: {file_path}. Please check the file path.")
    raise
else:
    print("Dataset loaded successfully!")
    print(data.head())  # Inspect the first few rows

In [None]:
# Summary Statistics
# Keep the describe() table in `summary_stats` and print it under a heading.
summary_stats = data.describe()
print("Summary Statistics:", summary_stats, sep="\n")

In [None]:
# Data Quality Check
# Per-column count of null entries (all columns) and of values below zero
# (numeric columns only).
numeric_only = data.select_dtypes(include=[np.number])
missing_values = data.isna().sum()
negative_values = numeric_only.lt(0).sum()

print("\nMissing Values:", missing_values, sep="\n")
print("\nNegative Values:", negative_values, sep="\n")

In [None]:
# Impute gaps in the numeric columns with each column's own median;
# non-numeric columns are left untouched.
numeric_cols = data.select_dtypes(include=[np.number]).columns
column_medians = data[numeric_cols].median()
data[numeric_cols] = data[numeric_cols].fillna(column_medians)
print("Missing values filled for numeric columns.")

In [None]:
# Detecting outliers using Z-score
# A value is flagged when its absolute z-score exceeds 3. np.where yields one
# index pair per flagged cell, so the printed figure counts cells, not rows.
numeric_frame = data.select_dtypes(include=[np.number])
z_scores = zscore(numeric_frame)
exceeds_threshold = np.abs(z_scores) > 3
outliers = np.where(exceeds_threshold)
print("\nNumber of outliers:", len(outliers[0]))

In [None]:
# Time Series Analysis
# Plot only every 100th row to keep rendering fast.
downsampled_data = data.iloc[::100, :]

# The dataset is loaded with read_csv without date parsing, so 'Timestamp'
# holds strings; matplotlib would treat those as categorical labels (one tick
# per point, no real time scale). Parse them into datetimes for a proper time
# axis. Unparseable entries become NaT and are simply not drawn. A local Series
# is used so the sliced frame itself is not modified.
timestamps = pd.to_datetime(downsampled_data['Timestamp'], errors='coerce')

# Plotting: one line per measured quantity on a shared axis.
plt.figure(figsize=(15, 6))
for series_name in ('GHI', 'DNI', 'DHI', 'Tamb'):
    plt.plot(timestamps, downsampled_data[series_name], label=series_name)
plt.xlabel('Time')
plt.ylabel('Values')
plt.title('Time Series Analysis of Solar Radiation and Temperature')
plt.legend()
plt.show()

In [None]:
# Impact of Cleaning on Sensor Readings
# One box plot per module sensor column, grouped by the 'Cleaning' column.
for sensor_col, chart_title in (
    ('ModA', 'Impact of Cleaning on Sensor ModA'),
    ('ModB', 'Impact of Cleaning on Sensor ModB'),
):
    plt.figure(figsize=(15, 6))
    sns.boxplot(x=data['Cleaning'], y=data[sensor_col])
    plt.title(chart_title)
    plt.show()

In [None]:
# Correlation Analysis
# Restrict to numeric columns before computing pairwise correlations, then
# render the matrix as an annotated heatmap.
numeric_data = data.select_dtypes(include=[np.number])
corr_matrix = numeric_data.corr()

heatmap_options = dict(annot=True, cmap='coolwarm')
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Wind Analysis
# Scatter of wind direction (x) against wind speed (y), coloured by 'Cleaning'.
sns.scatterplot(data=data, x='WD', y='WS', hue='Cleaning')
plt.title('Wind Direction vs Wind Speed')
plt.show()

In [None]:
# Temperature Analysis
# Scatter of 'RH' (x) against 'Tamb' (y), coloured by 'GHI' on the viridis palette.
sns.scatterplot(data=data, x='RH', y='Tamb', hue='GHI', palette='viridis')
plt.title('Relative Humidity vs Ambient Temperature')
plt.show()

In [None]:
# Histograms
# One 30-bin histogram with a KDE overlay per column, laid out on a 3x2 grid
# (five panels used; the sixth slot is never created).
columns_to_plot = ['GHI', 'DNI', 'DHI', 'WS', 'Tamb']
fig = plt.figure(figsize=(15, 10))
for slot, col in enumerate(columns_to_plot, start=1):
    panel = fig.add_subplot(3, 2, slot)
    sns.histplot(data[col], kde=True, bins=30, ax=panel)
    panel.set_title(f'Histogram of {col}')
fig.tight_layout()
plt.show()

In [None]:
# Bubble Chart
# GHI on x, Tamb on y; marker size is taken from RH and marker colour from BP.
plt.figure(figsize=(10, 6))
bubble = plt.scatter(
    x=data['GHI'],
    y=data['Tamb'],
    s=data['RH'],
    c=data['BP'],
    cmap='viridis',
    alpha=0.5,
)
plt.colorbar(bubble, label='Barometric Pressure')
plt.title('GHI vs Tamb with RH as Bubble Size and BP as Color')
plt.xlabel('GHI')
plt.ylabel('Tamb')
plt.show()

In [None]:
# Clean up Comments column if entirely null
# Only drop the column when it exists and holds no data at all, so that any
# real comments are preserved in the exported file. The frame is rebound
# rather than mutated in place, which keeps this cell safe to re-run.
if 'Comments' in data.columns and data['Comments'].isna().all():
    data = data.drop(columns=['Comments'])

# Save cleaned data (written to the current working directory, without the index)
data.to_csv('cleaned_solar_data.csv', index=False)