# In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os

# --- Load the raw measurements ---------------------------------------------

# Dataset label; the plot titles and the final summary are built from it.
country = "sierra_leone"

# Absolute location of the raw CSV on the author's machine.
data_path = r"C:\Users\admin\solar-challenge-week0\data\sierraleone-bumbuna.csv"

# Read the file, then replace the Timestamp column with parsed datetimes so it
# can be used directly as the x-axis of the time-series plot.
df = pd.read_csv(data_path)
df = df.assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']))

# Preview of the first rows. NOTE(review): this is a bare expression, so it is
# only rendered when it is the last statement of a notebook cell.
df.head()

# --- Data-quality overview -------------------------------------------------

# Descriptive statistics as produced by DataFrame.describe().
print(df.describe())

# Boolean mask of null cells, computed once and reused by the reports below.
null_mask = df.isna()

# Null count per column; only columns that actually contain nulls are printed.
missing = null_mask.sum()
print(missing[missing > 0])

# Null share per column, as a percentage of the total row count.
print((missing / len(df)) * 100)

# Columns in which more than 5% of the rows are null.
null_pct = null_mask.mean() * 100
high_nulls = null_pct[null_pct > 5]
print("Columns with >5% missing values:\n", high_nulls)

# --- Outlier removal, imputation and export --------------------------------

# Columns screened for outliers and, afterwards, imputed.
numeric_cols = ['GHI','DNI','DHI','ModA','ModB','WS','WSgust']

# Absolute z-score of every cell, computed column-wise.
# nan_policy='omit' matters: with SciPy's default ('propagate') a single NaN in
# a column turns every z-score of that column into NaN, and since NaN > 3 is
# False the column would silently never flag an outlier.
# The values are handed over as a plain float array so the result type does not
# depend on how the installed SciPy treats DataFrame input.
z_scores = np.abs(
    stats.zscore(df[numeric_cols].to_numpy(dtype=float), nan_policy='omit')
)

# A row is an outlier when any screened column lies more than 3 standard
# deviations from that column's mean. Missing cells compare as False here and
# are left for the imputation step below.
outliers = pd.Series((z_scores > 3).any(axis=1), index=df.index)

print(f"Outliers detected: {outliers.sum()} rows")

# Drop the flagged rows; .copy() so the assignment below does not write into a
# view of `df`.
df_clean = df[~outliers].copy()

# Fill the remaining gaps with each column's median, taken from the
# outlier-free data.
df_clean[numeric_cols] = df_clean[numeric_cols].fillna(df_clean[numeric_cols].median())

# Persist the cleaned data next to the raw file (row index not written).
df_clean.to_csv(r"C:\Users\admin\solar-challenge-week0\data\sierraleone-bumbuna_clean.csv", index=False)

# Time series of the three radiation columns drawn on one set of axes.
plt.figure(figsize=(12,6))
for radiation_col in ('GHI', 'DNI', 'DHI'):
    plt.plot(df_clean['Timestamp'], df_clean[radiation_col], label=radiation_col)
plt.legend()
plt.title(f"{country.capitalize()} Solar Radiation")
plt.xlabel("Time")
# Unit label is W/m² — the superscript two had been mis-decoded into mojibake.
# Written as an escape so the source stays safe under any file encoding.
plt.ylabel("W/m\u00b2")
plt.show()

# Mean ModA / ModB reading for each distinct value of the Cleaning column,
# shown as grouped bars.
cleaning_effect = (
    df_clean[['Cleaning', 'ModA', 'ModB']]
    .groupby('Cleaning')
    .mean()
)
cleaning_ax = cleaning_effect.plot.bar(figsize=(8,6))
cleaning_ax.set_title(f"{country.capitalize()} Cleaning Impact")
plt.show()

# Correlation heatmap: pairwise correlations between the radiation columns,
# the two TMod columns, Tamb and RH, with the coefficient annotated per cell.
heatmap_cols = ['GHI','DNI','DHI','TModA','TModB','Tamb','RH']
corr_matrix = df_clean[heatmap_cols].corr()

plt.figure(figsize=(10,6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title(f"{country.capitalize()} Correlation Heatmap")
plt.show()

# Scatter plots
# Each relationship gets its own axes. Without an explicit `ax`, seaborn draws
# on the current axes, so two consecutive calls would overlay WS/GHI points and
# RH/Tamb points on one shared, meaningless scale.
scatter_fig, (ws_ax, rh_ax) = plt.subplots(1, 2, figsize=(12,5))
sns.scatterplot(data=df_clean, x='WS', y='GHI', ax=ws_ax)
ws_ax.set_title("WS vs GHI")
sns.scatterplot(data=df_clean, x='RH', y='Tamb', ax=rh_ax)
rh_ax.set_title("RH vs Tamb")
plt.show()
# Side-by-side histograms (with KDE overlay) of the GHI and WS columns.
fig, ax = plt.subplots(1,2, figsize=(12,5))
panel_specs = zip(
    ax,
    ('GHI', 'WS'),
    ("GHI Distribution", "Wind Speed Distribution"),
)
for panel, column, heading in panel_specs:
    sns.histplot(df_clean[column], kde=True, ax=panel)
    panel.set_title(heading)
plt.suptitle(f"{country.capitalize()} - Distribution Analysis")
plt.show()

# RH against Tamb as a semi-transparent scatter, so dense regions stay readable.
rh_tamb_fig, rh_tamb_ax = plt.subplots(figsize=(8,5))
sns.scatterplot(data=df_clean, x='RH', y='Tamb', alpha=0.5, ax=rh_tamb_ax)
rh_tamb_ax.set_title(f"{country.capitalize()} - RH vs Ambient Temperature")
plt.show()

# Correlation coefficient between the two columns (Series.corr defaults to
# Pearson), reported to three decimals.
rh_values = df_clean['RH']
tamb_values = df_clean['Tamb']
correlation = rh_values.corr(tamb_values)
print(f"Correlation between RH and Tamb: {correlation:.3f}")


# Wind rose (simplified): a plain 16-bin histogram of the WD column instead of
# a polar plot.
wd_fig, wd_ax = plt.subplots(figsize=(8,6))
wd_ax.hist(df_clean['WD'], bins=16)
wd_ax.set_title(f"{country.capitalize()} Wind Direction Distribution")
plt.show()

# Solar radiation vs. temperature
# Bubble chart: Tamb on x, GHI on y, marker area taken from the RH column.
plt.scatter(df_clean['Tamb'], df_clean['GHI'], s=df_clean['RH'], alpha=0.5)
# Axis labels restored to "°C" and "W/m²" — both symbols had been mis-decoded
# into mojibake. Escapes keep the source safe under any file encoding.
plt.xlabel("Ambient Temperature (\u00b0C)")
plt.ylabel("GHI (W/m\u00b2)")
plt.title(f"{country.capitalize()} GHI vs Temperature (Bubble = RH)")
plt.show()

from scipy.stats import skew, kurtosis

# Shape statistics for the three radiation columns, printed to two decimals.
# scipy's kurtosis() defaults to the Fisher definition (a normal distribution
# scores 0).
for col in ['GHI','DNI','DHI']:
    column_values = df_clean[col]
    print(f"{col} - Skew: {skew(column_values):.2f}, Kurtosis: {kurtosis(column_values):.2f}")
# The stray `plt.Figure(figsize=(8,6))` that used to follow was removed: it
# built a Figure object that was never drawn on or shown.

# --- Final summary ----------------------------------------------------------

# Count only the gaps that imputation actually filled: NaNs in the imputed
# columns, restricted to the rows kept after outlier removal. Summing the
# per-column null counts of the whole raw frame would also include columns
# that are never imputed and rows that were dropped as outliers.
imputed_count = int(df.loc[~outliers, numeric_cols].isna().sum().sum())

# \U0001F50D is the magnifying-glass emoji and \u2705 the check-mark emoji; both
# had been mis-decoded into mojibake. Escapes keep the source encoding-safe.
print(f"""
\U0001F50D {country.capitalize()} Dataset Summary:
- Total Records: {len(df)}
- Cleaned Records: {len(df_clean)}
- Missing Values Fixed: {imputed_count}
- Outliers Removed: {outliers.sum()}
- Correlation (RH vs Tamb): {correlation:.2f}

\u2705 Data ready for modeling and region comparison.
""")