In [None]:
import json
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

def merge_files(raw_folder, output_file):
    """Merge raw electricity JSON files and weather CSV files into one CSV.

    Electricity data is read from every ``.json`` file in
    ``<raw_folder>/electricity_raw_data``; the records are taken from
    ``response -> data`` and their ``period`` field (``%Y-%m-%dT%H``) becomes
    a ``datetime`` column. Weather data is read from every ``.csv`` file in
    ``<raw_folder>/weather_raw_data``; its ``date`` column is renamed to
    ``datetime``. Both sides are inner-joined on ``datetime``.

    Args:
        raw_folder: Folder containing the two raw-data sub-folders.
        output_file: Path of the CSV file to write.

    Returns:
        None. The merged table is written to ``output_file``; if either side
        has no usable data, a message is printed and nothing is written.
    """
    electricity_folder = os.path.join(raw_folder, "electricity_raw_data")
    weather_folder = os.path.join(raw_folder, "weather_raw_data")

    def _list_files(folder, extension):
        # A missing folder is handled like an empty one so the caller reaches
        # the "No valid files" message instead of a FileNotFoundError.
        if not os.path.isdir(folder):
            return []
        # Sorted so the row order of the output does not depend on the
        # arbitrary order returned by os.listdir.
        return sorted(
            os.path.join(folder, name)
            for name in os.listdir(folder)
            if name.endswith(extension)
        )

    electricity_data = []
    for path in _list_files(electricity_folder, '.json'):
        with open(path, 'r', encoding='utf-8') as handle:
            records = json.load(handle)['response']['data']
        if not records:
            # No records means no 'period' column to parse; skip the file.
            continue
        frame = pd.json_normalize(records)
        frame['datetime'] = pd.to_datetime(frame['period'], format='%Y-%m-%dT%H')
        electricity_data.append(frame.drop(columns=['period']))

    weather_data = []
    for path in _list_files(weather_folder, '.csv'):
        frame = pd.read_csv(path).rename(columns={'date': 'datetime'})
        frame['datetime'] = pd.to_datetime(frame['datetime'], errors='coerce')
        weather_data.append(frame)

    if not electricity_data or not weather_data:
        print("No valid files found in raw folder.")
        return

    electricity_df = pd.concat(electricity_data, ignore_index=True)
    weather_df = pd.concat(weather_data, ignore_index=True)

    if weather_df['datetime'].dtype == 'object':
        # An object column here means the files disagreed on timezone
        # information; utc=True brings every value onto one timezone so the
        # .dt accessor below is available.
        weather_df['datetime'] = pd.to_datetime(
            weather_df['datetime'], errors='coerce', utc=True
        )

    # Rows whose timestamp could not be parsed can never be matched, so drop
    # them explicitly, then strip the timezone to compare with the tz-naive
    # electricity timestamps.
    weather_df = weather_df.dropna(subset=['datetime'])
    weather_df = weather_df.assign(
        datetime=weather_df['datetime'].dt.tz_localize(None)
    )

    merged_df = pd.merge(electricity_df, weather_df, on='datetime', how='inner')
    merged_df.to_csv(output_file, index=False)
    print(f"Merged file saved at: {output_file}")

# Example usage: point raw_folder_path at the folder that holds the
# "electricity_raw_data" and "weather_raw_data" sub-folders.
raw_folder_path = "raw"
output_file_path = "merged_data.csv"
merge_files(raw_folder=raw_folder_path, output_file=output_file_path)

In [None]:
import pandas as pd
import numpy as np

# Load the dataset
input_file = "merged_data.csv"
df = pd.read_csv(input_file)

# Remove leading/trailing spaces from column names
df.columns = df.columns.str.strip()

# Log original record count
print(f"Total records before cleaning: {df.shape[0]}")

# 1. Identifying Missing Data (share of missing cells per column, in percent)
missing_data = df.isnull().sum() / len(df) * 100
print("Missing Data (%):\n", missing_data)

# 2. Handling Missing Data
threshold = 50  # Drop columns with >50% missing data
# dropna(thresh=...) is documented as an integer count of required non-missing
# values; rounding the fraction up keeps exactly the same columns.
min_non_missing = int(np.ceil(len(df) * (threshold / 100)))
df = df.dropna(axis=1, thresh=min_non_missing)

# Fill missing values in numerical columns with the median
for col in df.select_dtypes(include=[np.number]).columns:
    df[col] = df[col].fillna(df[col].median())

# Fill missing values in categorical columns with the mode
for col in df.select_dtypes(include=["object"]).columns:
    modes = df[col].mode()
    # mode() is empty when the column holds no values at all (for example a
    # header-only input file); there is nothing to impute with in that case.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# 3. Data Type Conversions
# Convert 'datetime' column to timezone-aware (UTC) datetimes; values that
# cannot be parsed become NaT.
if "datetime" in df.columns:
    df["datetime"] = pd.to_datetime(df["datetime"], errors="coerce", utc=True)

# Text columns that hold only numbers become numeric; the rest become category
for col in df.select_dtypes(include=["object"]).columns:
    try:
        df[col] = pd.to_numeric(df[col])
    except (ValueError, TypeError):
        df[col] = df[col].astype("category")

# 4. Handling Duplicates
duplicate_count = df.duplicated().sum()
print(f"Total duplicate records: {duplicate_count}")
df = df.drop_duplicates()
print(f"Total records after removing duplicates: {df.shape[0]}")

# 5. Feature Engineering
if "datetime" in df.columns:
    df["hour"] = df["datetime"].dt.hour
    df["day"] = df["datetime"].dt.day
    df["month"] = df["datetime"].dt.month
    df["year"] = df["datetime"].dt.year
    df["weekday"] = df["datetime"].dt.weekday
    # weekday is 0 (Monday) .. 6 (Sunday), so 5 and 6 are Saturday/Sunday
    df["is_weekend"] = df["weekday"].isin([5, 6]).astype(int)

    # One 0/1 indicator per season, derived from the month, instead of a
    # single categorical 'season' column.
    df["season_Winter"] = df["month"].isin([12, 1, 2]).astype(int)
    df["season_Spring"] = df["month"].isin([3, 4, 5]).astype(int)
    df["season_Summer"] = df["month"].isin([6, 7, 8]).astype(int)
    df["season_Fall"] = df["month"].isin([9, 10, 11]).astype(int)
# 7. One-Hot Encoding with Integer Conversion
def one_hot_encode(df, column_name, drop_first=False):
    """One-hot encode ``column_name`` into integer (0/1) indicator columns.

    Args:
        df: Input DataFrame.
        column_name: Name of the column to encode. The indicator columns are
            named ``<column_name>_<value>``.
        drop_first: If True, the first category's indicator is omitted.

    Returns:
        A new DataFrame in which ``column_name`` is replaced by its indicator
        columns, or ``df`` itself when ``column_name`` is not a column.
    """
    if column_name not in df.columns:
        return df
    # Requesting dtype=int from get_dummies converts exactly the new indicator
    # columns. Selecting them afterwards by "name contains column_name" would
    # also cast unrelated columns that merely share the name as a substring.
    return pd.get_dummies(df, columns=[column_name], drop_first=drop_first, dtype=int)

# Encode "Province" as indicator columns; the first level is dropped to avoid
# the dummy variable trap.
df = one_hot_encode(df, column_name="Province", drop_first=True)

# 8. Replace "subba-name" with one 0/1 column per distinct value ("subba_*")
if "subba-name" in df.columns:
    subba_dummies = pd.get_dummies(df["subba-name"], prefix="subba", dtype=int)
    df = pd.concat([df.drop(columns=["subba-name"]), subba_dummies], axis=1)

# 9. Replace "weather_name" with one 0/1 column per distinct value
#    ("weather_*"); no level is dropped.
if "weather_name" in df.columns:
    weather_dummies = pd.get_dummies(df["weather_name"], prefix="weather", dtype=int)
    df = pd.concat([df.drop(columns=["weather_name"]), weather_dummies], axis=1)

# 10. Normalizing numerical features (optional) is not applied here; the
#     numeric columns are written out unscaled.

# Persist the cleaned table
output_csv = "cleaned_data.csv"
df.to_csv(output_csv, index=False)
print(f"Cleaned data saved as {output_csv}")


In [None]:
import scipy.stats as stats
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller

def load_data(file_path):
    """Read the CSV at ``file_path``, parsing its 'datetime' column as dates."""
    return pd.read_csv(file_path, parse_dates=['datetime'])

def compute_statistics(df):
    """Compute summary statistics for every numeric column of ``df``.

    Missing values are ignored consistently by all five statistics.

    Args:
        df: Input DataFrame; non-numeric columns are skipped.

    Returns:
        A DataFrame with one column per numeric input column and the rows
        'Mean', 'Median', 'Standard Deviation' (sample, ddof=1), 'Skewness'
        and 'Kurtosis' (both as computed by scipy.stats).
    """
    stats_summary = {}

    for column in df.select_dtypes(include=[np.number]).columns:
        series = df[column]
        stats_summary[column] = {
            'Mean': series.mean(),
            # np.median() does not skip NaN, so a single missing value would
            # turn the median into NaN while the other statistics ignore it.
            'Median': series.median(),
            'Standard Deviation': series.std(ddof=1),
            # float() unwraps whatever scalar-like object scipy returns so the
            # summary table holds plain numbers.
            'Skewness': float(stats.skew(series, nan_policy='omit')),
            'Kurtosis': float(stats.kurtosis(series, nan_policy='omit')),
        }

    return pd.DataFrame(stats_summary)

def plot_time_series(df):
    """Show a line chart of the 'value' column against the 'datetime' column."""
    _, ax = plt.subplots(figsize=(12, 6))
    sns.lineplot(
        data=df,
        x='datetime',
        y='value',
        label="Electricity Demand",
        color="blue",
        ax=ax,
    )
    ax.set_xlabel("Time")
    ax.set_ylabel("Electricity Demand (MWh)")
    ax.set_title("Electricity Demand Over Time")
    ax.legend()
    ax.grid(True)
    plt.show()

def univariate_analysis(df, column):
    """Plot and summarise the distribution of one column.

    Draws a histogram (with KDE), a boxplot and a density plot side by side,
    then prints ``describe()``, skewness and kurtosis for the column. If
    ``column`` is not in ``df`` a message is printed and nothing is plotted.
    """
    if column not in df.columns:
        print(f"Column '{column}' not found in dataset!")
        return

    series = df[column]

    sns.set_style("whitegrid")
    _, (hist_ax, box_ax, density_ax) = plt.subplots(1, 3, figsize=(18, 5))

    sns.histplot(series, bins=30, kde=True, ax=hist_ax, color='blue')
    hist_ax.set_title(f'Histogram of {column}')

    sns.boxplot(y=series, ax=box_ax, color='green')
    box_ax.set_title(f'Boxplot of {column}')

    sns.kdeplot(series, fill=True, color='red', ax=density_ax)
    density_ax.set_title(f'Density Plot of {column}')

    plt.tight_layout()
    plt.show()

    print(series.describe())
    print(f"\nSkewness: {series.skew():.4f}")
    print(f"Kurtosis: {series.kurt():.4f}")

def correlation_analysis(df):
    """Show a heatmap of the pairwise correlations between numeric columns."""
    correlation_matrix = df.select_dtypes(include=['number']).corr()

    _, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(
        correlation_matrix,
        annot=True,
        cmap='coolwarm',
        fmt=".2f",
        linewidths=0.5,
        ax=ax,
    )
    ax.set_title("Correlation Matrix Heatmap")
    plt.show()

def time_series_analysis(df):
    """Decompose the hourly 'value' series and test it for stationarity.

    The 'value' column is averaged into hourly bins (gaps are forward-filled),
    decomposed additively with a period of 24 observations, plotted, and then
    passed to the Augmented Dickey-Fuller test, whose results are printed.

    Args:
        df: DataFrame with a 'datetime' column (datetime dtype) and a numeric
            'value' column. The caller's frame is not modified.
    """
    # Only 'value' is analysed, so resample that column alone: averaging the
    # whole frame raises on pandas >= 2.0 as soon as any non-numeric column is
    # present. '60min' is the hourly bin width spelled with an alias that is
    # valid across pandas versions (the 'H' alias is deprecated).
    hourly_value = (
        df.set_index('datetime')['value']
        .resample('60min')
        .mean()
        .ffill()
    )

    decomposition = seasonal_decompose(hourly_value, model='additive', period=24)

    plt.figure(figsize=(10, 8))
    plt.subplot(411)
    plt.plot(hourly_value, label='Original Time Series')
    plt.legend()
    plt.subplot(412)
    plt.plot(decomposition.trend, label='Trend', color='green')
    plt.legend()
    plt.subplot(413)
    plt.plot(decomposition.seasonal, label='Seasonality', color='orange')
    plt.legend()
    plt.subplot(414)
    plt.plot(decomposition.resid, label='Residuals', color='red')
    plt.legend()
    plt.tight_layout()
    plt.show()

    print("\nPerforming Augmented Dickey-Fuller Test...")
    adf_test = adfuller(hourly_value.dropna())
    print(f"ADF Statistic: {adf_test[0]:.4f}")
    print(f"p-value: {adf_test[1]:.4f}")
    for key, value in adf_test[4].items():
        print(f"Critical Value ({key}): {value:.4f}")
    # Reject the unit-root null hypothesis at the 5% level
    if adf_test[1] < 0.05:
        print("\nConclusion: The time series is stationary.")
    else:
        print("\nConclusion: The time series is NOT stationary.")

def main():
    """Run the exploratory analysis on 'cleaned_data.csv'.

    Prints the statistical summary, plots the time series, runs the univariate
    analysis for a fixed list of columns, then the correlation and time-series
    analyses.
    """
    df = load_data('cleaned_data.csv')

    print("Statistical Summary:")
    print(compute_statistics(df))

    plot_time_series(df)

    banner = '=' * 40
    columns_to_profile = (
        'value', 'temperature_2m', 'hour', 'day',
        'month', 'year', 'weekday', 'is_weekend',
    )
    for col in columns_to_profile:
        print(f"\n{banner}\nUnivariate Analysis: {col}\n{banner}")
        univariate_analysis(df, col)

    correlation_analysis(df)

    time_series_analysis(df)


if __name__ == "__main__":
    main()

In [None]:
def load_data(file_path):
    """Read the CSV file at ``file_path`` into a DataFrame and return it."""
    frame = pd.read_csv(file_path)
    return frame

def remove_duplicates(df):
    """Return ``df`` without repeated rows; the first occurrence is kept."""
    deduplicated = df.drop_duplicates(keep='first')
    return deduplicated

def normalize_value_column(df):
    """Log-transform the 'value' column in place and return ``df``.

    Negative values are clipped to 0 before ``log1p`` is applied, and missing
    values end up as 0. A frame without a 'value' column is returned untouched.
    """
    if 'value' not in df.columns:
        return df

    non_negative = df['value'].clip(lower=0)
    df['value'] = np.log1p(non_negative).fillna(0)
    return df

def main():
    """De-duplicate 'cleaned_data.csv', log-transform 'value' and save the result."""
    source_path = "cleaned_data.csv"  # Update this with the actual path

    # Load -> drop repeated rows -> normalize the 'value' column
    df = normalize_value_column(remove_duplicates(load_data(source_path)))

    # Write the transformed dataset
    df.to_csv("cleaned_normalized_data.csv", index=False)
    print("Cleaned dataset with normalized 'value' column saved as 'cleaned_normalized_data.csv'")


if __name__ == "__main__":
    main()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

def load_data(file_path):
    """Return the contents of the CSV file at ``file_path`` as a DataFrame."""
    return pd.read_csv(filepath_or_buffer=file_path)

def train_regression_model(df):
    """Fit a linear regression on 'value' and show residual diagnostics.

    Features are the calendar columns, 'temperature_2m', the four season
    indicators and every column whose name starts with 'subba_'. The data is
    split 80/20 (random_state=42), the model is fitted on the training part,
    and MSE, RMSE and R² on the test part are printed. Three plots follow:
    actual vs. predicted, residuals vs. predicted, and a residual histogram.

    Metrics are in the units of the 'value' column as passed in, so they are
    on the transformed scale if the caller transformed that column.

    Args:
        df: DataFrame containing the feature columns and the 'value' target.

    Returns:
        None. If a required column is absent, the missing names are printed
        and nothing is trained.
    """
    # Base features
    base_features = ['hour', 'day', 'month', 'weekday', 'temperature_2m']

    # Seasonal features
    season_features = ['season_Winter', 'season_Spring', 'season_Summer', 'season_Fall']

    # Subba features (all columns that start with 'subba_')
    subba_features = [col for col in df.columns if col.startswith('subba_')]

    # Combine all features
    features = base_features + season_features + subba_features
    target = 'value'

    # Ensure required columns exist, and say which ones do not
    missing_columns = [col for col in features + [target] if col not in df.columns]
    if missing_columns:
        print(f"Some required columns are missing in the dataset: {missing_columns}")
        return

    # Rows without a target cannot be learned from or scored against, so they
    # are left out rather than given an invented target value.
    labelled = df.dropna(subset=[target])
    X = labelled[features]
    y = labelled[target]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fill missing feature values with means taken from the training rows only,
    # so nothing about the test rows leaks into training.
    train_means = X_train.mean()
    X_train = X_train.fillna(train_means)
    X_test = X_test.fillna(train_means)

    # Train linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predict on test data
    y_pred = model.predict(X_test)

    # Compute residuals
    residuals = y_test - y_pred

    # Evaluate model performance
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f'Mean Squared Error: {mse}')
    print(f'Root Mean Squared Error: {rmse}')
    print(f'R² Score: {r2}')

    # 1. Plot Actual vs. Predicted values
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=y_test, y=y_pred, alpha=0.6, edgecolor="k")
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)  # Identity line
    plt.xlabel("Actual Values")
    plt.ylabel("Predicted Values")
    plt.title("Actual vs. Predicted Values")
    plt.grid(True)
    plt.show()

    # 2. Residuals Plot (Residuals vs. Predicted)
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=y_pred, y=residuals, alpha=0.6, edgecolor="k")
    plt.axhline(y=0, color='r', linestyle='--')  # Zero reference line
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.title("Residuals vs. Predicted Values")
    plt.grid(True)
    plt.show()

    # 3. Histogram of Residuals
    plt.figure(figsize=(10, 6))
    sns.histplot(residuals, bins=30, kde=True)
    plt.axvline(x=0, color='r', linestyle='--')  # Zero reference line
    plt.xlabel("Residuals")
    plt.ylabel("Frequency")
    plt.title("Histogram of Residuals")
    plt.show()

def main():
    """Load 'cleaned_normalized_data.csv' and train/evaluate the regression model."""
    dataset_path = "cleaned_normalized_data.csv"  # Update this with the actual file path
    train_regression_model(load_data(dataset_path))


if __name__ == "__main__":
    main()
