In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_model(df, target_col, features_cols):
    """Fit a linear regression on a train/test split and print test metrics.

    Args:
        df: Frame containing both the feature columns and the target column.
        target_col: Name of the column to predict.
        features_cols: Names of the columns used as predictors.

    Returns:
        Tuple ``(model, y_test, y_pred)``: the fitted ``LinearRegression``,
        the held-out target values and the predictions made for them.
    """
    features, target = df[features_cols], df[target_col]

    # 20% of the rows are held out; the fixed seed keeps the split repeatable.
    split = train_test_split(features, target, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split

    model = LinearRegression().fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    report = (
        ('Mean Squared Error (MSE)', mse),
        ('Root Mean Squared Error (RMSE)', np.sqrt(mse)),
        ('R2 Score', r2_score(y_test, y_pred)),
    )
    for label, value in report:
        print(f'{label}: {value}')

    return model, y_test, y_pred

def plot_actual_and_predicted(y_test, y_pred):
    """Scatter predicted against actual values with a dashed identity line.

    Args:
        y_test: Actual target values.
        y_pred: Predicted values, aligned position-by-position with ``y_test``.
    """
    lowest, highest = min(y_test), max(y_test)

    _, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(y_test, y_pred, color='blue', alpha=0.7)
    # Points on this diagonal are predictions that equal the actual value.
    ax.plot([lowest, highest], [lowest, highest], color='red', linestyle='--')
    ax.set_title('Actual vs Predicted Electricity Demand')
    ax.set_xlabel('Actual Demand')
    ax.set_ylabel('Predicted Demand')
    plt.show()

def residual_analysis(y_test, y_pred):
    """Plot residuals (actual minus predicted) against the predicted values.

    Args:
        y_test: Actual target values.
        y_pred: Predicted values, aligned position-by-position with ``y_test``.
    """
    errors = y_test - y_pred

    _, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(y_pred, errors, color='blue', alpha=0.7)
    # Zero line: residuals above it are under-predictions, below over-predictions.
    ax.axhline(y=0, color='red', linestyle='--')
    ax.set_title('Residuals vs Predicted Values')
    ax.set_xlabel('Predicted Values')
    ax.set_ylabel('Residuals')
    plt.show()

def perform_regression_model(df):
    """Train the demand regression, show its diagnostic plots and return it.

    The model predicts the ``demand`` column from the temperature and the
    calendar features listed below.

    Args:
        df: Frame containing ``demand`` and every feature column.

    Returns:
        Tuple ``(model, y_pred)``: the fitted model and its test-set predictions.
    """
    predictors = [
        "temperature",
        "year",
        "month",
        "day",
        "hour",
        "day_of_week",
        "is_weekend",
    ]

    model, y_test, y_pred = evaluate_model(df, "demand", predictors)

    # Both diagnostics are drawn from the same held-out rows.
    for diagnostic in (plot_actual_and_predicted, residual_analysis):
        diagnostic(y_test, y_pred)

    return model, y_pred

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore


# IQR-based Outlier Detection
def detect_outliers_iqr(df):
    """Flag rows lying outside the 1.5 * IQR fences of each numeric column.

    Args:
        df: Frame to inspect; non-numeric columns are ignored.

    Returns:
        Dict mapping each numeric column name to the index labels of the rows
        whose value in that column is below ``Q1 - 1.5 * IQR`` or above
        ``Q3 + 1.5 * IQR``.
    """
    flagged = {}
    for name in df.select_dtypes(include=[np.number]).columns:
        series = df[name]
        first_quartile = series.quantile(0.25)
        third_quartile = series.quantile(0.75)
        fence = 1.5 * (third_quartile - first_quartile)

        too_low = series < first_quartile - fence
        too_high = series > third_quartile + fence
        flagged[name] = df[too_low | too_high].index
    return flagged

# Z-score-based Outlier Detection
def detect_outliers_zscore(df, threshold=3):
    """Flag rows whose absolute z-score exceeds ``threshold`` per numeric column.

    Missing values are left out of the mean and standard deviation, so one
    NaN no longer turns every z-score of the column into NaN (which made the
    comparison below false for all rows and hid every outlier). Rows that are
    missing themselves are never flagged.

    Args:
        df: Frame to inspect; non-numeric columns are ignored.
        threshold: Absolute z-score above which a value counts as an outlier.

    Returns:
        Dict mapping each numeric column name to the index labels of its
        flagged rows.
    """
    outliers = {}
    for col in df.select_dtypes(include=[np.number]).columns:
        values = df[col].to_numpy(dtype=float, na_value=np.nan)
        z_scores = zscore(values, nan_policy="omit")
        # NaN (and, on SciPy versions that return masked arrays, masked)
        # scores must count as "not an outlier".
        with np.errstate(invalid="ignore"):
            mask = np.ma.filled(np.abs(z_scores) > threshold, False)
        outliers[col] = df.index[np.asarray(mask, dtype=bool)]
    return outliers

def plot_data_before_after(df, df_cleaned_iqr, df_cleaned_zscore):
    """Draw side-by-side boxplots of the original and the two treated frames.

    Args:
        df: The untreated data.
        df_cleaned_iqr: Data after the IQR-based outlier handling.
        df_cleaned_zscore: Data after the z-score-based outlier handling.
    """
    panels = (
        (df, 'Original Data'),
        (df_cleaned_iqr, 'After Handling IQR Outliers'),
        (df_cleaned_zscore, 'After Handling Z-Score Outliers'),
    )

    _, axes = plt.subplots(1, 3, figsize=(14, 6))
    for ax, (frame, title) in zip(axes, panels):
        sns.boxplot(data=frame, ax=ax)
        ax.set_title(title)

    plt.show()

def handle_outliers(df, outliers, strategy="remove", cap_value=None):
    """Return a copy of ``df`` with outliers treated by the chosen strategy.

    Args:
        df: Source frame; it is never modified.
        outliers: Dict mapping column name to the index labels flagged as
            outliers in that column.
        strategy: One of
            ``"remove"``: drop every flagged row;
            ``"cap"``: clip each listed column to the cap limits;
            ``"transform"``: replace each listed column with ``log1p`` of it.
            Any other value leaves the copy untouched.
        cap_value: Limits for the ``"cap"`` strategy. ``None`` clips to the
            column's 5th and 95th percentiles. A ``(lower, upper)`` pair
            clips to those bounds. A single number is an upper cap only.
            Previously one number was used as both bounds, which collapsed
            the whole column to that constant.

    Returns:
        The treated copy of ``df``.
    """
    df_copy = df.copy()
    for col, indices in outliers.items():
        if strategy == "remove":
            # Labels already dropped for another column are skipped silently.
            df_copy = df_copy.drop(index=indices, errors='ignore')
        elif strategy == "cap":
            if cap_value is None:
                lower_limit = df_copy[col].quantile(0.05)
                upper_limit = df_copy[col].quantile(0.95)
            elif isinstance(cap_value, (tuple, list)) and len(cap_value) == 2:
                lower_limit, upper_limit = cap_value
            else:
                lower_limit, upper_limit = None, cap_value
            df_copy[col] = np.clip(df_copy[col], lower_limit, upper_limit)
        elif strategy == "transform":
            df_copy[col] = np.log1p(df_copy[col])  # Log transformation
    return df_copy

def detect_outliers(df):
    """Detect outliers with both methods, plot the effect, return treated data.

    The IQR-flagged rows are removed and the z-score-flagged columns are
    log-transformed, each starting from the original ``df``. Both results are
    plotted next to the original; only the log-transformed frame is returned.

    Args:
        df: Frame to analyse; it is not modified.

    Returns:
        The copy of ``df`` produced by the ``"transform"`` strategy.
    """
    iqr_flags = detect_outliers_iqr(df)
    zscore_flags = detect_outliers_zscore(df)

    iqr_removed = handle_outliers(df, iqr_flags, strategy="remove")
    zscore_transformed = handle_outliers(df, zscore_flags, strategy="transform")

    plot_data_before_after(df, iqr_removed, zscore_transformed)
    return zscore_transformed

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose

def perform_stats_analysis(df):
    """Print summary statistics for every numeric column of ``df``.

    One row is printed per numeric column with its mean, median, standard
    deviation, variance, skewness, excess (Fisher) kurtosis, minimum and
    maximum. Missing values are skipped by every statistic; the kurtosis used
    to propagate NaN while the pandas reducers beside it ignored it, so a
    single gap produced an inconsistent row.

    Args:
        df: Frame to summarise; non-numeric columns are ignored.

    Returns:
        None. The table is written to standard output.
    """
    stat_columns = ["Mean", "Median", "Standard Deviation", "Variance",
                    "Skewness", "Kurtosis", "Min", "Max"]
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    rows = []
    for col in numeric_cols:
        series = df[col]
        kurtosis = stats.kurtosis(
            series.to_numpy(dtype=float, na_value=np.nan),
            fisher=True,
            nan_policy="omit",
        )
        rows.append({
            "Mean": series.mean(),
            "Median": series.median(),
            "Standard Deviation": series.std(),
            "Variance": series.var(),
            "Skewness": series.skew(),
            # float() also unwraps the masked scalar older SciPy returns.
            "Kurtosis": float(kurtosis),
            "Min": series.min(),
            "Max": series.max(),
        })

    # Building the table once avoids growing it row by row with .loc.
    stats_df = pd.DataFrame(rows, index=list(numeric_cols), columns=stat_columns)

    print("\nStatistical Summary of Numeric Features")
    print(stats_df)

def plot_time_series(df):
    """Plot electricity demand over time with a line plot.

    The ``timestamp`` and ``demand`` columns are copied, the timestamps are
    parsed and the rows are ordered chronologically before plotting. The
    sort result used to be discarded, so the line followed the frame's
    existing row order. ``df`` itself is left untouched.

    Args:
        df: Frame with a ``timestamp`` column (anything ``pd.to_datetime``
            accepts) and a ``demand`` column.
    """
    timestamp_col = "timestamp"
    demand_col = "demand"

    ordered = df[[timestamp_col, demand_col]].copy()
    ordered[timestamp_col] = pd.to_datetime(ordered[timestamp_col])
    ordered = ordered.sort_values(by=timestamp_col)

    plt.figure(figsize=(12, 6))
    plt.plot(ordered[timestamp_col], ordered[demand_col],
             label="Electricity Demand", color='blue')
    plt.title("Electricity Demand Over Time", fontsize=14)
    plt.xlabel("Time", fontsize=12)
    plt.ylabel("Electricity Demand", fontsize=12)
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.legend()
    plt.show()

def perform_univariate_analysis(df):
    """Show a histogram, a boxplot and a density plot per numeric column.

    The ``timestamp`` column of ``df`` is first overwritten in place with its
    ``pd.to_datetime`` conversion; one three-panel figure is then shown for
    each numeric column.

    Args:
        df: Frame with a ``timestamp`` column and the numeric features to plot.
    """
    df.loc[:, "timestamp"] = pd.to_datetime(df["timestamp"])

    for column in df.select_dtypes(include=[np.number]).columns:
        values = df[column]
        _, (hist_ax, box_ax, density_ax) = plt.subplots(1, 3, figsize=(14, 6))

        sns.histplot(values, kde=True, ax=hist_ax)
        hist_ax.set_title(f'Histogram of {column}')

        sns.boxplot(y=values, ax=box_ax)
        box_ax.set_title(f"Boxplot of {column}")

        sns.kdeplot(values, fill=True, ax=density_ax)
        density_ax.set_title(f"Density Plot of {column}")

        plt.show()

def perform_correlation_analysis(df):
    """Show an annotated heatmap of the numeric columns' correlation matrix.

    Args:
        df: Frame to analyse; non-numeric columns are left out.
    """
    numeric_only = df.select_dtypes(include=[np.number])
    correlations = numeric_only.corr()

    _, ax = plt.subplots(figsize=(12, 8))
    # Each cell is annotated with its coefficient, rounded to two decimals.
    sns.heatmap(correlations, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
    ax.set_title("Correlation Heatmap")
    plt.show()

def perform_advance_time_series_analysis(df, period=365):
    """Decompose the demand series and run a stationarity test on it.

    The work is done on a local, timestamp-indexed copy of the two columns
    involved. The caller's frame used to be re-indexed in place and restored
    only at the end of ``perform_stationarity_test``, so an exception raised
    by the decomposition left it without its ``timestamp`` column.

    Args:
        df: Frame with ``timestamp`` and ``demand`` columns; it is not modified.
        period: Number of observations per seasonal cycle passed to
            ``seasonal_decompose``. Defaults to the 365 that was hard-coded
            before. NOTE(review): confirm it matches the sampling frequency
            of the data.
    """
    timestamp_col = "timestamp"
    demand_col = "demand"

    indexed = df[[timestamp_col, demand_col]].copy()
    indexed[timestamp_col] = pd.to_datetime(indexed[timestamp_col])
    indexed = indexed.set_index(timestamp_col)

    decomposition = seasonal_decompose(indexed[demand_col], model='additive', period=period)
    decomposition.plot()
    plt.show()

    # Any index manipulation done by the test stays on the local copy.
    perform_stationarity_test(indexed, timestamp_col, demand_col)

def perform_stationarity_test(df, timestamp_col, demand_col):
    """Run the Augmented Dickey-Fuller test on a column and print the verdict.

    Missing values are dropped before testing. A p-value below 0.05 is
    reported as stationary. As a side effect the index of ``df`` is reset in
    place once the report has been printed.

    Args:
        df: Frame holding the series to test.
        timestamp_col: Not used by this function.
        demand_col: Name of the column to test.
    """
    outcome = adfuller(df[demand_col].dropna())
    adf_statistic, p_value, critical_values = outcome[0], outcome[1], outcome[4]

    print("\nAugmented Dickey-Fuller Test Results")
    print(f"ADF Statistic: {adf_statistic}")
    print(f"p-value: {p_value}")
    print(f"Critical Values: {critical_values}")

    verdict = (
        "The time series is stationary (rejecting null hypothesis)."
        if p_value < 0.05
        else "The time series is non-stationary (fail to reject null hypothesis)."
    )
    print(verdict)

    # Moves the current index back into the frame as a regular column.
    df.reset_index(inplace=True)

def perform_eda(df):
    """Run every exploratory analysis step on ``df``, in a fixed order.

    Args:
        df: Frame handed unchanged to each analysis step in turn.
    """
    steps = (
        perform_stats_analysis,
        plot_time_series,
        perform_univariate_analysis,
        perform_correlation_analysis,
        perform_advance_time_series_analysis,
    )
    for step in steps:
        step(df)


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def test_mcar(df):
    """Return a p-value used by the caller to judge the missingness mechanism.

    The frame is reduced to 0/1 missingness indicators (1 where a cell is
    null); the correlation matrix of those indicators is passed to
    ``chi2_contingency`` and the resulting p-value is returned.

    NOTE(review): ``chi2_contingency`` is documented for tables of observed
    frequencies, whereas a correlation matrix can hold negative entries, and
    presumably NaN for columns that have no missing values (constant
    indicator). Confirm this is the intended MCAR check.
    NOTE(review): the ``test_`` prefix presumably makes pytest collect this
    helper as a test if the module is ever imported by a test file.
    """
    # 1 where the cell is missing, 0 otherwise.
    df_miss = df.isnull().astype(int)
    # Only the p-value of the chi-square result is used.
    chi2, p, _, _ = chi2_contingency(df_miss.corr())
    return p 

def missing_data_consistencies(df):
    """Report missing values, print an MCAR verdict and impute the gaps.

    Object columns are filled with their most frequent value, every other
    column with its median. Filled columns are assigned back to ``df``: the
    chained ``df[col].fillna(..., inplace=True)`` used before operates on a
    temporary object and does not reliably update the frame (it is a no-op
    under pandas Copy-on-Write). An object column with no observed value is
    left as it is instead of failing on an empty mode.

    Args:
        df: Frame to clean; its columns are replaced in place.

    Returns:
        The same ``df`` object, with missing values imputed.
    """
    # Calculate missing counts and percentages
    missing_counts = df.isnull().sum()
    missing_percentage = (missing_counts / len(df)) * 100

    missing_df = pd.DataFrame({
        'Missing Count': missing_counts,
        'Missing Percentage': missing_percentage
    })

    print("Missing Value Summary:")
    print(missing_df)

    # Determine missing value type
    p_missing_value = test_mcar(df)
    if p_missing_value > 0.05:
        print("Missing data is most likely MCAR")
    else:
        print("Missing data is likely MAR or MNAR")

    # Perform imputation
    for col in df.columns:
        column = df[col]
        if column.isnull().sum() == 0:
            continue
        if column.dtype == 'object':  # string type values or other objects
            modes = column.mode()
            if modes.empty:
                # Entirely missing: there is no observed value to impute from.
                continue
            fill_value = modes[0]
        else:
            fill_value = column.median()
        df[col] = column.fillna(fill_value)

    print("Missing values handled")

    return df

def duplicate_and_inconsistencies(df):
    """Drop duplicate rows and report the IQR outlier count per numeric column.

    Outliers are only counted and printed; no row is removed because of them.

    Args:
        df: Frame to deduplicate; it is not modified.

    Returns:
        A new frame without the duplicated rows.
    """
    deduplicated = df.drop_duplicates()

    for col in deduplicated.select_dtypes(include=['number']).columns:
        series = deduplicated[col]
        q1, q3 = series.quantile(0.25), series.quantile(0.75)
        fence = 1.5 * (q3 - q1)

        # Values outside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] count as outliers.
        outside = (series < q1 - fence) | (series > q3 + fence)
        print(f"Outliers detected in {col}: {int(outside.sum())} rows")

    return deduplicated

def engineer_feature(df):
    """Add calendar features derived from the ``timestamp`` column.

    When ``df`` has a ``timestamp`` column it is parsed with
    ``pd.to_datetime(errors='coerce')`` and the columns ``year``, ``month``,
    ``day``, ``hour``, ``day_of_week`` (Monday = 0), ``is_weekend`` (1 for
    Saturday/Sunday) and ``season`` are added. Without that column the frame
    is returned unchanged.

    The parsed timestamps replace the column outright. Writing them through
    ``df.loc[:, 'timestamp']`` sets values in place on pandas 2 and later, so
    a column of strings kept its object dtype and the ``.dt`` accessor
    failed. Rows whose timestamp cannot be parsed get a missing ``season``
    instead of falling through to "Autumn".

    Args:
        df: Frame to extend; the new columns are added to it in place.

    Returns:
        The same ``df`` object.
    """
    if 'timestamp' not in df.columns:
        return df

    season_by_month = {
        12: "Winter", 1: "Winter", 2: "Winter",
        3: "Spring", 4: "Spring", 5: "Spring",
        6: "Summer", 7: "Summer", 8: "Summer",
        9: "Autumn", 10: "Autumn", 11: "Autumn",
    }

    df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
    stamps = df['timestamp'].dt

    df['year'] = stamps.year
    df['month'] = stamps.month
    df['day'] = stamps.day
    df['hour'] = stamps.hour
    df['day_of_week'] = stamps.dayofweek
    # A missing day_of_week compares as False and is therefore encoded as 0.
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    df['season'] = df['month'].map(season_by_month)

    return df


def normalize_standardize(df, method="normalize"):
    """Rescale every numeric column of ``df`` in place.

    Args:
        df: Frame whose numeric columns are overwritten with scaled values.
        method: ``"normalize"`` uses ``MinMaxScaler``; any other value uses
            ``StandardScaler``.

    Returns:
        The same ``df`` object.
    """
    scaler = MinMaxScaler() if method == "normalize" else StandardScaler()

    numeric_cols = df.select_dtypes(include=['number']).columns
    scaled = scaler.fit_transform(df[numeric_cols])
    df[numeric_cols] = scaled
    return df

def pre_process_data(df):
    """Run the full preprocessing pipeline on the merged data.

    Steps, in order: coerce ``demand`` to numeric, sort by ``timestamp``,
    impute missing values, drop duplicates, add calendar features and
    min-max normalise the numeric columns. The first two steps modify the
    frame that was passed in.

    Args:
        df: Merged frame with at least ``demand`` and ``timestamp`` columns.

    Returns:
        The preprocessed frame.
    """
    # Data conversions: values that cannot be parsed as numbers become NaN.
    df["demand"] = pd.to_numeric(df["demand"], errors='coerce')
    df.sort_values(by="timestamp", inplace=True)

    imputed = missing_data_consistencies(df)
    deduplicated = duplicate_and_inconsistencies(imputed)
    enriched = engineer_feature(deduplicated)

    # Change to "standardize" if needed
    return normalize_standardize(enriched, method="normalize")


In [None]:
import os
import pandas as pd
import json
import process_data
import eda
import outliers
import regression_model

# File Paths
# Built with os.path.join so the separators match the host OS; the previous
# backslash literals only resolved on Windows.
weather_dir = os.path.join(".", "data", "raw", "weather_raw_data")
electricity_dir = os.path.join(".", "data", "raw", "electricity_raw_data")

def load_weather_data(folder):
    """Load every ``.csv`` file in ``folder`` into one weather frame.

    Files are read in sorted name order so the row order of the result does
    not depend on the arbitrary order ``os.listdir`` returns. Each file's
    ``date`` column is parsed as a UTC timestamp (unparseable values become
    NaT); after concatenation ``date`` is renamed to ``timestamp`` and
    ``temperature_2m`` to ``temperature``.

    Args:
        folder: Directory containing the raw weather CSV files.

    Returns:
        The concatenated frame with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``folder`` contains no ``.csv`` file.
    """
    frames = []
    for file in sorted(os.listdir(folder)):
        if not file.endswith(".csv"):
            continue
        df = pd.read_csv(os.path.join(folder, file))
        df["date"] = pd.to_datetime(df["date"], errors='coerce', utc=True)
        frames.append(df)

    if not frames:
        # pd.concat would raise the same exception type, but without naming
        # the folder that turned out to be empty.
        raise ValueError(f"No .csv weather files found in {folder!r}")

    weather_df = pd.concat(frames, ignore_index=True)
    return weather_df.rename(columns={"date": "timestamp", "temperature_2m": "temperature"})

def load_electricity_data(folder):
    """Load every ``.json`` file in ``folder`` into one electricity frame.

    Each file is expected to hold its records under ``["response"]["data"]``.
    A file that is not valid JSON or lacks that structure is skipped with a
    message naming it; the old message never said which file was skipped and
    malformed JSON aborted the whole load. Files are read in sorted name
    order and decoded as UTF-8.

    ``period`` is parsed as a UTC timestamp and renamed to ``timestamp``;
    ``value`` is coerced to numeric and renamed to ``demand``. Values that
    cannot be parsed become NaT / NaN.

    Args:
        folder: Directory containing the raw electricity JSON files.

    Returns:
        Frame with one row per record. When no record could be loaded, an
        empty frame with ``timestamp`` and ``demand`` columns is returned
        instead of failing on the missing ``period`` column.
    """
    records = []
    for file in sorted(os.listdir(folder)):
        if not file.endswith(".json"):
            continue
        try:
            with open(os.path.join(folder, file), 'r', encoding="utf-8") as f:
                payload = json.load(f)
            records.extend(payload["response"]["data"])
        except (KeyError, TypeError, json.JSONDecodeError) as exc:
            print(f"Skipping {file}: unexpected content ({exc!r})")

    if not records:
        return pd.DataFrame({
            "timestamp": pd.Series(dtype="datetime64[ns, UTC]"),
            "demand": pd.Series(dtype="float64"),
        })

    electricity_df = pd.DataFrame(records)
    # One UTC-aware parse replaces the former naive parse plus re-parse.
    electricity_df["period"] = pd.to_datetime(electricity_df["period"], errors='coerce', utc=True)
    electricity_df["value"] = pd.to_numeric(electricity_df["value"], errors='coerce')
    electricity_df.rename(columns={"period": "timestamp", "value": "demand"}, inplace=True)
    return electricity_df

# Script entry point: load the raw data, merge it, then run preprocessing,
# EDA, outlier handling and the regression model in that order. Every
# intermediate frame is also written to a CSV file in the working directory.
if __name__ == "__main__":
    weather_df = load_weather_data(weather_dir)
    electricity_df = load_electricity_data(electricity_dir)

    # Inner join: only timestamps present in both sources are kept.
    merged_data_df = pd.merge(electricity_df, weather_df, on="timestamp", how="inner")
    merged_data_df.sort_values(by="timestamp", inplace=True)
    merged_data_df.to_csv("merged_data.csv", index=False)

    # NOTE(review): presumably pre_process_data modifies merged_data_df in
    # place, which is why the merged CSV is written first; confirm against
    # process_data.
    processed_data_df = process_data.pre_process_data(merged_data_df)
    processed_data_df.to_csv("processed_data.csv", index=False)

    # Its return value is not used here.
    eda.perform_eda(processed_data_df)

    clean_df = outliers.detect_outliers(processed_data_df)
    clean_df.to_csv("cleaned_data.csv", index=False)

    model, predictions = regression_model.perform_regression_model(clean_df)