In [128]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline

In [130]:
# Part 1: 1.1
# Load the dataset
df_fuel = pd.read_csv('FuelEconomy.csv')

# Display column names
print("Columns:\n", df_fuel.columns)

# Display shape (rows, columns)
print("\nShape:", df_fuel.shape)

# Summary statistics
print("\nSummary Statistics:")
print(df_fuel.describe())

# Missing values
print("\nMissing Values:")
print(df_fuel.isna().sum())
df_fuel = df_fuel.dropna()
# Missing values are removed to ensure clean model training.

Columns:
 Index(['Horse Power', 'Fuel Economy (MPG)'], dtype='object')

Shape: (100, 2)

Summary Statistics:
       Horse Power  Fuel Economy (MPG)
count   100.000000          100.000000
mean    213.676190           23.178501
std      62.061726            4.701666
min      50.000000           10.000000
25%     174.996514           20.439516
50%     218.928402           23.143192
75%     251.706476           26.089933
max     350.000000           35.000000

Missing Values:
Horse Power           0
Fuel Economy (MPG)    0
dtype: int64


In [132]:
# Part 1: 1.2
def split_train_test(x, y, train_frac=0.7, split_mode="random", seed=42):
    """
    split_mode:
      - "random": shuffle then split
      - "ordered_contiguous": sort by x ascending (low->high), then split
    """
    x = np.asarray(x)
    y = np.asarray(y)

    if not (0.0 < train_frac < 1.0):
        raise ValueError("train_frac must be between 0 and 1.")

    test_size = 1.0 - train_frac

    if split_mode == "random":
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=test_size, random_state=seed, shuffle=True
        )
        return x_train, x_test, y_train, y_test

    if split_mode == "ordered_contiguous":
        idx = np.argsort(x)
        x_sorted = x[idx]
        y_sorted = y[idx]

        n = len(x_sorted)
        n_train = int(np.floor(train_frac * n))

        return (x_sorted[:n_train], x_sorted[n_train:],
                y_sorted[:n_train], y_sorted[n_train:])

    raise ValueError("split_mode must be 'random' or 'ordered_contiguous'.")

In [134]:
# Part 1: 1.3
# a: Linear Regression
def fit_linear_regression(x_train, y_train):
    Xtr = x_train.reshape(-1, 1)
    model = LinearRegression()
    model.fit(Xtr, y_train)
    return model

# b: Polynomial Regression d=2
def fit_polynomial_regression_degree2(x_train, y_train):
    Xtr = x_train.reshape(-1, 1)
    model = Pipeline([
        ("poly", PolynomialFeatures(degree=2, include_bias=False)),
        ("lr", LinearRegression())
    ])
    model.fit(Xtr, y_train)
    return model
    
# c: Polynomial Regression d=3
def fit_polynomial_regression_degree3(x_train, y_train):
    Xtr = x_train.reshape(-1, 1)
    model = Pipeline([
        ("poly", PolynomialFeatures(degree=3, include_bias=False)),
        ("lr", LinearRegression())
    ])
    model.fit(Xtr, y_train)
    return model
    
# d: Polynomial Regression d=4
def fit_polynomial_regression_degree4(x_train, y_train):
    Xtr = x_train.reshape(-1, 1)
    model = Pipeline([
        ("poly", PolynomialFeatures(degree=4, include_bias=False)),
        ("lr", LinearRegression())
    ])
    model.fit(Xtr, y_train)
    return model

In [136]:
# Part 1: 1.4
def predict_model(model, x):
    X = x.reshape(-1, 1)
    return model.predict(X)

def compute_metrics(y_true, y_pred):
    return {
        "MSE": mean_squared_error(y_true, y_pred),
        "MAE": mean_absolute_error(y_true, y_pred),
        "R2": r2_score(y_true, y_pred)
    }
    
def run_one_dataset(
    name,
    x, y,
    x_plot=None,
    train_frac=0.7,
    split_mode="random",
    seed=42
):
    x_train, x_test, y_train, y_test = split_train_test(
        x, y,
        train_frac=train_frac,
        split_mode=split_mode,
        seed=seed
    )
    
    models = {
        "Linear": fit_linear_regression(x_train, y_train),
        "Poly (deg=2)": fit_polynomial_regression_degree2(x_train, y_train),
        "Poly (deg=3)": fit_polynomial_regression_degree3(x_train, y_train),
        "Poly (deg=4)": fit_polynomial_regression_degree4(x_train, y_train),
    }
    results = {}

    for name_model, model in models.items():
        yhat_tr = predict_model(model, x_train)
        yhat_te = predict_model(model, x_test)

        tr_metrics = compute_metrics(y_train, yhat_tr)
        te_metrics = compute_metrics(y_test, yhat_te)

        results[name_model] = {
            "Train MSE": tr_metrics["MSE"],
            "Train MAE": tr_metrics["MAE"],
            "Train R2": tr_metrics["R2"],
            "Test MSE": te_metrics["MSE"],
            "Test MAE": te_metrics["MAE"],
            "Test R2": te_metrics["R2"],
        }
        
    results_df = pd.DataFrame(results).T

    print(f"\n=== {name} | split_mode={split_mode} | train={int(train_frac*100)}% ===")
    display(results_df.round(4))

    return results_df


In [138]:
# Add this import at the top with your other imports


# Then fix the main execution block:
if __name__ == "__main__":

    SPLIT_MODE = "random"
    TRAIN_FRAC = 0.7
    SEED = 42

    # Load dataset (already done earlier)
    df = df_fuel

    # Inspect columns if needed
    print(df.columns)

    # Choose feature + target - CORRECTED
    x = df["Fuel Economy (MPG)"].values  # Feature: Fuel Economy
    y = df["Horse Power"].values          # Target: Horsepower

    # Plot grid
    x_plot = np.linspace(x.min(), x.max(), 500)

    # Run Part 1
    results_df = run_one_dataset(
        name="Fuel Consumption → HP",
        x=x,
        y=y,
        x_plot=x_plot,
        train_frac=TRAIN_FRAC,
        split_mode=SPLIT_MODE,
        seed=SEED
    )

Index(['Horse Power', 'Fuel Economy (MPG)'], dtype='object')

=== Fuel Consumption → HP | split_mode=random | train=70% ===


Unnamed: 0,Train MSE,Train MAE,Train R2,Test MSE,Test MAE,Test R2
Linear,360.5279,16.1358,0.9041,315.0545,14.9156,0.9184
Poly (deg=2),353.9976,16.069,0.9058,326.0611,15.0723,0.9156
Poly (deg=3),348.6863,15.8482,0.9072,312.5292,14.6369,0.9191
Poly (deg=4),342.0722,15.5551,0.909,311.5464,14.7301,0.9193


In [140]:
# Part 1: 1.5
# 1. Polynomial Regression (degree 4) performed the best because it has the highest R^2 value and lowest test MSE error among the 4 models.

# 2. In this case, increasing the degree from 1 to 2 did not improve model performance indicated by a decreased R^2 value (-0.0658) and higher test
#    MSE error (+11.0066). Therefore higher polynomial degree does not always improve performance. 

# 3. All models performed well containing high R^2 and low test MSE & MAE error. No sign of overfitting due to similar test R^2 and train R^2.

In [142]:
# Part 2: 2.1
# Part 1: 1.1
# Load the dataset
df_consumption = pd.read_csv('electricity_consumption_based_weather_dataset.csv')

# 1. Display column names
print("Columns:\n", df_consumption.columns)

# 2. Display shape (rows, columns)
print("\nShape:", df_consumption.shape)

# 3. Summary statistics
print("\nSummary Statistics:")
print(df_consumption.describe())

# 4. Identify missing values
print("\nMissing Values:")
print(df_consumption.isna().sum())
df_consumption = df_consumption.dropna()
# Missing values are removed to ensure clean model training.

Columns:
 Index(['date', 'AWND', 'PRCP', 'TMAX', 'TMIN', 'daily_consumption'], dtype='object')

Shape: (1433, 6)

Summary Statistics:
              AWND         PRCP         TMAX         TMIN  daily_consumption
count  1418.000000  1433.000000  1433.000000  1433.000000        1433.000000
mean      2.642313     3.800488    17.187509     9.141242        1561.078061
std       1.140021    10.973436    10.136415     9.028417         606.819667
min       0.000000     0.000000    -8.900000   -14.400000          14.218000
25%       1.800000     0.000000     8.900000     2.200000        1165.700000
50%       2.400000     0.000000    17.800000     9.400000        1542.650000
75%       3.300000     1.300000    26.100000    17.200000        1893.608000
max      10.200000   192.300000    39.400000    27.200000        4773.386000

Missing Values:
date                  0
AWND                 15
PRCP                  0
TMAX                  0
TMIN                  0
daily_consumption     0
dtype: int64

In [92]:
# Part 2: 2.2
def split_train_test(x, y, train_frac=0.7, split_mode="random", seed=42):
    """
    split_mode:
      - "random": shuffle then split
      - "ordered_contiguous": sort by x ascending (low->high), then split
    """
    x = np.asarray(x)
    y = np.asarray(y)

    if not (0.0 < train_frac < 1.0):
        raise ValueError("train_frac must be between 0 and 1.")

    test_size = 1.0 - train_frac

    if split_mode == "random":
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=test_size, random_state=seed, shuffle=True
        )
        return x_train, x_test, y_train, y_test

    if split_mode == "ordered_contiguous":
        idx = np.argsort(x)
        x_sorted = x[idx]
        y_sorted = y[idx]

        n = len(x_sorted)
        n_train = int(np.floor(train_frac * n))

        return (x_sorted[:n_train], x_sorted[n_train:],
                y_sorted[:n_train], y_sorted[n_train:])

    raise ValueError("split_mode must be 'random' or 'ordered_contiguous'.")

In [94]:
# Part 2: 2.3
# a: Linear Regression
def fit_linear_regression(x_train, y_train):
    Xtr = x_train.reshape(-1, 1)
    model = LinearRegression()
    model.fit(Xtr, y_train)
    return model

# b: Polynomial Regression d=2
def fit_polynomial_regression_degree2(x_train, y_train):
    Xtr = x_train.reshape(-1, 1)
    model = Pipeline([
        ("poly", PolynomialFeatures(degree=2, include_bias=False)),
        ("lr", LinearRegression())
    ])
    model.fit(Xtr, y_train)
    return model
    
# c: Polynomial Regression d=3
def fit_polynomial_regression_degree3(x_train, y_train):
    Xtr = x_train.reshape(-1, 1)
    model = Pipeline([
        ("poly", PolynomialFeatures(degree=3, include_bias=False)),
        ("lr", LinearRegression())
    ])
    model.fit(Xtr, y_train)
    return model
    
# d: Polynomial Regression d=4
def fit_polynomial_regression_degree4(x_train, y_train):
    Xtr = x_train.reshape(-1, 1)
    model = Pipeline([
        ("poly", PolynomialFeatures(degree=4, include_bias=False)),
        ("lr", LinearRegression())
    ])
    model.fit(Xtr, y_train)
    return model

In [122]:
# Part 2: 2.4
def predict_model(model, x):
    X = x.reshape(-1, 1)
    return model.predict(X)

def compute_metrics(y_true, y_pred):
    return {
        "MSE": mean_squared_error(y_true, y_pred),
        "MAE": mean_absolute_error(y_true, y_pred),
        "R2": r2_score(y_true, y_pred)
    }


def run_one_dataset(
    name,
    x, y,
    x_plot=None,
    train_frac=0.7,
    split_mode="random",
    seed=42
):
    x_train, x_test, y_train, y_test = split_train_test(
        x, y,
        train_frac=train_frac,
        split_mode=split_mode,
        seed=seed
    )

    models = {
        "Linear": fit_linear_regression(x_train, y_train),
        "Poly (deg=2)": fit_polynomial_regression_degree2(x_train, y_train),
        "Poly (deg=3)": fit_polynomial_regression_degree3(x_train, y_train),
        "Poly (deg=4)": fit_polynomial_regression_degree4(x_train, y_train),
    }

    results = {}

    for name_model, model in models.items():
        yhat_tr = predict_model(model, x_train)
        yhat_te = predict_model(model, x_test)

        tr_metrics = compute_metrics(y_train, yhat_tr)
        te_metrics = compute_metrics(y_test, yhat_te)

        results[name_model] = {
            "Train MSE": tr_metrics["MSE"],
            "Train MAE": tr_metrics["MAE"],
            "Train R2": tr_metrics["R2"],
            "Test MSE": te_metrics["MSE"],
            "Test MAE": te_metrics["MAE"],
            "Test R2": te_metrics["R2"],
        }

    results_df = pd.DataFrame(results).T

    print(f"\n=== {name} | split_mode={split_mode} | train={int(train_frac*100)}% ===")
    display(results_df.round(4))

    return results_df


In [124]:
# Part 2: Main execution for electricity consumption dataset
if __name__ == "__main__":

    SPLIT_MODE = "random"
    TRAIN_FRAC = 0.7
    SEED = 42

    # Load dataset (already done earlier in cell 90)
    df = df_consumption

    # Inspect columns if needed
    print(df.columns)

    # For Part 2, we need to use weather features to predict daily consumption
    # The dataset has: 'date', 'AWND', 'PRCP', 'TMAX', 'TMIN', 'daily_consumption'
    
    # For simplicity, let's use TMAX (max temperature) as the main feature
    # In a real analysis, you'd want to use multiple features
    x = df["TMAX"].values  # Feature: Maximum Temperature
    y = df["daily_consumption"].values  # Target: Daily Consumption

    # Plot grid
    x_plot = np.linspace(x.min(), x.max(), 500)

    # Run Part 2
    results_df = run_one_dataset(
        name="Weather → Daily Electricity Consumption",
        x=x,
        y=y,
        x_plot=x_plot,
        train_frac=TRAIN_FRAC,
        split_mode=SPLIT_MODE,
        seed=SEED
    )

Index(['date', 'AWND', 'PRCP', 'TMAX', 'TMIN', 'daily_consumption'], dtype='object')

=== Weather → Daily Electricity Consumption | split_mode=random | train=70% ===


Unnamed: 0,Train MSE,Train MAE,Train R2,Test MSE,Test MAE,Test R2
Linear,276246.2333,391.0185,0.2658,248413.2013,378.4521,0.2985
Poly (deg=2),275864.8761,391.0182,0.2668,246341.4342,376.4363,0.3044
Poly (deg=3),274570.0462,389.1691,0.2702,244360.1345,375.1724,0.31
Poly (deg=4),273392.1375,389.4266,0.2734,243418.6547,373.7195,0.3126


In [None]:
# Part 2: 2.5

# 1. Polynomial regression degree 4 has the best generalized performance due to highest Test & Train R^2 among the 4 models. In addition Poly(deg=4)
#    has the lowest error in both MAE and MSE for test. However, the R^2 value = 0.3126 is relativley low and indicate a weak connection between weather
#    and electricity use.

# 2. In this case, higher degree models did tend to fit better but was not any significant difference thus can assume that higher degree didn't improve 
#    or worsen performance.

# 3. In this case, higher degree models did not perform worse evident by continuesly higher R^2 values indicating improved performance instead.
#    Additionally, no signs of overfitting as train error decreased together with test errors.

# 4. None of these models achieved a good test performance with two main reasons: low R^2 values and high variance. Comparing these models to part 1 models
#    which fit well and had good performance, the MAE error difference was about 300 and MSE error even more so. In addition a good model will have a high
#    R^2 value close to 1 but these models are only around 0.3.