In [None]:
# Import the libraries
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler

# ML regressors
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the uploaded CO2 CSV into a DataFrame.
# NOTE(review): the absolute /content/... path looks like a Colab session
# location - adjust it when running the notebook elsewhere.
df = pd.read_csv(filepath_or_buffer="/content/owid-co2-data.csv")

In [None]:
# Keep only the rows for Germany and renumber them from 0, then discard every
# row without a "co2" value (the index is not renumbered after that drop).
df = (
    df.loc[df["country"].eq("Germany")]
    .reset_index(drop=True)
    .dropna(subset=["co2"])
)

In [None]:
# Show the first five rows of the filtered frame as the cell output.
df.head(n=5)

Unnamed: 0,country,year,iso_code,population,gdp,cement_co2,cement_co2_per_capita,co2,co2_growth_abs,co2_growth_prct,...,share_global_other_co2,share_of_temperature_change_from_ghg,temperature_change_from_ch4,temperature_change_from_co2,temperature_change_from_ghg,temperature_change_from_n2o,total_ghg,total_ghg_excluding_lucf,trade_co2,trade_co2_share
0,Germany,1792,DEU,,,,,0.469,,,...,,,,,,,,,,
1,Germany,1793,DEU,,,,,0.48,0.011,2.344,...,,,,,,,,,,
2,Germany,1794,DEU,,,,,0.443,-0.037,-7.634,...,,,,,,,,,,
3,Germany,1795,DEU,,,,,0.447,0.004,0.826,...,,,,,,,,,,
4,Germany,1796,DEU,,,,,0.535,0.088,19.672,...,,,,,,,,,,


In [None]:
# Show the last five rows of the filtered frame as the cell output.
df.tail(n=5)

Unnamed: 0,country,year,iso_code,population,gdp,cement_co2,cement_co2_per_capita,co2,co2_growth_abs,co2_growth_prct,...,share_global_other_co2,share_of_temperature_change_from_ghg,temperature_change_from_ch4,temperature_change_from_co2,temperature_change_from_ghg,temperature_change_from_n2o,total_ghg,total_ghg_excluding_lucf,trade_co2,trade_co2_share
227,Germany,2019,DEU,83559185.0,3886390000000.0,13.287,0.159,709.827,-50.22,-6.607,...,2.79,2.998,0.004,0.041,0.047,0.002,793.775,753.911,131.596,18.539
228,Germany,2020,DEU,83628711.0,3742720000000.0,13.357,0.16,648.357,-61.47,-8.66,...,2.725,2.969,0.004,0.041,0.047,0.002,729.728,691.065,122.566,18.904
229,Germany,2021,DEU,83697082.0,3841040000000.0,13.64,0.163,678.777,30.42,4.692,...,2.792,2.94,0.004,0.041,0.048,0.002,759.036,722.425,149.604,22.04
230,Germany,2022,DEU,84086228.0,3909610000000.0,12.538,0.149,671.472,-7.306,-1.076,...,2.734,2.911,0.004,0.042,0.048,0.002,747.968,713.675,161.23,24.012
231,Germany,2023,DEU,84548234.0,,10.604,0.125,596.151,-75.32,-11.217,...,2.712,2.881,0.004,0.042,0.048,0.002,670.626,637.367,,


In [None]:
# Hold-out design: fit on the years 1975 through 2000 (both inclusive),
# then score on the single year 2001.
train = df[df["year"].between(1975, 2000)]
test = df[df["year"].eq(2001)]

# The calendar year is the only predictor and "co2" is the target.
# The double brackets keep the feature matrix two-dimensional.
X_train, y_train = train[["year"]].to_numpy(), train["co2"].to_numpy()
X_test, y_test = test[["year"]].to_numpy(), test["co2"].to_numpy()

# Standardise the feature with statistics learned from the training years
# only; the scaled copies are the inputs used for the SVR and KNN models.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -------------------------
# Define Models
# -------------------------
# Candidate regressors, keyed by the label that appears in the results table.
# The single tree, both tree ensembles and XGBoost are seeded with 42;
# the three ensembles each use 500 estimators.
models = dict([
    ("LinearRegression", LinearRegression()),
    ("Ridge", Ridge(alpha=1.0)),
    ("Lasso", Lasso(alpha=0.001, max_iter=10000)),
    ("DecisionTree", DecisionTreeRegressor(random_state=42)),
    ("RandomForest", RandomForestRegressor(n_estimators=500, random_state=42)),
    ("GradientBoosting", GradientBoostingRegressor(n_estimators=500, random_state=42)),
    ("XGBoost", XGBRegressor(n_estimators=500, learning_rate=0.05, random_state=42)),
    ("SVR", SVR(kernel="rbf", C=100, gamma=0.1, epsilon=0.1)),
    ("KNN", KNeighborsRegressor(n_neighbors=3)),
])

# -------------------------
# Error Function
# -------------------------
def error_metrics(y_true, y_pred):
    """Compute MAE, RMSE and MAPE (in percent) for a set of predictions.

    All three metrics are computed directly with NumPy, so the result does
    not depend on the installed scikit-learn version.

    Parameters
    ----------
    y_true : array-like of numbers
        Observed target values.
    y_pred : array-like of numbers
        Predicted values; must have the same shape as ``y_true``.

    Returns
    -------
    tuple
        ``(mae, rmse, mape)``. ``mape`` is averaged over the entries whose
        true value is non-zero (the percentage error is undefined at zero)
        and is ``nan`` when every true value is zero.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in shape, or contain NaN/inf.
    """
    # Coerce to float arrays so lists, tuples and Series all support the
    # element-wise arithmetic below.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Reject mismatched shapes explicitly: NumPy would otherwise broadcast
    # e.g. (n,) against (n, 1) into an n-by-n matrix and return a wrong value.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.size == 0:
        raise ValueError("error_metrics requires at least one sample")
    if not (np.isfinite(y_true).all() and np.isfinite(y_pred).all()):
        raise ValueError("y_true and y_pred must not contain NaN or infinity")

    residual = y_true - y_pred
    mae = np.mean(np.abs(residual))
    rmse = np.sqrt(np.mean(residual ** 2))

    # Percentage error is only defined where the true value is non-zero.
    defined = y_true != 0
    if defined.any():
        mape = np.mean(np.abs(residual[defined] / y_true[defined])) * 100
    else:
        mape = np.nan
    return mae, rmse, mape

# -------------------------
# Train & Evaluate
# -------------------------
# Fit every candidate on the training years and score it on the 2001 row.
# `metrics` maps model label -> dict of prediction, actual value and errors.
metrics = {}
for name, model in models.items():
    # SVR and KNN are trained and queried on the standardised feature;
    # every other model sees the raw year values.
    if name in ("SVR", "KNN"):
        fit_X, predict_X = X_train_scaled, X_test_scaled
    else:
        fit_X, predict_X = X_train, X_test

    model.fit(fit_X, y_train)
    pred = model.predict(predict_X)

    mae, rmse, mape = error_metrics(y_test, pred)
    # Index 0 is recorded as the 2001 prediction / actual value.
    metrics[name] = {
        "Pred_2001": pred[0],
        "Actual_2001": y_test[0],
        "MAE": mae,
        "RMSE": rmse,
        "MAPE": mape,
    }

# -------------------------
# Results
# -------------------------
# One row per model, ordered from the smallest to the largest RMSE.
# NOTE: when the test set holds a single year, RMSE and MAE coincide.
error_df = pd.DataFrame(metrics).transpose().sort_values(by="RMSE")
print("=== Predictions & Errors (2001) ===", error_df, sep="\n")

# After sorting, the first row belongs to the model with the lowest RMSE.
best_model = error_df.iloc[0]
print("\n✅ Best Model (Lowest RMSE):", best_model, sep="\n")

=== Predictions & Errors (2001) ===
                   Pred_2001  Actual_2001        MAE       RMSE      MAPE
Ridge             914.617361      915.255   0.637639   0.637639  0.069668
Lasso             914.553782      915.255   0.701218   0.701218  0.076615
LinearRegression  914.553542      915.255   0.701458   0.701458  0.076641
KNN               905.972333      915.255   9.282667   9.282667  1.014216
RandomForest      901.352392      915.255  13.902608  13.902608  1.518987
SVR               899.111884      915.255  16.143116  16.143116  1.763783
XGBoost           898.976196      915.255  16.278804  16.278804  1.778609
DecisionTree      898.976000      915.255  16.279000  16.279000  1.778630
GradientBoosting  898.975976      915.255  16.279024  16.279024  1.778633

✅ Best Model (Lowest RMSE):
Pred_2001      914.617361
Actual_2001    915.255000
MAE              0.637639
RMSE             0.637639
MAPE             0.069668
Name: Ridge, dtype: float64


In [None]:
# Keep only the rows that have a "co2" value (the target must not be missing).
df = df[df["co2"].notna()]

# Hold-out design: fit on the years 1998 through 2022 (both inclusive),
# then score on the single year 2023.
train = df[df["year"].between(1998, 2022)]
test = df[df["year"].eq(2023)]

# The calendar year is the only predictor and "co2" is the target.
# The double brackets keep the feature matrix two-dimensional.
X_train, y_train = train[["year"]].to_numpy(), train["co2"].to_numpy()
X_test, y_test = test[["year"]].to_numpy(), test["co2"].to_numpy()

# Standardise the feature with statistics learned from the training years
# only; the scaled copies are the inputs used for the SVR and KNN models.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -------------------------
# Define Models
# -------------------------
# Candidate regressors, keyed by the label that appears in the results table.
# The single tree, both tree ensembles and XGBoost are seeded with 42;
# the three ensembles each use 500 estimators.
models = dict([
    ("LinearRegression", LinearRegression()),
    ("Ridge", Ridge(alpha=1.0)),
    ("Lasso", Lasso(alpha=0.001, max_iter=10000)),
    ("DecisionTree", DecisionTreeRegressor(random_state=42)),
    ("RandomForest", RandomForestRegressor(n_estimators=500, random_state=42)),
    ("GradientBoosting", GradientBoostingRegressor(n_estimators=500, random_state=42)),
    ("XGBoost", XGBRegressor(n_estimators=500, learning_rate=0.05, random_state=42)),
    ("SVR", SVR(kernel="rbf", C=100, gamma=0.1, epsilon=0.1)),
    ("KNN", KNeighborsRegressor(n_neighbors=3)),
])

# -------------------------
# Error Metrics
# -------------------------
def error_metrics(y_true, y_pred):
    """Compute MAE, RMSE and MAPE (in percent) for a set of predictions.

    All three metrics are computed directly with NumPy, so the result does
    not depend on the installed scikit-learn version.

    Parameters
    ----------
    y_true : array-like of numbers
        Observed target values.
    y_pred : array-like of numbers
        Predicted values; must have the same shape as ``y_true``.

    Returns
    -------
    tuple
        ``(mae, rmse, mape)``. ``mape`` is averaged over the entries whose
        true value is non-zero (the percentage error is undefined at zero)
        and is ``nan`` when every true value is zero.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in shape, or contain NaN/inf.
    """
    # Coerce to float arrays so lists, tuples and Series all support the
    # element-wise arithmetic below.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Reject mismatched shapes explicitly: NumPy would otherwise broadcast
    # e.g. (n,) against (n, 1) into an n-by-n matrix and return a wrong value.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.size == 0:
        raise ValueError("error_metrics requires at least one sample")
    if not (np.isfinite(y_true).all() and np.isfinite(y_pred).all()):
        raise ValueError("y_true and y_pred must not contain NaN or infinity")

    residual = y_true - y_pred
    mae = np.mean(np.abs(residual))
    rmse = np.sqrt(np.mean(residual ** 2))

    # Percentage error is only defined where the true value is non-zero.
    defined = y_true != 0
    if defined.any():
        mape = np.mean(np.abs(residual[defined] / y_true[defined])) * 100
    else:
        mape = np.nan
    return mae, rmse, mape

# -------------------------
# Train & Evaluate
# -------------------------
# Fit every candidate on the training years and score it on the 2023 row.
# `metrics` maps model label -> dict of prediction, actual value and errors.
metrics = {}
for name, model in models.items():
    # SVR and KNN are trained and queried on the standardised feature;
    # every other model sees the raw year values.
    if name in ("SVR", "KNN"):
        fit_X, predict_X = X_train_scaled, X_test_scaled
    else:
        fit_X, predict_X = X_train, X_test

    model.fit(fit_X, y_train)
    pred = model.predict(predict_X)

    mae, rmse, mape = error_metrics(y_test, pred)
    # Index 0 is recorded as the 2023 prediction / actual value.
    metrics[name] = {
        "Pred_2023": pred[0],
        "Actual_2023": y_test[0],
        "MAE": mae,
        "RMSE": rmse,
        "MAPE": mape,
    }

# -------------------------
# Results
# -------------------------
# One row per model, ordered from the smallest to the largest RMSE.
# NOTE: when the test set holds a single year, RMSE and MAE coincide.
error_df = pd.DataFrame(metrics).transpose().sort_values(by="RMSE")
print("=== Predictions & Errors (2023) ===", error_df, sep="\n")

# After sorting, the first row belongs to the model with the lowest RMSE.
best_model = error_df.iloc[0]
print("\n✅ Best Model (Lowest RMSE):", best_model, sep="\n")

=== Predictions & Errors (2023) ===
                   Pred_2023  Actual_2023         MAE        RMSE       MAPE
KNN               666.202000      596.151   70.051000   70.051000  11.750546
DecisionTree      671.472000      596.151   75.321000   75.321000  12.634551
GradientBoosting  671.472110      596.151   75.321110   75.321110  12.634569
XGBoost           671.474243      596.151   75.323243   75.323243  12.634927
RandomForest      672.820260      596.151   76.669260   76.669260  12.860711
LinearRegression  689.485470      596.151   93.334470   93.334470  15.656179
Lasso             689.485720      596.151   93.334720   93.334720  15.656221
Ridge             689.584916      596.151   93.433916   93.433916  15.672861
SVR               728.599599      596.151  132.448599  132.448599  22.217290

✅ Best Model (Lowest RMSE):
Pred_2023      666.202000
Actual_2023    596.151000
MAE             70.051000
RMSE            70.051000
MAPE            11.750546
Name: KNN, dtype: float64
