In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

In [2]:
# Load the modelling dataset. The path is relative to the notebook's working
# directory; judging by the file name it has presumably been cleaned upstream.
DATA_PATH = 'cleaned.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Separate the regression target from the feature columns; every column other
# than the target is kept as a feature.
TARGET_COLUMN = 'primary_energy_consumption'
y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])

In [4]:
# Hold out 20% of the rows for the final evaluation; the fixed seed makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Shared preprocessing applied to every feature column before each model:
# missing values are replaced by the column mean, then features are standardised.
mean_imputer = SimpleImputer(strategy='mean')
standard_scaler = StandardScaler()

preprocessor = Pipeline(steps=[
    ('imputer', mean_imputer),
    ('scaler', standard_scaler),
])

In [6]:
# Candidate regressors and their hyperparameter search spaces.
#
# Each entry maps a display name to:
#   "model":  an unfitted estimator, plugged in as the 'model' step of a Pipeline
#   "params": search space keyed as "model__<param>" so it addresses that step
#
# Estimators with internal randomness (tree, forest, boosting) get a fixed
# random_state so that re-running the notebook selects the same hyperparameters
# and reports the same scores. LinearRegression, KNeighborsRegressor and SVR
# take no random_state.
param_grid = {
    "Linear Regression": {
        "model": LinearRegression(),
        "params": {}  # nothing to tune; the search evaluates the single default candidate
    },
    "KNN": {
        "model": KNeighborsRegressor(),
        "params": {"model__n_neighbors": [3, 5, 7]}
    },
    "Decision Tree": {
        "model": DecisionTreeRegressor(random_state=42),
        "params": {"model__max_depth": [5, 10]}
    },
    "Random Forest": {
        "model": RandomForestRegressor(random_state=42),
        "params": {"model__n_estimators": [10, 50], "model__max_depth": [5, 10]}
    },
    "Gradient Boosting": {
        "model": GradientBoostingRegressor(random_state=42),
        "params": {"model__n_estimators": [10, 50], "model__learning_rate": [0.05, 0.1]}
    },
    "SVR": {
        "model": SVR(),
        "params": {
            "model__C": [0.1, 1, 10],
            "model__kernel": ["linear", "rbf"]
        }
    }
}

In [7]:
# Tune every candidate model with a small randomized search and keep the
# refitted best pipeline per model in `best_models` (display name -> Pipeline).

N_SEARCH_ITER = 3  # maximum number of hyperparameter candidates sampled per model
CV_FOLDS = 3       # number of cross-validation folds used during the search

best_models = {}
for name, config in param_grid.items():
    print(f"Training {name}...")

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', config['model'])
    ])

    # Size of this model's search space (1 when there is nothing to tune).
    # Capping n_iter at that size avoids the scikit-learn warning raised when
    # more iterations are requested than there are distinct candidates.
    n_candidates = int(np.prod([len(values) for values in config['params'].values()]))

    search = RandomizedSearchCV(
        pipeline,
        config['params'],
        n_iter=min(N_SEARCH_ITER, n_candidates),
        # Model selection still uses negative MSE (see `refit`); R² is recorded
        # on the same folds so the winner does not need a second, separate
        # cross-validation pass just for reporting.
        scoring={'neg_mse': 'neg_mean_squared_error', 'r2': 'r2'},
        refit='neg_mse',
        cv=CV_FOLDS,
        n_jobs=-1,
        random_state=42,
    )

    search.fit(X_train, y_train)
    best_models[name] = search.best_estimator_

    # Mean cross-validated R² of the selected candidate. Named `cv_r2` rather
    # than `r2_score` to avoid shadowing the sklearn.metrics function of that name.
    cv_r2 = search.cv_results_['mean_test_r2'][search.best_index_]

    print(f"Best Params for {name}: {search.best_params_}")
    print(f"R² Score: {cv_r2:.4f}\n")

Training Linear Regression...




Best Params for Linear Regression: {}
R² Score: -827.5355

Training KNN...
Best Params for KNN: {'model__n_neighbors': 3}
R² Score: 0.9088

Training Decision Tree...




Best Params for Decision Tree: {'model__max_depth': 10}
R² Score: 0.8718

Training Random Forest...
Best Params for Random Forest: {'model__n_estimators': 50, 'model__max_depth': 10}
R² Score: 0.9228

Training Gradient Boosting...
Best Params for Gradient Boosting: {'model__n_estimators': 50, 'model__learning_rate': 0.1}
R² Score: 0.8404

Training SVR...
Best Params for SVR: {'model__kernel': 'rbf', 'model__C': 10}
R² Score: 0.7422



In [14]:
# Evaluate each tuned pipeline on the held-out test split. `score` on a
# regression pipeline reports the coefficient of determination (R²).
for model_name, fitted_pipeline in best_models.items():
    holdout_r2 = fitted_pipeline.score(X_test, y_test)
    print(f"{model_name} Test R² Score: {holdout_r2:.4f}")

Linear Regression Test R² Score: 0.2533
KNN Test R² Score: 0.9217
Decision Tree Test R² Score: 0.9069
Random Forest Test R² Score: 0.9402
Gradient Boosting Test R² Score: 0.8508
SVR Test R² Score: 0.8007


In [10]:
import pickle

In [12]:
# Persist every tuned pipeline to "<snake_case_name>_model.pkl" in the current
# working directory (e.g. "Random Forest" -> "random_forest_model.pkl").
# NOTE: pickle files should only ever be loaded back from trusted sources.
for model_name, fitted_pipeline in best_models.items():
    slug = model_name.lower().replace(' ', '_')
    with open(f"{slug}_model.pkl", 'wb') as out_file:
        pickle.dump(fitted_pipeline, out_file)