In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import pickle
import json

import warnings
warnings.filterwarnings("ignore")

In [11]:
df = pd.read_csv("C:\\Users\\Rajch\\Desktop\\Hackathon\\data\\processed\\processed_data.csv")

In [12]:
X = df.drop(columns="Recycling Rate (%)")
y = df["Recycling Rate (%)"]

print(f"Shape of features: {X.shape}\nShape of the target: {y.shape}")

Shape of features: (850, 49)
Shape of the target: (850,)


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [14]:
models = {"Linear Regression": LinearRegression(),
          "Random Forest Regressor": RandomForestRegressor(n_estimators=200, random_state=42),
          "xgboost" : XGBRegressor(n_estimators=200, learning_rate=0.1, random_state=42),
          "lightgbm": LGBMRegressor(n_estimators=200, learning_rate=0.1, random_state=42)}

In [15]:
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    results[name] = rmse
    print(f"{name} RMSE = {rmse}")

Linear Regression RMSE = 17.568630103528182
Random Forest Regressor RMSE = 17.6317976711862
xgboost RMSE = 19.155670107454476
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000073 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 599
[LightGBM] [Info] Number of data points in the train set: 595, number of used features: 23
[LightGBM] [Info] Start training from score 57.752941
lightgbm RMSE = 19.00144468344413


In [16]:
best_model_name = min(results, key=results.get)
best_model = models[best_model_name]

print(f"The best model is {best_model_name} with RMSE {results[best_model_name]}")

The best model is Linear Regression with RMSE 17.568630103528182


In [17]:
with open("C:\\Users\\Rajch\\Desktop\\Hackathon\\models\\model.pkl", "wb") as file:
    pickle.dump(best_model, file)

In [18]:
y_pred_final = np.round(best_model.predict(X_test), 2)

prediction_df = pd.DataFrame({"Actual" : y_test,
                              "Predicted": y_pred_final})

prediction_df.to_csv("C:\\Users\\Rajch\\Desktop\\Hackathon\\Predictions\\predictions.csv", index=False)

In [19]:
with open("C:\\Users\\Rajch\\Desktop\\Hackathon\\json\\results.json", "w") as file:
    json.dump(results, file)