In [1]:
import os
import pickle
import time
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from catboost import CatBoostRegressor, Pool

In [2]:
def save_object(file_path, obj):
    """Pickle ``obj`` to ``file_path``, creating the parent directory if needed.

    Args:
        file_path: Destination path of the pickle file. May be a bare file
            name, in which case the file is written to the current directory.
        obj: The object to serialize with ``pickle.dump``.

    Errors raised while creating the directory, opening the file or pickling
    the object are not caught here and propagate to the caller.
    """
    dir_path = os.path.dirname(file_path)

    # os.path.dirname() yields "" for a bare file name, and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)

    with open(file_path, "wb") as file_obj:
        pickle.dump(obj, file_obj)

In [3]:
def evaluate_models(X_train, y_train, X_test, y_test, models, param):
    """Tune, fit and score each model; return the test R2 score per model name.

    For every entry of ``models`` that has a non-empty grid under the same key
    in ``param``, a 3-fold ``GridSearchCV`` is run and its best parameters are
    applied to the model object that was passed in. The model is then fitted
    on the full training data, so ``models`` holds the fitted objects when the
    function returns.

    Args:
        X_train, y_train: Features and target used for tuning and fitting.
        X_test, y_test: Features and target the reported score is computed on.
        models: Mapping of model name -> estimator.
        param: Mapping of model name -> parameter grid. A missing or empty
            entry means no tuning: the model is fitted with its current
            settings.

    Returns:
        dict: model name -> ``r2_score`` of the model's predictions on
        ``X_test`` against ``y_test``.
    """
    report = {}

    for model_name, model in models.items():
        para = param.get(model_name)

        # Search only when a non-empty grid was supplied; indexing ``param``
        # directly raised KeyError for models without an entry.
        if para:
            gs = GridSearchCV(model, para, cv=3)
            gs.fit(X_train, y_train)
            model.set_params(**gs.best_params_)

        # Final fit on the full training data with the chosen settings
        model.fit(X_train, y_train)

        y_test_pred = model.predict(X_test)
        report[model_name] = r2_score(y_test, y_test_pred)

    return report

In [4]:
# Read the prepared train/test tables from the current working directory
train_arr = pd.read_csv(filepath_or_buffer="train_arr.csv")
test_arr = pd.read_csv(filepath_or_buffer="test_arr.csv")

In [6]:
import os
import pickle
import time
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from catboost import CatBoostRegressor, Pool

# Save object utility
def save_object(file_path, obj):
    """Pickle ``obj`` to ``file_path``, creating the parent directory if needed.

    Args:
        file_path: Destination path of the pickle file. May be a bare file
            name, in which case the file is written to the current directory.
        obj: The object to serialize with ``pickle.dump``.

    Errors raised while creating the directory, opening the file or pickling
    the object are not caught here and propagate to the caller.
    """
    parent_dir = os.path.dirname(file_path)
    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so skip directory creation in that case.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, "wb") as file_obj:
        pickle.dump(obj, file_obj)

# Evaluate models function
def evaluate_models(X_train, y_train, X_test, y_test, models, param):
    """Fit every candidate model and collect its R2 score on the test data.

    A model whose name maps to a non-empty grid in ``param`` is tuned with a
    3-fold ``GridSearchCV``; the search's ``best_estimator_`` then replaces the
    original object inside ``models``. Any other model is fitted as given.
    A start line and an elapsed-time line are printed for each model.

    Returns:
        dict: model name -> ``r2_score`` on ``X_test`` / ``y_test``.
    """
    test_scores = {}

    for model_name, estimator in models.items():
        print(f"Training {model_name}...")
        start_time = time.time()

        grid = param.get(model_name)
        if not grid:
            # No grid for this model: plain fit with its current settings
            estimator.fit(X_train, y_train)
        else:
            search = GridSearchCV(estimator, grid, cv=3, n_jobs=-1, verbose=1)
            search.fit(X_train, y_train)
            # Hand the tuned estimator back to the caller through ``models``
            models[model_name] = search.best_estimator_
            estimator = search.best_estimator_

        train_predictions = estimator.predict(X_train)
        test_predictions = estimator.predict(X_test)

        # The training-set score is computed but only the test score is reported
        train_r2 = r2_score(y_train, train_predictions)
        test_r2 = r2_score(y_test, test_predictions)

        test_scores[model_name] = test_r2
        print(f"{model_name} completed in {time.time() - start_time:.2f} seconds.")

    return test_scores

# Load the train/test tables from the current working directory
train_arr = pd.read_csv("train_arr.csv")
test_arr = pd.read_csv("test_arr.csv")

# Positional split: every column except the last is a feature,
# the last column is the target
X_train = train_arr.iloc[:, :-1]
y_train = train_arr.iloc[:, -1]
X_test = test_arr.iloc[:, :-1]
y_test = test_arr.iloc[:, -1]

# Define models and hyperparameters
# One fixed seed for every stochastic learner, so that the model ranking and
# the saved artifact are repeatable across "Restart Kernel -> Run All".
RANDOM_STATE = 42

# Insertion order is the training order used by evaluate_models.
models = {
    "Random Forest": RandomForestRegressor(random_state=RANDOM_STATE),
    "Decision Tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingRegressor(random_state=RANDOM_STATE),
    "Linear Regression": LinearRegression(),
    "XGBRegressor": XGBRegressor(random_state=RANDOM_STATE),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False, random_seed=RANDOM_STATE),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=RANDOM_STATE),
}

# Grids are looked up by model name; "Linear Regression" intentionally has no
# entry and is therefore fitted without a grid search.
params = {
    "Decision Tree": {
        'criterion': ['squared_error', 'friedman_mse'],
    },
    "Random Forest": {
        'n_estimators': [50, 100, 150],
        'max_depth': [10, 20, None],
    },
    "Gradient Boosting": {
        'learning_rate': [0.01, 0.1],
        'n_estimators': [50, 100],
    },
    "XGBRegressor": {
        'learning_rate': [0.01, 0.1],
        'n_estimators': [50, 100],
    },
    "CatBoosting Regressor": {
        'depth': [6, 8],
        'iterations': [50, 100],
    },
    "AdaBoost Regressor": {
        'learning_rate': [0.01, 0.1],
        'n_estimators': [50, 100],
    },
}

# Tune/fit every candidate and collect its test-set R2 score
model_report = evaluate_models(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    models=models,
    param=params,
)

# Pick the entry with the highest score (the earliest entry wins a tie)
best_model_name, best_model_score = max(model_report.items(), key=lambda item: item[1])
best_model = models[best_model_name]

print(f"Best Model: {best_model_name} with R2 Score: {best_model_score}")

# Sanity-check the selected estimator before it is used for prediction
if not hasattr(best_model, "predict"):
    raise ValueError(f"The model {best_model_name} is not properly initialized or fitted.")

# The fitted-state re-check only applies to estimators that expose an
# ``is_fitted()`` method; objects without it pass through untouched.
if hasattr(best_model, "is_fitted"):
    if not best_model.is_fitted():
        print(f"{best_model_name} was not properly trained. Training now...")
        if not isinstance(best_model, CatBoostRegressor):
            best_model.fit(X_train, y_train)
        else:
            # CatBoost models are trained from a Pool wrapping features and labels
            train_pool = Pool(data=X_train, label=y_train)
            best_model.fit(train_pool)

# Predict on the test features with the selected estimator and score the result
if not isinstance(best_model, CatBoostRegressor):
    predicted = best_model.predict(X_test)
else:
    # CatBoost models are fed a Pool built from the test features and labels
    test_pool = Pool(data=X_test, label=y_test)
    predicted = best_model.predict(test_pool)

r2_square = r2_score(y_test, predicted)
print(f"Final R2 Score of Best Model: {r2_square}")

# Pickle the selected estimator to artifacts/best_model.pkl
save_object("artifacts/best_model.pkl", best_model)

Training Random Forest...
Fitting 3 folds for each of 9 candidates, totalling 27 fits
Random Forest completed in 41.12 seconds.
Training Decision Tree...
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Decision Tree completed in 0.64 seconds.
Training Gradient Boosting...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Gradient Boosting completed in 11.99 seconds.
Training Linear Regression...
Linear Regression completed in 0.02 seconds.
Training XGBRegressor...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
XGBRegressor completed in 0.92 seconds.
Training CatBoosting Regressor...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
CatBoosting Regressor completed in 3.57 seconds.
Training AdaBoost Regressor...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
AdaBoost Regressor completed in 14.05 seconds.
Best Model: Random Forest with R2 Score: 0.9060704845617257
Final R2 Score of Best Model: 0.9060704845617257
