In [13]:
import pandas as pd

# Load the preprocessed dataset, then separate the target column from the features.
df = pd.read_csv('../data/preprocessed/preprocessed_data.csv')
y = df['price']
X = df.drop(columns=['price'])

In [14]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split, ShuffleSplit
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib


# Define models and expanded hyperparameters for GridSearchCV.
#
# Each entry maps a short model name to:
#   'model'  - an unfitted estimator instance
#   'params' - the hyperparameter grid that GridSearchCV will search exhaustively
#
# Estimators with internal randomness get a fixed seed so that repeated runs of
# the notebook produce the same fitted models and the same ranking.
RANDOM_STATE = 42

algos = {
    'linear_regression': {
        'model': LinearRegression(),
        'params': {
            # NOTE: the former 'normalize' option is intentionally absent. It was
            # deprecated in scikit-learn 1.0 and removed in 1.2, where passing it
            # makes GridSearchCV fail with an invalid-parameter error. If feature
            # scaling is wanted, put a StandardScaler in a Pipeline instead.
            'fit_intercept': [True, False]
        }
    },
    'lasso': {
        # random_state only matters for selection='random' (coordinate order).
        'model': Lasso(random_state=RANDOM_STATE),
        'params': {
            'alpha': [0.1, 1, 10, 100],
            'selection': ['random', 'cyclic'],
            'max_iter': [1000, 5000]
        }
    },
    'ridge': {
        'model': Ridge(),
        'params': {
            'alpha': [0.1, 1, 10, 100],
            'fit_intercept': [True, False],
            'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg'],
            'max_iter': [1000, 5000]
        }
    },
    'elasticnet': {
        'model': ElasticNet(),
        'params': {
            'alpha': [0.1, 1, 10],
            'l1_ratio': [0.1, 0.5, 0.9],
            'max_iter': [1000, 5000]
        }
    },
    'decision_tree': {
        # Seeded because splitter='random' and tie-breaking between equally good
        # splits are otherwise non-deterministic.
        'model': DecisionTreeRegressor(random_state=RANDOM_STATE),
        'params': {
            'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
            'splitter': ['best', 'random'],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10]
        }
    },
    'random_forest': {
        # Seeded so bootstrap samples and feature sub-sampling are repeatable.
        'model': RandomForestRegressor(random_state=RANDOM_STATE),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10]
        }
    },
    'gradient_boosting': {
        'model': GradientBoostingRegressor(random_state=RANDOM_STATE),
        'params': {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
            'min_samples_split': [2, 5, 10]
        }
    }
}


In [None]:
# Train and evaluate each model using GridSearchCV.
#
# Model selection is based on the mean cross-validated score (gs.best_score_,
# which is R² for regressors when no `scoring` is given), computed from the
# training data only. The held-out test set is used purely for reporting, so
# the test metrics of the chosen model are not biased by having been used to
# pick it.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
scores = []
best_model = None
best_score = -np.inf      # test-set R² of the model selected by cross-validation
best_cv_score = -np.inf   # mean CV R² of the current leader (selection criterion)

for algo_name, config in algos.items():
    gs = GridSearchCV(config['model'], config['params'], cv=cv, n_jobs=-1, return_train_score=False)
    gs.fit(X_train, y_train)

    # best_estimator_ is refit on all of X_train with the winning parameters.
    y_pred = gs.best_estimator_.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(algo_name, 'has completed')
    scores.append({
        'model': algo_name,
        'cv_r2': gs.best_score_,
        'test_mse': mse,
        'test_rmse': rmse,
        'test_mae': mae,
        'test_r2': r2,
        'best_params': gs.best_params_
    })

    # Keep track of the best model based on the cross-validated R² score; the
    # test R² is stored alongside it only so it can be reported afterwards.
    if gs.best_score_ > best_cv_score:
        best_cv_score = gs.best_score_
        best_score = r2
        best_model = gs.best_estimator_


In [None]:
# Tabulate the per-model metrics, ordered from highest to lowest test R², and print them.
results_df = pd.DataFrame(
    scores,
    columns=['model', 'test_mse', 'test_rmse', 'test_mae', 'test_r2', 'best_params'],
).sort_values(by='test_r2', ascending=False)
print(results_df)


In [None]:
# Draw a 2x2 grid of horizontal bar charts comparing the models on each test metric.
fig, ax = plt.subplots(2, 2, figsize=(15, 12))

# One entry per panel, in row-major order: (results_df column, axis label, bar colour).
panel_specs = [
    ('test_mse', 'Mean Squared Error', 'skyblue'),
    ('test_rmse', 'Root Mean Squared Error', 'lightgreen'),
    ('test_mae', 'Mean Absolute Error', 'lightcoral'),
    ('test_r2', 'R² Score', 'orange'),
]

# ax.flat walks the axes grid row by row, matching the order of panel_specs.
for panel, (metric_col, metric_label, bar_color) in zip(ax.flat, panel_specs):
    panel.barh(results_df['model'], results_df[metric_col], color=bar_color)
    panel.set_xlabel(metric_label)
    panel.set_title(f'Model Comparison - {metric_label}')

plt.tight_layout()
plt.show()


In [None]:
# Persist the selected estimator to disk with joblib, then report which model
# was chosen together with its recorded R² score.
joblib.dump(value=best_model, filename='best_model.pkl')
print(f"The best model is {best_model} with an R² score of {best_score:.4f}")