In [1]:
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    ShuffleSplit
)
from sklearn.linear_model import (
    LinearRegression,
    Lasso,
    Ridge
)
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    AdaBoostRegressor, 
    GradientBoostingRegressor
)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import r2_score

# NOTE(review): the original line here was `from sklearn.cluster import` with no
# names after `import`, which is a SyntaxError and stops this whole import cell
# from running. Nothing in the visible cells uses sklearn.cluster, so the dangling
# statement is dropped; re-add it with explicit names if clustering is needed.

In [2]:
import pandas as pd
import numpy as np

# Load the dataset from the artifacts directory (path is relative to the
# notebook's working directory).
data_df = pd.read_csv('artifacts/data.csv')

In [3]:
# Preview the first rows of the loaded frame.
data_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,households,median_income,median_house_value,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,-122.23,37.88,41.0,6.781058,4.867534,4.844187,8.3252,452600.0,0,0,1,0
1,-122.22,37.86,21.0,8.86785,7.009409,7.037906,8.3014,358500.0,0,0,1,0
2,-122.24,37.85,52.0,7.291656,5.252273,5.181784,7.2574,352100.0,0,0,1,0
3,-122.25,37.85,52.0,7.150701,5.463832,5.393628,5.6431,341300.0,0,0,1,0
4,-122.25,37.85,52.0,7.395108,5.638355,5.560682,3.8462,342200.0,0,0,1,0


In [4]:
# Separate the regression target from the predictor columns.
y = data_df['median_house_value']
X = data_df.drop(columns=['median_house_value'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Report the shape of each split, in the order: train X, train y, test X, test y.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(16346, 11)
(16346,)
(4087, 11)
(4087,)


In [6]:
# Candidate regressors, keyed by the display name that `params` also uses.
# Every estimator with a random component is seeded with the same value as the
# train/test split (42) so that repeated runs give the same scores.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    # The seed only matters for Lasso when the grid picks selection='random'.
    'Lasso': Lasso(random_state=42),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
    'AdaBoost Regressor': AdaBoostRegressor(random_state=42),
    # 'XGB Regressor': XGBRegressor(),
    # 'CatBoost Regressor': CatBoostRegressor(verbose=0)
}

In [7]:
# Hyper-parameter grids, keyed by the same display names as `models`.
# Grids whose candidate values are commented out are left empty.
params = {
    'Linear Regression': {'fit_intercept': [True, False]},
    'Ridge': {'alpha': list(range(1, 11))},
    'Lasso': {
        'alpha': list(range(1, 11)),
        'selection': ['cyclic', 'random'],
    },
    'Decision Tree Regressor': {
        'criterion': [
            'squared_error',
            'friedman_mse',
            'absolute_error',
            'poisson',
        ],
        'splitter': ['best', 'random'],
    },
    'Random Forest Regressor': {
        # 'n_estimators': [100, 200, 300, 400, 500],
        # 'criterion': ['squared_error', 'absolute_error']
    },
    'Gradient Boosting Regressor': {
        # 'n_estimators': [100, 200, 300, 400, 500],
        # 'loss': ['squared_error', 'absolute_error', 'huber', 'quantile']
    },
    'AdaBoost Regressor': {
        # 'n_estimators': [100, 200, 300, 400, 500],
        # 'learning_rate': [0.01, 0.05, 0.1, 0.3, 0.5]
    },
}

In [8]:
def evalute_model(X_train, X_test, y_train, y_test, models, params):
    """Grid-search each candidate model and report its R^2 on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and target, passed to ``GridSearchCV.fit``.
    X_test, y_test
        Held-out features and target used to compute the reported R^2.
    models : dict
        Maps a display name to an unfitted estimator.
    params : dict
        Maps the same display names to a parameter grid. A model with no
        entry is searched with an empty grid instead of raising ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``model``, ``score`` and
        ``best params``. When ``models`` is empty the frame has those
        columns and no rows.
    """
    columns = ['model', 'score', 'best params']
    score = []
    for model_name, model in models.items():
        print(f'Running {model_name} model...')
        cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
        gs = GridSearchCV(model, params.get(model_name, {}), cv=cv)
        print(f'Fitting {model_name} model...')
        gs.fit(X_train, y_train)
        y_pred = gs.predict(X_test)
        print(f'Evaluating {model_name} model...')
        test_model_score = r2_score(y_test, y_pred)
        score.append({
            'model': model_name,
            'score': test_model_score,
            'best params': gs.best_params_
        })
        print(f'{model_name} model completed...')
    # Build the frame once, after the loop: building it inside the loop left the
    # result undefined (UnboundLocalError) whenever `models` was empty.
    return pd.DataFrame(score, columns=columns)

In [9]:
# best_model_df = evalute_model(X_train, X_test, y_train, y_test, models, params)

In [10]:
# best_model_df

In [11]:
# Fit the final model. The seed matches the one used for the train/test split so
# the pickled model and the reported metrics are the same on every run.
forest = RandomForestRegressor(random_state=42)
forest.fit(X_train, y_train)

In [12]:
import os
import pickle

# Serialize the fitted forest to artifacts/model.pkl.
model_trainer_path = os.path.join('artifacts', 'model.pkl')

with open(model_trainer_path, mode='wb') as model_file:
    model_file.write(pickle.dumps(forest))

In [13]:
# Reload the pickled model to check that the saved artifact can be read back.
# The context manager closes the file handle; the bare open(...) call it
# replaces left the handle open.
# NOTE(security): pickle.load can execute arbitrary code from the file, so only
# load artifacts that this project produced itself.
with open('artifacts/model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
model

In [14]:
# import json

# json_file_path = os.path.join('artifacts', 'columns.json')

# columns = {
#     'data columns': [col.lower() for col in X.columns]
# }

# with open(json_file_path, 'w') as file_obj:
#     file_obj.write(json.dumps(columns))

In [15]:
# Predict on the held-out test features with the in-memory fitted forest
# (not the `model` object reloaded from the pickle).
y_pred = forest.predict(X_test)

In [16]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Print each test-set metric for the forest predictions, one per line.
for label, metric in (
    ("MSE: ", mean_squared_error),
    ("MAE: ", mean_absolute_error),
    ("R2: ", r2_score),
):
    print(label, metric(y_test, y_pred))

MSE:  2424513938.264717
MAE:  31987.954798140447
R2:  0.8227069918711286
