# <p style="padding:10px; background-color:black; margin:10; color:green; font-family:'New Times Roman'; font-size:100%; text-align:center;border-radius: 10px 10px; overflow:hidden; font-weight:50"> **Model Training** for  **KING COUNTY HOUSE SALES DATASETS**</p>

In [23]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor


# <p style="padding:10px; background-color:black; margin:10; color:green; font-family:'New Times Roman'; font-size:100%; text-align:center;border-radius: 10px 10px; overflow:hidden; font-weight:50"> Read the dataset</p>

In [24]:
df = pd.read_csv('./house_data.csv')
df.head().T

Unnamed: 0,0,1,2,3,4
id,7129300520,6414100192,5631500400,2487200875,1954400510
date,20141013T000000,20141209T000000,20150225T000000,20141209T000000,20150218T000000
price,221900.0,538000.0,180000.0,604000.0,510000.0
bedrooms,3,3,2,4,3
bathrooms,1.0,2.25,1.0,3.0,2.0
sqft_living,1180,2570,770,1960,1680
sqft_lot,5650,7242,10000,5000,8080
floors,1.0,2.0,1.0,1.0,1.0
waterfront,0,0,0,0,0
view,0,0,0,0,0


### Extracting the Year from the 'date' Column and converting to integer


In [25]:
df['date'] = df['date'].str[:4].astype(int)

### Dropping columns id  and zipcode

In [26]:
# drop 'id' and 'zipcode' column
df.drop(['id','zipcode'], axis=1, inplace=True)

# <p style="padding:10px; background-color:black; margin:10; color:green; font-family:'New Times Roman'; font-size:100%; text-align:center;border-radius: 10px 10px; overflow:hidden; font-weight:50"> Getting X and Y Variables</p>

In [27]:
X = df.drop(labels=['price'],axis=1)
Y = df[['price']]

# <p style="padding:10px; background-color:black; margin:10; color:green; font-family:'New Times Roman'; font-size:100%; text-align:center;border-radius: 10px 10px; overflow:hidden; font-weight:50"> Creating DataTransformation Pipelines</p>

In [28]:
# Define which columns should be ordinal-encoded and which should be scaled
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns
            
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder,StandardScaler

# Numerical Pipeline
num_pipeline = Pipeline(
                steps = [
                ('imputer',SimpleImputer(strategy='median')),
                ('scaler',StandardScaler())                
                ]
            )

# Categorical Pipeline
cat_pipeline = Pipeline(
                steps=[
                ('imputer',SimpleImputer(strategy='most_frequent')),
                ('ordinal_encoder',OrdinalEncoder()),
                ('scaler',StandardScaler())
                ]
            )

preprocessor = ColumnTransformer(
                [
                ('num_pipeline',num_pipeline,numerical_cols),
                ('cat_pipeline',cat_pipeline,categorical_cols)
                ]
            )

# <p style="padding:10px; background-color:black; margin:10; color:green; font-family:'New Times Roman'; font-size:100%; text-align:center;border-radius: 10px 10px; overflow:hidden; font-weight:50"> Train Test Split</p>

In [29]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X,Y,test_size=0.2,random_state=42)

# <p style="padding:10px; background-color:black; margin:10; color:green; font-family:'New Times Roman'; font-size:100%; text-align:center;border-radius: 10px 10px; overflow:hidden; font-weight:50"> Transforming the data with pipeline created</p>

In [32]:
xtrain = pd.DataFrame(preprocessor.fit_transform(xtrain),columns=preprocessor.get_feature_names_out())
xtest = pd.DataFrame(preprocessor.transform(xtest),columns=preprocessor.get_feature_names_out())


In [33]:
preprocessor.get_feature_names_out()

array(['num_pipeline__date', 'num_pipeline__bedrooms',
       'num_pipeline__bathrooms', 'num_pipeline__sqft_living',
       'num_pipeline__sqft_lot', 'num_pipeline__floors',
       'num_pipeline__waterfront', 'num_pipeline__view',
       'num_pipeline__condition', 'num_pipeline__grade',
       'num_pipeline__sqft_above', 'num_pipeline__sqft_basement',
       'num_pipeline__yr_built', 'num_pipeline__yr_renovated',
       'num_pipeline__lat', 'num_pipeline__long',
       'num_pipeline__sqft_living15', 'num_pipeline__sqft_lot15'],
      dtype=object)

In [34]:
xtrain.head().T

Unnamed: 0,0,1,2,3,4
num_pipeline__date,-0.68884,1.451715,1.451715,1.451715,1.451715
num_pipeline__bedrooms,-0.395263,-1.468964,-0.395263,-0.395263,-1.468964
num_pipeline__bathrooms,-0.474451,-1.452583,-1.452583,0.177636,0.50368
num_pipeline__sqft_living,-0.323933,-1.183653,-1.095477,0.017751,-0.366919
num_pipeline__sqft_lot,-0.043873,-0.285775,-0.188293,-0.187933,-0.343991
num_pipeline__floors,-0.9196,-0.9196,0.001545,-0.9196,0.92269
num_pipeline__waterfront,-0.084992,-0.084992,-0.084992,-0.084992,-0.084992
num_pipeline__view,-0.305917,-0.305917,-0.305917,-0.305917,-0.305917
num_pipeline__condition,0.909073,-0.625426,-0.625426,0.909073,-0.625426
num_pipeline__grade,1.150243,-1.413156,-1.413156,-0.55869,0.295777


In [36]:
xtest.head().T

Unnamed: 0,0,1,2,3,4
num_pipeline__date,-0.68884,-0.68884,-0.68884,1.451715,-0.68884
num_pipeline__bedrooms,0.678437,1.752138,0.678437,-0.395263,-0.395263
num_pipeline__bathrooms,0.177636,1.155768,0.50368,1.807856,0.50368
num_pipeline__sqft_living,-0.004293,0.910538,1.869457,2.740199,0.524766
num_pipeline__sqft_lot,-0.152066,-0.207758,-0.10057,-0.004916,-0.242621
num_pipeline__floors,0.92269,-0.9196,0.92269,0.92269,0.92269
num_pipeline__waterfront,-0.084992,-0.084992,-0.084992,-0.084992,-0.084992
num_pipeline__view,-0.305917,-0.305917,2.319696,2.319696,-0.305917
num_pipeline__condition,0.909073,2.443571,-0.625426,-0.625426,-0.625426
num_pipeline__grade,0.295777,0.295777,2.859176,3.713643,1.150243


# <p style="padding:10px; background-color:black; margin:10; color:green; font-family:'New Times Roman'; font-size:100%; text-align:center;border-radius: 10px 10px; overflow:hidden; font-weight:50"> Model Training with Diverse Models</p>

### Create an Evaluate Function to give all metrics after model Training

In [37]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

### Training Various models

In [40]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(xtrain, ytrain.values.flatten()) # Train model

    # Make predictions
    y_train_pred = model.predict(xtrain)
    y_test_pred = model.predict(xtest)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(ytrain, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(ytest, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 199090.9324
- Mean Absolute Error: 125065.6596
- R2 Score: 0.6966
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 213977.9188
- Mean Absolute Error: 127578.5679
- R2 Score: 0.6971


Lasso
Model performance for Training set
- Root Mean Squared Error: 199090.9325
- Mean Absolute Error: 125065.7027
- R2 Score: 0.6966
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 213977.9513
- Mean Absolute Error: 127578.6363
- R2 Score: 0.6971


Ridge
Model performance for Training set
- Root Mean Squared Error: 199090.9329
- Mean Absolute Error: 125063.7832
- R2 Score: 0.6966
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 213978.3613
- Mean Absolute Error: 127576.8436
- R2 Score: 0.6971




  model = cd_fast.enet_coordinate_descent(


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 135471.8765
- Mean Absolute Error: 74601.0932
- R2 Score: 0.8595
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 181709.8005
- Mean Absolute Error: 97672.3788
- R2 Score: 0.7816


Decision Tree
Model performance for Training set
- Root Mean Squared Error: 3432.9674
- Mean Absolute Error: 146.1350
- R2 Score: 0.9999
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 212063.0370
- Mean Absolute Error: 105744.3102
- R2 Score: 0.7025


Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 48264.4176
- Mean Absolute Error: 25779.6494
- R2 Score: 0.9822
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 141967.4273
- Mean Absolute Error: 72286.4050
- R2 Score: 0.8667


CatBoosting Regressor
Model performance for Training set
- Root Mean Squared Error:

### Results from various models

In [41]:
df_results = pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)
df_results

Unnamed: 0,Model Name,R2_Score
6,CatBoosting Regressor,0.90488
5,Random Forest Regressor,0.866681
3,K-Neighbors Regressor,0.78159
4,Decision Tree,0.702529
0,Linear Regression,0.697132
1,Lasso,0.697132
2,Ridge,0.697131
7,AdaBoost Regressor,0.195866


# <p style="padding:10px; background-color:black; margin:10; color:green; font-family:'New Times Roman'; font-size:100%; text-align:center;border-radius: 10px 10px; overflow:hidden; font-weight:50"> HyperParamaeter Tuning</p>

In [42]:
# Initializing catboost
cbr = CatBoostRegressor(verbose=False)

# Creating the hyperparameter grid
param_dist = {'depth'          : [4,5,6,7,8,9, 10],
              'learning_rate' : [0.01,0.02,0.03,0.04],
               'iterations'    : [300,400,500,600]}

#Instantiate RandomSearchCV object
rscv = RandomizedSearchCV(cbr , param_dist, scoring='r2', cv =5, n_jobs=-1)

# Fit the model
rscv.fit(xtrain, ytrain.values.flatten())

# Print the tuned parameters and score
print(rscv.best_params_)
print(rscv.best_score_)

{'learning_rate': 0.04, 'iterations': 400, 'depth': 7}
0.8944546248293171


### Definition to print evaluated model results

In [43]:
def print_evaluated_results(model,xtrain,ytrain,xtest,ytest):
    ytrain_pred = model.predict(xtrain)
    ytest_pred = model.predict(xtest)

    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(ytrain, ytrain_pred)
    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(ytest, ytest_pred)

    # Printing results
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))

In [44]:
# Selecting best model
best_cbr = rscv.best_estimator_

# Evaluate Train and Test dataset
print_evaluated_results(best_cbr,xtrain,ytrain,xtest,ytest)

Model performance for Training set
- Root Mean Squared Error: 90040.5576
- Mean Absolute Error: 59298.1906
- R2 Score: 0.9379
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 120705.5975
- Mean Absolute Error: 69195.4418
- R2 Score: 0.9036
