## MODEL TRAINING
- Import Data and Required Packages

In [16]:
import numpy as np
import pandas as pd


## modelling 

from sklearn.model_selection import train_test_split


## preprocessing
from sklearn.preprocessing import OneHotEncoder,LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

## metrics

from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score

## models

from sklearn.linear_model import LinearRegression,Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor



In [2]:
df = pd.read_csv("train.csv")

In [3]:
df.head()

Unnamed: 0,smartlocation,roomtype,price,minimumnights,availability365,numberofreviews,reviewscoresrating,cancellationpolicy
0,Amsterdam,Entire home/apt,600.0,2.0,74.0,31.0,89.0,strict
1,Amsterdam,Entire home/apt,175.0,2.0,259.0,15.0,99.0,strict
2,Amsterdam,Entire home/apt,125.0,4.0,0.0,1.0,100.0,flexible
3,Amsterdam,Entire home/apt,130.0,3.0,0.0,22.0,97.0,flexible
4,Amsterdam,Entire home/apt,80.0,2.0,326.0,16.0,78.0,moderate


In [4]:
df.tail()

Unnamed: 0,smartlocation,roomtype,price,minimumnights,availability365,numberofreviews,reviewscoresrating,cancellationpolicy
345367,Parkdale,Entire home/apt,79.0,1.0,97.0,12.0,95.0,flexible
345368,Yarraville,Entire home/apt,189.0,3.0,364.0,4.0,95.0,flexible
345369,Footscray,Private room,30.0,1.0,0.0,4.0,60.0,moderate
345370,Yarraville,Private room,42.0,7.0,358.0,9.0,93.0,moderate
345371,West Footscray,Private room,33.0,3.0,337.0,3.0,100.0,strict


In [5]:
## dependent and independent features

X = df.drop('price',axis=1)
y = df.price


print(f"shape of X: {X.shape}")
print(f"shape of y: {y.shape}")

shape of X: (345372, 7)
shape of y: (345372,)


In [8]:
cat_cols_ohe = ['roomtype','cancellationpolicy','smartlocation']

num_cols_st = ['minimumnights','numberofreviews']
num_cols_minmax = ['availability365','reviewscoresrating']





preprocessor = ColumnTransformer([
    
    # ('lbe',LabelEncoder(),list(cat_cols_lb)),
    ('ohe',OneHotEncoder(handle_unknown='ignore'),cat_cols_ohe),
    ('minmax',MinMaxScaler(),num_cols_minmax),
    ('ss',StandardScaler(),num_cols_st)
    
])


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=9)


print(f"shape of X_train: {X_train.shape}")
print(f"shape of X_test: {X_test.shape}\n")
print(f"shape of y_train: {y_train.shape}")
print(f"shape of y_test: {y_test.shape}")



shape of X_train: (241760, 7)
shape of X_test: (103612, 7)

shape of y_train: (241760,)
shape of y_test: (103612,)


In [9]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [11]:
print(f"shape of X_train: {X_train.shape}")
print(f"shape of X_test: {X_test.shape}")

shape of X_train: (241760, 3755)
shape of X_test: (103612, 3755)


## MODEL TRAINING

In [14]:
def evaluate_model(actual, predicted):
    
    """
    returns mae, mse, rmse, r2
    """

    mae = mean_absolute_error(actual,predicted)
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    r2 = r2_score(actual, predicted)
    return mae, mse, rmse, r2

In [19]:
models = {
    "LinearRegression":LinearRegression(),
    "Ridge":Ridge(),
    "Lasso":Lasso(),
    "DecisionTreeRegressor":DecisionTreeRegressor(),
    "RandomForestRegressor":RandomForestRegressor(),
    "AdaBoostRegressor":AdaBoostRegressor(),
    "XGBRegressor":XGBRegressor()
}

model_list = []
r2_list = []

for i in range(len(list(models))):
    ## define model
    model = list(models.values())[i]

    # train model
    model.fit(X_train,y_train)

    ## predict

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    ## evaluate model

    train_mae, train_mse, train_rmse, train_r2 = evaluate_model(y_train, y_train_pred)
    test_mae, test_mse, test_rmse, test_r2 = evaluate_model(y_test, y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])


    print(f"Model performance on training data\n")

    print(f" - Mean Absolute Error: {train_mae} ")
    print(f" - Mean Squared Error: {train_mse} ")
    print(f" - Root Mean Squared Error: {train_rmse} ")
    print(f" - R2 Score: {train_r2} ")


    print("###########################################################\n")

    print(f"Model Performance on test data\n")

    print(f" - Mean Absolute Error: {test_mae} ")
    print(f" - Mean Squared Error: {test_mse} ")
    print(f" - Root Mean Squared Error: {test_rmse} ")
    print(f" - R2 Score: {test_r2} ")

    r2_list.append(test_r2)




    



TypeError: 'dict_keys' object is not subscriptable