In [26]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.metrics import balanced_accuracy_score, f1_score, confusion_matrix,mean_squared_error,mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import median_absolute_error as meae
from sklearn.metrics import mean_squared_error as mse
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold, cross_val_score
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [27]:
nyc_data = pd.read_csv('../data/nyc_without_featureselection.csv')
nyc_data.head()

Unnamed: 0,host_id,neighbourhood_group,neighbourhood,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,2787,1,108,1,5.010635,1,9,0.21,6,365
1,2845,2,127,0,5.420535,1,45,0.38,2,355
2,4632,2,94,1,5.01728,3,0,0.0,1,365
3,4869,1,41,0,4.49981,1,270,4.64,1,194
4,7192,2,61,0,4.394449,10,9,0.1,1,0


In [28]:
nyc_data_1 = nyc_data.copy()
nyc_data_1 = nyc_data.drop('price',axis=1)
X = nyc_data_1
y = nyc_data["price"]

In [29]:
print(X.shape)
print(X.dtypes)

(48895, 9)
host_id                             int64
neighbourhood_group                 int64
neighbourhood                       int64
room_type                           int64
minimum_nights                      int64
number_of_reviews                   int64
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object


In [30]:
print(y.shape)
print(y.dtypes)

(48895,)
float64


In [31]:
X = preprocessing.StandardScaler().fit_transform(X)

To Divide the dataset into train and test data

In [32]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [33]:
x_train = preprocessing.StandardScaler().fit_transform(x_train)
x_test = preprocessing.StandardScaler().fit_transform(x_test)

In [34]:
kf = KFold(n_splits=5)

Function to print Train and Test metric scores for k-fold

In [35]:
def metrics_table_kfold(model, x_test, y_test,x_train,y_train):
    metrics = []
    
    # Train Test
    ypred_train = model.predict(x_train)
    mae_train = mean_absolute_error(y_train, ypred_train)    
    metrics.append(mae_train)   
    mse_train = mean_squared_error(y_train, ypred_train)   
    metrics.append(mse_train)   
    r2_Train = r2_score(y_train, ypred_train, multioutput='variance_weighted')   
    metrics.append(r2_Train)  
    # Test Data
    ypred_test = model.predict(x_test)
    mae_Test = mean_absolute_error(y_test, ypred_test)   
    metrics.append(mae_Test)    
    mse_Test = mean_squared_error(y_test, ypred_test)  
    metrics.append(mse_Test)
    r2_Test = r2_score(y_test, ypred_test, multioutput='variance_weighted')  
    metrics.append(r2_Test)
    
    return metrics

Function to print Train and Test metric scores for k-fold

In [36]:
def metrics_train_split(model, x_test, y_test,x_train,y_train):
    metrics = []
    
    print('\nTrain data')
    
    ypred_train = model.predict(x_train)
    mae_train = mean_absolute_error(y_train, ypred_train)
    print('MAE_train: ', round(mae_train, 3))   
    mse_train = mean_squared_error(y_train, ypred_train)
    print("MSE_train: %.3f" % mse_train)   
    r2_Train = r2_score(y_train, ypred_train, multioutput='variance_weighted')
    print("R2_train: ",round(r2_Train,3))   
    
    print('\nTest data')
    
    ypred_test = model.predict(x_test)
    mae_Test = mean_absolute_error(y_test, ypred_test)
    print('MAE_test: ', round(mae_Test, 3))  
    mse_Test = mean_squared_error(y_test, ypred_test)
    print('MAE_test: ', round(mse_Test, 3))  
    r2_Test = r2_score(y_test, ypred_test, multioutput='variance_weighted')
    print("R2_test: ",round(r2_Test,3)) 

Model 1: Ridge Regression 

Using Train Split Method

In [37]:
from sklearn.linear_model import Ridge

ridge = Ridge(normalize=True, alpha = 0.1).fit(x_train, y_train)
print("Training Score " + str(ridge.score(x_train, y_train)))
print("Test Score " + str(ridge.score(x_test, y_test)))
metrics_train_split(ridge,x_test,y_test,x_train,y_train)

Training Score 0.402554401137569
Test Score 0.3980644766260587

Train data
MAE_train:  0.394
MSE_train: 0.289
R2_train:  0.403

Test data
MAE_test:  0.396
MAE_test:  0.29
R2_test:  0.398


Using K-Fold Cross Validation Method

For RIDGE Regression k-fold cross validation method Tried with alpha values = {0.1,0.3,0.5,0.7,0.9,1} and 0.1 gave good R square values.


In [38]:
from sklearn.linear_model import Ridge

mae_train = []
mse_train = []
r2_train = []
mae_test = []
mse_test = []
r2_test = []
i=1

for trainIndex, testIndex in kf.split(X):
    xTrain, xTest, yTrain, yTest = X[trainIndex], X[testIndex], y[trainIndex], y[testIndex]
    
    ridge = Ridge(normalize=True, alpha = 0.1).fit(xTrain, yTrain)
    print("Training Score " + str(ridge.score(xTrain, yTrain)))
    print("Test Score " + str(ridge.score(xTest, yTest)))
    metrics = metrics_table_kfold(ridge,xTest,yTest,xTrain,yTrain)
                                                   
    mae_train.append(metrics[0])
    mse_train.append(metrics[1])
    r2_train.append(metrics[2])
    mae_test.append(metrics[3])
    mse_test.append(metrics[4])
    r2_test.append(metrics[5])
    i=i+1

print("\nTrain and Test Scores of Ridge Regression Using K-Fold Cross Validation \n")
print('MAE_train: ', round(np.mean(mae_train), 3))
print('MSE_train: ', round(np.mean(mse_train), 3))
print('R2_train: ', round(np.mean(r2_train), 3))
print('MAE_test: ', round(np.mean(mae_test), 3))
print('MAE_test: ', round(np.mean(mse_test), 3))
print('R2_test: ', round(np.mean(r2_test), 3))

Training Score 0.4132653014695966
Test Score 0.3331989854314915
Training Score 0.40052667786713725
Test Score 0.40948775786587277
Training Score 0.4082422795418086
Test Score 0.36382992657612945
Training Score 0.3955658259668401
Test Score 0.4206978601574872
Training Score 0.39220943246844575
Test Score 0.40920007147517046

Train and Test Scores of Ridge Regression Using K-Fold Cross Validation 

MAE_train:  0.394
MSE_train:  0.289
R2_train:  0.402
MAE_test:  0.397
MAE_test:  0.293
R2_test:  0.387


Model 2: Linear Regression

In [39]:
lr = LinearRegression(normalize=True).fit(x_train, y_train)
# Training Score
print("Training Score " + str(lr.score(x_train, y_train)))
# Test Score
print("Test Score " + str(lr.score(x_test, y_test)))
metrics_train_split(lr,x_test,y_test,x_train,y_train)

Training Score 0.40589503435125296
Test Score 0.40100156040482215

Train data
MAE_train:  0.392
MSE_train: 0.288
R2_train:  0.406

Test data
MAE_test:  0.394
MAE_test:  0.288
R2_test:  0.401


In [40]:
kf = KFold(n_splits=5)

coefs = []
mae_train = []
mse_train = []
r2_train = []
mae_test = []
mse_test = []
r2_test = []
i=1
for trainIndex, testIndex in kf.split(X):
    
    xTrain, xTest, yTrain, yTest = X[trainIndex], X[testIndex], y[trainIndex], y[testIndex]
    
    lr = LinearRegression(normalize=True).fit(xTrain, yTrain)
    # Training Score
    print("Training Score in Fold "+str(i)+" is "+str(round(lr.score(xTrain,yTrain),3)))
    # Test Score
    print("Test Score in Fold " + str(i)+" is "+str(round(lr.score(xTest, yTest),3)))
   
    metrics = metrics_table_kfold(lr,xTest,yTest,xTrain,yTrain)
                                                   
    mae_train.append(metrics[0])
    mse_train.append(metrics[1])
    r2_train.append(metrics[2])
    mae_test.append(metrics[3])
    mse_test.append(metrics[4])
    r2_test.append(metrics[5])
    i=i+1
    
print("\nTrain and Test Scores of Linear Regression Using K-Fold Cross Validation \n")
print('MAE_train: ', round(np.mean(mae_train), 3))
print('MSE_train: ', round(np.mean(mse_train), 3))
print('R2_train: ', round(np.mean(r2_train), 3))
print('MAE_test: ', round(np.mean(mae_test), 3))
print('MAE_test: ', round(np.mean(mse_test), 3))
print('R2_test: ', round(np.mean(r2_test), 3))

Training Score in Fold 1 is 0.417
Test Score in Fold 1 is 0.329
Training Score in Fold 2 is 0.404
Test Score in Fold 2 is 0.409
Training Score in Fold 3 is 0.412
Test Score in Fold 3 is 0.366
Training Score in Fold 4 is 0.399
Test Score in Fold 4 is 0.426
Training Score in Fold 5 is 0.395
Test Score in Fold 5 is 0.421

Train and Test Scores of Linear Regression Using K-Fold Cross Validation 

MAE_train:  0.393
MSE_train:  0.287
R2_train:  0.405
MAE_test:  0.396
MAE_test:  0.291
R2_test:  0.39


Model 3: SVR

In [43]:
from sklearn.svm import SVR


clf = SVR(C=1.0, epsilon=0.2,gamma='auto')
clf.fit(x_train, y_train)

# Training Score
print("Training Score " + str(clf.score(x_train, y_train)))
# Test Score
print("Test Score " + str(clf.score(x_test, y_test)))
metrics_train_split(clf,x_test,y_test,x_train,y_train)

Training Score 0.5354375924585115
Test Score 0.5178179111811203

Train data
MAE_train:  0.334
MSE_train: 0.225
R2_train:  0.535

Test data
MAE_test:  0.343
MAE_test:  0.232
R2_test:  0.518


In [None]:
from sklearn.svm import SVR

mae_train = []
mse_train = []
r2_train = []
mae_test = []
mse_test = []
r2_test = []
i=1

for trainIndex, testIndex in kf.split(X):
    xTrain, xTest, yTrain, yTest = X[trainIndex], X[testIndex], y[trainIndex], y[testIndex]
    
    clf = SVR(C=1.0, epsilon=0.2,gamma='auto')
    clf.fit(xTrain, yTrain)

    # Training Score
    print("Training Score in Fold "+str(i)+" is "+str(round(clf.score(xTrain,yTrain),3)))
    # Test Score
    print("Test Score in Fold " + str(i)+" is "+str(round(clf.score(xTest, yTest),3)))
    metrics = metrics_table_kfold(clf,xTest,yTest,xTrain,yTrain)
                                                   
    mae_train.append(metrics[0])
    mse_train.append(metrics[1])
    r2_train.append(metrics[2])
    mae_test.append(metrics[3])
    mse_test.append(metrics[4])
    r2_test.append(metrics[5])
    i=i+1

print("\nTrain and Test Scores of SVM Regression Using K-Fold Cross Validation \n")
print('MAE_train: ', round(np.mean(mae_train), 3))
print('MSE_train: ', round(np.mean(mse_train), 3))
print('R2_train: ', round(np.mean(r2_train), 3))
print('MAE_test: ', round(np.mean(mae_test), 3))
print('MAE_test: ', round(np.mean(mse_test), 3))
print('R2_test: ', round(np.mean(r2_test), 3))

Training Score in Fold 1 is 0.551
Test Score in Fold 1 is 0.418
Training Score in Fold 2 is 0.54
Test Score in Fold 2 is 0.498
Training Score in Fold 3 is 0.544
Test Score in Fold 3 is 0.474
Training Score in Fold 4 is 0.527
Test Score in Fold 4 is 0.541
Training Score in Fold 5 is 0.518
Test Score in Fold 5 is 0.548


Gradient Boosting Model using train-test split

In [41]:
### Gradiend boosting with different learning rate values
from sklearn import ensemble

lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:

    params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': learning_rate, 'loss': 'ls'}
    clf = ensemble.GradientBoostingRegressor(**params)
    clf.fit(x_train, y_train)
    
        
    print("\nLearning rate :" + str(learning_rate))
    # Training Score
    print("Training Score " + str(clf.score(x_train, y_train)))
    # Test Score
    print("Test Score " + str(clf.score(x_test, y_test)))
    metrics_train_split(clf,x_test,y_test,x_train,y_train)


Learning rate :0.05
Training Score 0.6225177749383515
Test Score 0.527276368411363

Train data
MAE_train:  0.305
MSE_train: 0.183
R2_train:  0.623

Test data
MAE_test:  0.348
MAE_test:  0.228
R2_test:  0.527

Learning rate :0.075
Training Score 0.6428641135606709
Test Score 0.5228760800032106

Train data
MAE_train:  0.298
MSE_train: 0.173
R2_train:  0.643

Test data
MAE_test:  0.35
MAE_test:  0.23
R2_test:  0.523

Learning rate :0.1
Training Score 0.6569570932372176
Test Score 0.521560729808193

Train data
MAE_train:  0.292
MSE_train: 0.166
R2_train:  0.657

Test data
MAE_test:  0.35
MAE_test:  0.23
R2_test:  0.522

Learning rate :0.25
Training Score 0.7201711251403425
Test Score 0.4857428330580391

Train data
MAE_train:  0.267
MSE_train: 0.135
R2_train:  0.72

Test data
MAE_test:  0.365
MAE_test:  0.248
R2_test:  0.486

Learning rate :0.5
Training Score 0.7725801799588321
Test Score 0.4101940564217107

Train data
MAE_train:  0.242
MSE_train: 0.110
R2_train:  0.773

Test data
MAE_test

Gradient Boosting Model using k-fold

In [42]:
kf = KFold(n_splits=5)

mae_train = []
mse_train = []
r2_train = []
mae_test = []
mse_test = []
r2_test = []
lr_list = [ 0.075, 0.1, 0.25]

for learning_rate in lr_list:
    i=1
    for trainIndex, testIndex in kf.split(X):
        xTrain, xTest, yTrain, yTest = X[trainIndex], X[testIndex], y[trainIndex], y[testIndex]
    
        params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': learning_rate, 'loss': 'ls'}
        clf = ensemble.GradientBoostingRegressor(**params)
        clf.fit(xTrain, yTrain)
        print("\nLearning rate :" + str(learning_rate))
        
         # Training Score
        print("Training Score in Fold "+str(i)+" is "+str(round(clf.score(xTrain,yTrain),3)))
        # Test Score
        print("Test Score in Fold " + str(i)+" is "+str(round(clf.score(xTest, yTest),3)))
        metrics = metrics_table_kfold(clf,xTest,yTest,xTrain,yTrain)
                                                   
        mae_train.append(metrics[0])
        mse_train.append(metrics[1])
        r2_train.append(metrics[2])
        mae_test.append(metrics[3])
        mse_test.append(metrics[4])
        r2_test.append(metrics[5])
        i=i+1


    print("\nTrain and Test Scores of Gradient Boost Classifier Using K-Fold Cross Validation \n")
    print('MAE_train: ', round(np.mean(mae_train), 3))
    print('MSE_train: ', round(np.mean(mse_train), 3))
    print('R2_train: ', round(np.mean(r2_train), 3))
    print('MAE_test: ', round(np.mean(mae_test), 3))
    print('MAE_test: ', round(np.mean(mse_test), 3))
    print('R2_test: ', round(np.mean(r2_test), 3))


Learning rate :0.075
Training Score in Fold 1 is 0.655
Test Score in Fold 1 is 0.491

Learning rate :0.075
Training Score in Fold 2 is 0.647
Test Score in Fold 2 is 0.574

Learning rate :0.075
Training Score in Fold 3 is 0.65
Test Score in Fold 3 is 0.54

Learning rate :0.075
Training Score in Fold 4 is 0.639
Test Score in Fold 4 is 0.606

Learning rate :0.075
Training Score in Fold 5 is 0.629
Test Score in Fold 5 is 0.56

Train and Test Scores of Gradient Boost Classifier Using K-Fold Cross Validation 

MAE_train:  0.298
MSE_train:  0.172
R2_train:  0.644
MAE_test:  0.33
MAE_test:  0.213
R2_test:  0.554

Learning rate :0.1
Training Score in Fold 1 is 0.67
Test Score in Fold 1 is 0.49

Learning rate :0.1
Training Score in Fold 2 is 0.661
Test Score in Fold 2 is 0.575

Learning rate :0.1
Training Score in Fold 3 is 0.665
Test Score in Fold 3 is 0.539

Learning rate :0.1
Training Score in Fold 4 is 0.654
Test Score in Fold 4 is 0.607

Learning rate :0.1
Training Score in Fold 5 is 0.646