In [2]:
# import the library
%matplotlib inline
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split

# sklearn :: models
from sklearn.linear_model import LinearRegression

# sklearn :: evaluation metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

sns.set_style('whitegrid')

# Load the data

In [3]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
print(df_train.shape, df_test.shape)

(51000, 29) (23111, 28)


In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51000 entries, 0 to 50999
Data columns (total 29 columns):
id                        51000 non-null int64
log_price                 51000 non-null float64
property_type             51000 non-null object
room_type                 51000 non-null object
amenities                 51000 non-null object
accommodates              51000 non-null int64
bathrooms                 50867 non-null float64
bed_type                  51000 non-null object
cancellation_policy       51000 non-null object
cleaning_fee              51000 non-null bool
city                      51000 non-null object
description               51000 non-null object
first_review              40144 non-null object
host_has_profile_pic      50873 non-null object
host_identity_verified    50873 non-null object
host_response_rate        38449 non-null object
host_since                50873 non-null object
instant_bookable          51000 non-null object
last_review               401

In [37]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23111 entries, 0 to 23110
Data columns (total 28 columns):
id                        23111 non-null int64
property_type             23111 non-null object
room_type                 23111 non-null object
amenities                 23111 non-null object
accommodates              23111 non-null int64
bathrooms                 23044 non-null float64
bed_type                  23111 non-null object
cancellation_policy       23111 non-null object
cleaning_fee              23111 non-null bool
city                      23111 non-null object
description               23111 non-null object
first_review              18103 non-null object
host_has_profile_pic      23050 non-null object
host_identity_verified    23050 non-null object
host_response_rate        17363 non-null object
host_since                23050 non-null object
instant_bookable          23111 non-null object
last_review               18110 non-null object
latitude                  2311

# Feature Engineering

In [4]:
# zipcode to numerical columns
df_zipcode = pd.get_dummies(df_train['zipcode'])
df = pd.concat([df_train, df_zipcode], axis=1).fillna(0.0)

# Discover and Visualize the Data

In [5]:
corr_matrix = df.corr()
corr_matrix["log_price"].sort_values(ascending=False)

log_price               1.000000
accommodates            0.566629
bedrooms                0.470452
beds                    0.443833
bathrooms               0.350941
cleaning_fee            0.113761
90265                   0.083276
10014                   0.071377
90291                   0.061428
94114                   0.059492
90210                   0.057635
94109                   0.057139
20003                   0.056698
10019                   0.056220
94110                   0.055475
10011.0                 0.054081
10012                   0.053166
94123                   0.053024
02116                   0.047036
94117                   0.045714
10036                   0.045265
10017                   0.045039
10023                   0.044492
10016                   0.044311
94107                   0.043736
90272                   0.042798
90292                   0.042060
20001                   0.041501
10011                   0.041322
94133                   0.040498
          

In [6]:
# Choose test variables
X_columns = ['accommodates', 'bedrooms', 'beds', 'bathrooms','cleaning_fee', 'latitude','longitude', 'number_of_reviews']
y_column = ['log_price']

In [7]:
df_train = df[X_columns + y_column]
print(df_train.shape)

(51000, 9)


# Model Training


In [8]:
# split the data
threshold = 0.8
X = df[X_columns]
y = df[y_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1.0-threshold, shuffle=True)

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)

X_train (40800, 8)
y_train (40800, 1)
X_test (10200, 8)
y_test (10200, 1)


### Train a linear regression model


In [14]:
linmodel = LinearRegression()
linmodel.fit(X_train, y_train)
print("Predictions:\t", linmodel.predict(X_test))

y_pred = linmodel.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

print('MAE', round(mae, 3))
print('RMSE', round(rmse, 3))

Predictions:	 [[4.49532203]
 [4.83478989]
 [4.41477485]
 ...
 [4.42376937]
 [4.52002473]
 [4.38818613]]
MAE 0.44
RMSE 0.574


### Train a KNN model

In [11]:
from sklearn.neighbors import KNeighborsRegressor
knnmodel = KNeighborsRegressor(10)
knnmodel.fit(X_train, y_train)
print("Predictions:\t", knnmodel.predict(X_test))
y_pred = knnmodel.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

print('MAE', round(mae, 3))
print('RMSE', round(rmse, 3))

Predictions:	 [[6.05401067]
 [4.6040587 ]
 [4.83947253]
 ...
 [5.00671649]
 [5.01484242]
 [4.39271967]]
MAE 0.381
RMSE 0.497


### Train a DecisionTreeRegressor model


In [12]:
from sklearn.tree import DecisionTreeRegressor
treemodel = DecisionTreeRegressor()
treemodel.fit(X_train, y_train)
print("Predictions:\t", treemodel.predict(X_test))
y_pred = treemodel.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

print('MAE', round(mae, 3))
print('RMSE', round(rmse, 3))

Predictions:	 [6.05208917 4.60517019 4.35670883 ... 4.44265126 5.11799381 4.44265126]
MAE 0.44
RMSE 0.594


### Train a RandomForestRegressor model

In [9]:
from sklearn.ensemble import RandomForestRegressor
forest_model = RandomForestRegressor(n_estimators=10, random_state=42)
forest_model.fit(X_train, y_train)
print("Predictions:\t", forest_model.predict(X_test))
y_pred = forest_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

print('MAE', round(mae, 3))
print('RMSE', round(rmse, 3))

  This is separate from the ipykernel package so we can avoid doing imports until


Predictions:	 [6.06605082 4.66804985 4.76104873 ... 5.06881259 5.04780332 4.10442699]
MAE 0.338
RMSE 0.452


# Experiments

In [13]:
def model_training(model_name, model, X_train, y_train):
    model.fit(X_train, y_train)
    return model
    
def model_prediction(model, X_test):
    y_pred = model.predict(X_test)
    return y_pred

def model_evaluation(model_name, y_test, y_pred):
    print(model_name)
    print('MAE', mean_absolute_error(y_test, y_pred))
    print('RMSE', np.sqrt(mean_squared_error(y_test, y_pred)))
    print('')

def run_experiment(model_name, model, X_train, y_train, X_test):
    train_model = model_training(model_name, model, X_train, y_train)
    predictions = model_prediction(train_model, X_test)
    model_evaluation(model_name, y_test, predictions)
    
run_experiment('Linear Regression', LinearRegression(), X_train, y_train, X_test)
run_experiment('KNN 5', KNeighborsRegressor(5), X_train, y_train, X_test)
run_experiment('KNN 2', KNeighborsRegressor(2), X_train, y_train, X_test)
run_experiment('Decision Tree', DecisionTreeRegressor(), X_train, y_train, X_test)
run_experiment('Random Forest 10', RandomForestRegressor(10), X_train, y_train, X_test)
run_experiment('Random Forest 100', RandomForestRegressor(100), X_train, y_train, X_test)

Linear Regression
MAE 0.4380856805805362
RMSE 0.5688885832905373

KNN 5
MAE 0.3901616401243427
RMSE 0.5090273001200086

KNN 2
MAE 0.42605339829762207
RMSE 0.556251215559618

Decision Tree
MAE 0.440826273370622
RMSE 0.5967556322897859



  


Random Forest 10
MAE 0.3416440064516261
RMSE 0.45688385286884303



  


Random Forest 100
MAE 0.3275545375642769
RMSE 0.436953659879342



# Cross-Validation

### Cross Validate DecisionTreeRegressor model

In [19]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(treemodel, X_train, y_train,
                        scoring="neg_mean_squared_error", cv=10)
treermse_scores = np.sqrt(-scores)

In [20]:
def display_scores(scores):
        print("Scores:", scores)
        print("Mean:", scores.mean())
        print("Standard deviation:", scores.std())
        
display_scores(treermse_scores)

Scores: [0.59602422 0.59597392 0.58794239 0.60584471 0.59679805 0.6113046
 0.59458838 0.58929303 0.60767431 0.59796133]
Mean: 0.5983404930906157
Standard deviation: 0.007273552038214862


### Cross Validate Linear regression model

In [21]:
linear_scores = cross_val_score(linmodel, X_train, y_train,
                        scoring="neg_mean_squared_error", cv=10)
linrmse_scores = np.sqrt(-linear_scores)
display_scores(linrmse_scores)

Scores: [0.57115925 0.57826245 0.56551705 0.586686   0.56437463 0.58641674
 0.5809377  0.58454427 0.57835809 0.58398537]
Mean: 0.5780241547863357
Standard deviation: 0.00788690064077281


### Cross Validate RandomForestRegressor model

In [22]:
forest_reg = cross_val_score(forest_model, X_train, y_train,
                        scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_reg)
display_scores(forest_rmse_scores)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Scores: [0.47014836 0.45794068 0.44925165 0.471949   0.46138518 0.46496138
 0.46578507 0.46427745 0.46239954 0.46248656]
Mean: 0.4630584864597112
Standard deviation: 0.006018559694205066


# Fine tuning and Error Analysis

In [15]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': ['auto', 'sqrt', 'log2']},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_model = RandomForestRegressor(n_estimators=10, random_state=42)
grid_search = GridSearchCV(forest_model, param_grid, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X_train, y_train)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': ['auto', 'sqrt', 'log2']}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [16]:
forest_model = grid_search.best_estimator_
forest_model.fit(X_train, y_train)

  


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='log2', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [17]:
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results["mean_test_score"], cv_results["params"]):
    print(np.sqrt(-mean_score), params)

0.5137840915085193 {'max_features': 'auto', 'n_estimators': 3}
0.4686574060159986 {'max_features': 'auto', 'n_estimators': 10}
0.4555303122502129 {'max_features': 'auto', 'n_estimators': 30}
0.5166752047026659 {'max_features': 'sqrt', 'n_estimators': 3}
0.46980606805873454 {'max_features': 'sqrt', 'n_estimators': 10}
0.45601325659309233 {'max_features': 'sqrt', 'n_estimators': 30}
0.5111557667243116 {'max_features': 'log2', 'n_estimators': 3}
0.4654408563638325 {'max_features': 'log2', 'n_estimators': 10}
0.4520956963029755 {'max_features': 'log2', 'n_estimators': 30}
0.5233976390623727 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
0.48260459611779644 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
0.5219777869776334 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
0.4785971421280082 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
0.5156343071827958 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}
0.477623280269566 {'bootstrap': 

In [18]:
Forest_model = RandomForestRegressor(100)
Forest_model.fit(X_train, y_train)
y_pred = Forest_model.predict(X_test)

  


In [27]:
df_test = pd.DataFrame(X_test).copy()
df_test['log_price'] = y_test
df_test['prediction'] = y_pred
df_test['abs_error'] = abs(df_test['log_price']-df_test['prediction'])
df_test.sort_values(by='abs_error', ascending=False).round(3)

Unnamed: 0,accommodates,bedrooms,beds,bathrooms,cleaning_fee,latitude,longitude,number_of_reviews,log_price,prediction,abs_error
31993,2,1.0,1.0,1.0,True,34.057,-118.345,0,7.313,4.258,3.055
5391,1,1.0,1.0,1.0,False,40.800,-73.967,0,6.685,3.784,2.901
1461,2,1.0,1.0,1.0,True,34.078,-118.268,0,7.090,4.247,2.843
16430,1,1.0,1.0,1.0,False,40.712,-73.949,0,7.170,4.427,2.744
37960,2,1.0,1.0,2.0,False,40.713,-73.991,12,7.313,4.632,2.681
4172,1,3.0,1.0,1.0,False,40.801,-73.943,0,6.685,4.085,2.599
6139,1,1.0,1.0,1.0,False,40.756,-73.994,0,7.279,4.699,2.580
21939,1,1.0,1.0,1.0,True,40.677,-73.881,8,6.477,3.902,2.574
23768,2,1.0,1.0,1.0,False,40.717,-74.005,0,7.313,4.743,2.570
22582,1,1.0,1.0,2.0,False,40.708,-73.943,0,6.659,4.102,2.557


# Evaluate Test Set

In [28]:
final_model = grid_search.best_estimator_

# X_test2 = X_test.drop("log_price", axis=1)
# y_test2 = y_test["log_price"].copy()


final_predictions = final_model.predict(X_test)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [29]:
final_rmse

0.4483700543552886

# Prepare submission

In [38]:
df_prediction = df_test[X_columns].fillna(0.0)
df_test['log_price'] = Forest_model.predict(df_prediction)
df_test[['id', 'log_price']]

Unnamed: 0,id,log_price
0,986942,4.130356
1,16436737,4.623885
2,18209634,3.984559
3,15027024,4.274709
4,18074243,4.933524
5,8446914,5.383702
6,19362916,4.654230
7,16116059,6.003726
8,20912583,4.755470
9,13573101,4.716643


In [39]:
df_test[['id', 'log_price']].to_csv('submission_linear_regression.csv', index=False)