In [90]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [91]:
df = pd.read_csv("AB_NYC_2019.csv")
print(df.shape)
print(df.info())

(48895, 16)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
id                                48895 non-null int64
name                              48879 non-null object
host_id                           48895 non-null int64
host_name                         48874 non-null object
neighbourhood_group               48895 non-null object
neighbourhood                     48895 non-null object
latitude                          48895 non-null float64
longitude                         48895 non-null float64
room_type                         48895 non-null object
price                             48895 non-null int64
minimum_nights                    48895 non-null int64
number_of_reviews                 48895 non-null int64
last_review                       38843 non-null object
reviews_per_month                 38843 non-null float64
calculated_host_listings_count    48895 non-null int64
availability_365                  48895 no

In [121]:
# Hence, we see that there are 16 columns and 48895 rows
# Missing values are there in columns name, host_name, last_review, reviews_per_month. 
# ID, name and host_name column has no affect on Price. Hence can be removed
df1 = df.drop(['id', 'name', 'host_name'],axis=1)
print(df1.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 13 columns):
host_id                           48895 non-null int64
neighbourhood_group               48895 non-null object
neighbourhood                     48895 non-null object
latitude                          48895 non-null float64
longitude                         48895 non-null float64
room_type                         48895 non-null object
price                             48895 non-null int64
minimum_nights                    48895 non-null int64
number_of_reviews                 48895 non-null int64
last_review                       38843 non-null object
reviews_per_month                 38843 non-null float64
calculated_host_listings_count    48895 non-null int64
availability_365                  48895 non-null int64
dtypes: float64(3), int64(6), object(4)
memory usage: 4.8+ MB
None


In [122]:
# now we have null values in last_review and reviews_per_month
df1.isnull().sum()

host_id                               0
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [123]:
# number of null values in last_review and reviews_per_months is same. hence there seems to be a relation. However, we cannot imput 
# last_review but we can for reviews_per_month
df1.head(10)

Unnamed: 0,host_id,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2787,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2845,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,4632,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,4869,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,7192,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0
5,7322,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,74,2019-06-22,0.59,1,129
6,7356,Brooklyn,Bedford-Stuyvesant,40.68688,-73.95596,Private room,60,45,49,2017-10-05,0.4,1,0
7,8967,Manhattan,Hell's Kitchen,40.76489,-73.98493,Private room,79,2,430,2019-06-24,3.47,1,220
8,7490,Manhattan,Upper West Side,40.80178,-73.96723,Private room,79,2,118,2017-07-21,0.99,1,0
9,7549,Manhattan,Chinatown,40.71344,-73.99037,Entire home/apt,150,1,160,2019-06-09,1.33,4,188


In [124]:
df1.loc[df1['reviews_per_month'].isnull(),'reviews_per_month']=0
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 13 columns):
host_id                           48895 non-null int64
neighbourhood_group               48895 non-null object
neighbourhood                     48895 non-null object
latitude                          48895 non-null float64
longitude                         48895 non-null float64
room_type                         48895 non-null object
price                             48895 non-null int64
minimum_nights                    48895 non-null int64
number_of_reviews                 48895 non-null int64
last_review                       38843 non-null object
reviews_per_month                 48895 non-null float64
calculated_host_listings_count    48895 non-null int64
availability_365                  48895 non-null int64
dtypes: float64(3), int64(6), object(4)
memory usage: 4.8+ MB


In [125]:
# remove last_review as we have dealt with reviews_per_month by imputing missing values with 0
df1 = df1.drop('last_review',axis = 1)
df1.head()

Unnamed: 0,host_id,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,2787,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,0.21,6,365
1,2845,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,0.38,2,355
2,4632,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,0.0,1,365
3,4869,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,4.64,1,194
4,7192,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,0.1,1,0


In [126]:
# analyse the statistics of price group by neighbourhood. we observe there is minimum price as 0
df1.groupby('neighbourhood_group')['price'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
neighbourhood_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Bronx,1091.0,87.496792,106.709349,0.0,45.0,65.0,99.0,2500.0
Brooklyn,20104.0,124.383207,186.873538,0.0,60.0,90.0,150.0,10000.0
Manhattan,21661.0,196.875814,291.383183,0.0,95.0,150.0,220.0,10000.0
Queens,5666.0,99.517649,167.102155,10.0,50.0,75.0,110.0,10000.0
Staten Island,373.0,114.812332,277.620403,13.0,50.0,75.0,110.0,5000.0


In [127]:
# analyse the statistics of minimum nights group by neighbourhood
df1.groupby('neighbourhood_group')['minimum_nights'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
neighbourhood_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Bronx,1091.0,4.560953,15.631792,1.0,1.0,2.0,3.0,365.0
Brooklyn,20104.0,6.056556,17.632726,1.0,2.0,3.0,5.0,999.0
Manhattan,21661.0,8.579151,24.050857,1.0,1.0,3.0,6.0,1250.0
Queens,5666.0,5.181433,15.028725,1.0,1.0,2.0,3.0,500.0
Staten Island,373.0,4.831099,19.727605,1.0,1.0,2.0,3.0,365.0


In [129]:
# As price cannot be 0 and minimum nights cannot be more than 365; hence we remove these rows
df1 = df1[df1['price']>0]
df1 = df1[df1['minimum_nights']<=365]
df1.info()
df1.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48870 entries, 0 to 48894
Data columns (total 12 columns):
host_id                           48870 non-null int64
neighbourhood_group               48870 non-null object
neighbourhood                     48870 non-null object
latitude                          48870 non-null float64
longitude                         48870 non-null float64
room_type                         48870 non-null object
price                             48870 non-null int64
minimum_nights                    48870 non-null int64
number_of_reviews                 48870 non-null int64
reviews_per_month                 48870 non-null float64
calculated_host_listings_count    48870 non-null int64
availability_365                  48870 non-null int64
dtypes: float64(3), int64(6), object(3)
memory usage: 4.8+ MB


Unnamed: 0,host_id,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,2787,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,0.21,6,365
1,2845,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,0.38,2,355
2,4632,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,0.0,1,365
3,4869,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,4.64,1,194
4,7192,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,0.1,1,0


In [130]:
# analyse the statistics of availability_365 as per neighbourhood
df1.groupby('neighbourhood_group')['availability_365'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
neighbourhood_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Bronx,1090.0,165.794495,135.304079,0.0,37.0,148.0,313.75,365.0
Brooklyn,20089.0,100.159739,126.243695,0.0,0.0,28.0,188.0,365.0
Manhattan,21654.0,111.929667,132.641048,0.0,0.0,36.0,230.0,365.0
Queens,5664.0,144.422493,135.52891,0.0,2.0,98.0,286.0,365.0
Staten Island,373.0,199.678284,131.852,0.0,78.0,219.0,333.0,365.0


In [131]:
# now data is ready for prediction. Calculate pairwise correlation matrix to see how 
#different variables are related to price
corr = df1.corr()
corr.style.background_gradient(cmap='coolwarm').set_precision(2)


Unnamed: 0,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
host_id,1.0,0.02,0.13,0.015,-0.019,-0.14,0.21,0.15,0.2
latitude,0.02,1.0,0.085,0.034,0.03,-0.015,-0.019,0.02,-0.011
longitude,0.13,0.085,1.0,-0.15,-0.074,0.059,0.14,-0.11,0.083
price,0.015,0.034,-0.15,1.0,0.054,-0.048,-0.051,0.057,0.082
minimum_nights,-0.019,0.03,-0.074,0.054,1.0,-0.095,-0.15,0.16,0.16
number_of_reviews,-0.14,-0.015,0.059,-0.048,-0.095,1.0,0.59,-0.072,0.17
reviews_per_month,0.21,-0.019,0.14,-0.051,-0.15,0.59,1.0,-0.047,0.16
calculated_host_listings_count,0.15,0.02,-0.11,0.057,0.16,-0.072,-0.047,1.0,0.23
availability_365,0.2,-0.011,0.083,0.082,0.16,0.17,0.16,0.23,1.0


In [132]:
# one hot encoding for categorical variables using get dummies
df1 = pd.get_dummies(df1,prefix_sep='_', drop_first=True)
df1.head()




Unnamed: 0,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,neighbourhood_group_Brooklyn,...,neighbourhood_Williamsbridge,neighbourhood_Williamsburg,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Private room,room_type_Shared room
0,2787,40.64749,-73.97237,149,1,9,0.21,6,365,1,...,0,0,0,0,0,0,0,0,1,0
1,2845,40.75362,-73.98377,225,1,45,0.38,2,355,0,...,0,0,0,0,0,0,0,0,0,0
2,4632,40.80902,-73.9419,150,3,0,0.0,1,365,0,...,0,0,0,0,0,0,0,0,1,0
3,4869,40.68514,-73.95976,89,1,270,4.64,1,194,1,...,0,0,0,0,0,0,0,0,0,0
4,7192,40.79851,-73.94399,80,10,9,0.1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [133]:
# VIF calculation
# pd.DataFrame(np.linalg.inv(df1.corr().values), index = corr.index, columns=corr.columns)

pd.DataFrame(np.linalg.inv(corr.values), index = corr.index, columns=corr.columns)



Unnamed: 0,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
host_id,1.269276,-0.018199,-0.115148,-0.012748,0.050964,0.524971,-0.523048,-0.152586,-0.226645
latitude,-0.018199,1.013056,-0.102948,-0.048133,-0.033867,-0.012066,0.029345,-0.028241,0.036413
longitude,-0.115148,-0.102948,1.090435,0.16231,0.054185,0.018598,-0.098143,0.148636,-0.110921
price,-0.012748,-0.048133,0.16231,1.038969,-0.01723,0.040106,0.021986,-0.008342,-0.102238
minimum_nights,0.050964,-0.033867,0.054185,-0.01723,1.084169,0.048498,0.139856,-0.120683,-0.196226
number_of_reviews,0.524971,-0.012066,0.018598,0.040106,0.048498,1.778933,-1.108057,0.044738,-0.254619
reviews_per_month,-0.523048,0.029345,-0.098143,0.021986,0.139856,-1.108057,1.806041,0.056984,-0.028035
calculated_host_listings_count,-0.152586,-0.028241,0.148636,-0.008342,-0.120683,0.044738,0.056984,1.119305,-0.230878
availability_365,-0.226645,0.036413,-0.110921,-0.102238,-0.196226,-0.254619,-0.028035,-0.230878,1.196992


In [134]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X = df1.drop('price', axis=1)
print(X.head())

Price = df1['price']
#Features_train, Features_test, Price_train, Price_test = train_test_split(Features, Price, test_size=0.30, random_state=42)
X_train, X_test, Price_train, Price_test = train_test_split(X, Price, test_size=0.3, random_state=42)

   host_id  latitude  longitude  minimum_nights  number_of_reviews  \
0     2787  40.64749  -73.97237               1                  9   
1     2845  40.75362  -73.98377               1                 45   
2     4632  40.80902  -73.94190               3                  0   
3     4869  40.68514  -73.95976               1                270   
4     7192  40.79851  -73.94399              10                  9   

   reviews_per_month  calculated_host_listings_count  availability_365  \
0               0.21                               6               365   
1               0.38                               2               355   
2               0.00                               1               365   
3               4.64                               1               194   
4               0.10                               1                 0   

   neighbourhood_group_Brooklyn  neighbourhood_group_Manhattan  \
0                             1                              0   
1 

In [135]:
print("X", X.shape)
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)

X (48870, 234)
X_train: (34209, 234)
X_test: (14661, 234)


In [136]:
from sklearn.linear_model import LinearRegression

print("\nLINEAR (subset)")
rgr = LinearRegression(normalize=True)
rgr.fit(X_train, Price_train)
print("model:", rgr)
print("coeffs:", rgr.coef_)
print("intercept:", rgr.intercept_)

#Evaluate the predictions of the model
Price_predictions_regression = rgr.predict(X_test)
print('R squared:', rgr.score(X_test, Price_test))
print(Price_predictions_regression[:10])
print(Price_test[:10])

Price_predictions_regression




LINEAR (subset)
model: LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)
coeffs: [ 8.76508627e-08 -8.65698229e+01 -8.98307383e+02 -2.83912192e-01
 -1.73805458e-01 -3.83398659e+00 -2.25655720e-01  2.00095977e-01
  6.07057578e+14 -1.19041251e+15 -1.59298434e+15  3.07492672e+15
 -3.07492672e+15 -3.07492672e+15  1.59298434e+15  1.59298434e+15
 -6.07057578e+14  1.19041251e+15 -6.07057578e+14  1.59298434e+15
 -3.07492672e+15 -1.72573566e+01  1.59298434e+15  1.59298434e+15
 -6.07057578e+14  1.59298434e+15  1.59298434e+15 -2.57303856e+01
 -6.07057578e+14 -6.07057578e+14 -6.07057578e+14 -6.07057578e+14
  1.59298434e+15  1.59298434e+15 -6.07057578e+14 -5.85588996e+01
 -6.07057578e+14 -6.07057578e+14 -3.07492672e+15 -6.07057578e+14
  1.59298434e+15 -6.07057578e+14 -6.07057578e+14 -4.07263387e+01
 -3.07492672e+15  1.19041251e+15  1.19041251e+15  2.10322207e+02
  1.19041251e+15 -4.78658489e+01  7.78022175e+00 -3.07492672e+15
 -6.07057578e+14  3.15004604e+01 -6.07057578

array([120.22620445, 227.96951323, 333.96951323, ..., 278.21951323,
        48.35120445, 272.21951323])

In [137]:
print("\n\nRIDGE")
from sklearn.linear_model import Ridge
ridge = Ridge(alpha = 0.001, normalize=True)
ridge.fit(X_train, Price_train)
print("model:", ridge)
print("ridge coeffs:", ridge.coef_)
print("ridge intercept:", ridge.intercept_)
Price_predictions = ridge.predict(X_test)
print('Ridge R squared:', ridge.score(X_test, Price_test))
print(Price_predictions_regression[:10])
print(Price_test[:10])




RIDGE
model: Ridge(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.001)
ridge coeffs: [ 8.82793222e-08 -2.21601215e+01 -8.45440590e+02 -2.54199420e-01
 -1.77531248e-01 -3.79613957e+00 -2.16848238e-01  1.96572209e-01
 -2.27205617e+01  1.15884879e+00  2.77588289e+00 -9.87716957e+01
 -2.21727784e+02 -7.43139946e+01  1.07871280e+02 -2.93309949e+01
 -1.22460485e+02  4.71761540e+01 -5.88267639e+01  9.14259494e+01
 -1.66478176e+02 -1.50530288e+01  1.55546348e+02  7.04600080e+01
 -1.46490241e+01  1.68956440e+02  1.13752169e+02 -1.22914948e+01
 -1.03245658e+02 -4.05931658e+01 -7.54292924e+00 -9.99148941e+01
  1.38304244e+02  5.76273818e+01 -6.87923939e+01 -4.53673706e+01
  1.31254137e+01 -1.71461057e+01 -1.67309850e+02 -5.56041464e+00
  1.08794194e+02 -1.70220559e+01 -2.58194596e+01 -3.55649299e+01
 -1.17187874e+02  2.76867068e+01 -4.03781165e+01  2.18546113e+02
 -2.76867990e+01 -3.76119998e+01  1.02646481e+01 -1.02087991

In [138]:
print("\n\nLASSO")
from sklearn.linear_model import Lasso
lasso = Lasso(alpha = 0.001, normalize=True)
lasso.fit(X_train, Price_train)
print(lasso)
print("Lasso coeffs:", lasso.coef_)
print("Lasso intercept:", lasso.intercept_)
Price_predictions = lasso.predict(X_test)
print('Lasso R squared:', lasso.score(X_test, Price_test))
print(Price_predictions[:10])
print(Price_test[:10])



LASSO
Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
Lasso coeffs: [ 8.33795227e-08 -0.00000000e+00 -4.85060130e+02 -2.36174695e-01
 -1.76744017e-01 -3.63874399e+00 -1.98335404e-01  1.93665123e-01
 -0.00000000e+00  4.05928250e+01  4.97312772e+00 -1.16249805e+02
 -6.00983328e+01  2.03671082e+01  8.69353981e+01 -3.49765081e+00
 -7.42903079e+01  6.51663327e+01 -9.99412221e+00  4.84117570e+01
 -1.42419247e+01 -9.98047146e-01  1.19993030e+02  3.29340139e+01
 -1.23793388e+00  1.47510906e+02  5.97899012e+01  0.00000000e+00
 -6.41281658e+01 -2.59311584e+01  1.82794407e+01 -6.16757163e+01
  1.30671393e+02  4.00058353e+01 -4.09573369e+01 -3.05023288e+01
  4.20364698e+01 -7.71894155e+00 -1.06968721e+01  0.00000000e+00
  6.16642811e+01 -1.30564127e+01  4.63469615e+00 -2.13352821e+01
  0.00000000e+00  4.20042275e+01 -2.34149517e+01  1.84679972e+02
 -4.01

In [139]:
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

clf_tree = tree.DecisionTreeClassifier(criterion="entropy", max_depth=5)
clf_tree.fit(X_train, Price_train)
print(clf_tree)
tree_predictions = clf_tree.predict(X_test)
print(confusion_matrix(Price_test, tree_predictions))
print("f1:", f1_score(Price_test, tree_predictions, average=None))
print("f1:", f1_score(Price_test, tree_predictions, average='micro'))

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
f1: [0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.09090909
 0.         0.07462687 0.         0.         0.         0.
 0.10666667 0.         0.         0.         0.         0.
 0.         0.1025641  0.         0.         0.05235602 0.
 0.         0.13675214 0.         0.16212565 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.08333333 0.         0.         0.         0.
 0.04255319 0.         0.         0.         0

  'precision', 'predicted', average, warn_for)


In [85]:
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier(n_estimators=50, criterion='entropy')
clf_rf.fit(X_train, Price_train)
print(clf_rf)
rf_predictions = clf_rf.predict(X_test)
print(confusion_matrix(Price_test, rf_predictions))
print("f1:", f1_score(Price_test, rf_predictions, average='micro'))

MemoryError: could not allocate 321912832 bytes

In [140]:
##### KNN #####

from sklearn.neighbors import KNeighborsClassifier

clf_knn = KNeighborsClassifier(n_neighbors=3)
clf_knn.fit(X_train, Price_train)
print(clf_knn)
#knn_predictions_train = clf_knn.predict(Features_train)
knn_predictions_test = clf_knn.predict(X_test)
#print("Training CM:")
#print(confusion_matrix(Quality_train, knn_predictions_train))
#print("Training f1:", f1_score(Quality_train, knn_predictions_train, average=None))
print("Testing CM:")
print(confusion_matrix(Price_test, knn_predictions_test))
print("Testing f1:", f1_score(Price_test, knn_predictions_test, average='micro'))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')
Testing CM:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Testing f1: 0.057499488438714955


In [141]:
#### NAIVE BAYES #####

from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, Price_train)
print(gnb)
gnb_predictions = gnb.predict(X_test)
print(confusion_matrix(Price_test, gnb_predictions))
print("Testing f1:", f1_score(Price_test, gnb_predictions, average="micro"))

GaussianNB(priors=None, var_smoothing=1e-09)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Testing f1: 0.04228906623013437
