# New York City Airbnb

## Imports

In [1]:
import pandas as pd
import numpy as np
import sklearn 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing  import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn import tree
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import VotingRegressor
import graphviz
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering

## Data preparation

### Reading dataset

In [2]:
data_df = pd.read_csv("AB_NYC_2019.csv")
data_df

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.94190,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48890,36484665,Charming one bedroom - newly renovated rowhouse,8232441,Sabrina,Brooklyn,Bedford-Stuyvesant,40.67853,-73.94995,Private room,70,2,0,,,2,9
48891,36485057,Affordable room in Bushwick/East Williamsburg,6570630,Marisol,Brooklyn,Bushwick,40.70184,-73.93317,Private room,40,4,0,,,2,36
48892,36485431,Sunny Studio at Historical Neighborhood,23492952,Ilgar & Aysel,Manhattan,Harlem,40.81475,-73.94867,Entire home/apt,115,10,0,,,1,27
48893,36485609,43rd St. Time Square-cozy single bed,30985759,Taz,Manhattan,Hell's Kitchen,40.75751,-73.99112,Shared room,55,1,0,,,6,2


In [123]:
data_df.columns

Index(['name', 'host_name', 'neighbourhood_group', 'neighbourhood', 'latitude',
       'longitude', 'room_type', 'price', 'minimum_nights',
       'number_of_reviews', 'last_review', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'],
      dtype='object')

In [121]:
data_df.head

<bound method NDFrame.head of                                                     name      host_name  \
0                     Clean & quiet apt home by the park           John   
1                                  Skylit Midtown Castle       Jennifer   
2                    THE VILLAGE OF HARLEM....NEW YORK !      Elisabeth   
3                        Cozy Entire Floor of Brownstone    LisaRoxanne   
4       Entire Apt: Spacious Studio/Loft by central park          Laura   
...                                                  ...            ...   
48890    Charming one bedroom - newly renovated rowhouse        Sabrina   
48891      Affordable room in Bushwick/East Williamsburg        Marisol   
48892            Sunny Studio at Historical Neighborhood  Ilgar & Aysel   
48893               43rd St. Time Square-cozy single bed            Taz   
48894  Trendy duplex in the very heart of Hell's Kitchen     Christophe   

      neighbourhood_group       neighbourhood  latitude  longitude  \

In [122]:
data_df.dtypes

name                               object
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

In [7]:
len(data_df["name"].unique())

47906

In [8]:
len(data_df["room_type"].unique())

3

In [9]:
data_df["reviews_per_month"].unique()

array([2.100e-01, 3.800e-01,       nan, 4.640e+00, 1.000e-01, 5.900e-01,
       4.000e-01, 3.470e+00, 9.900e-01, 1.330e+00, 4.300e-01, 1.500e+00,
       1.340e+00, 9.100e-01, 2.200e-01, 1.200e+00, 1.720e+00, 2.120e+00,
       4.440e+00, 7.000e-02, 1.090e+00, 3.700e-01, 6.100e-01, 7.300e-01,
       1.370e+00, 4.900e-01, 1.110e+00, 2.400e-01, 2.040e+00, 1.420e+00,
       1.650e+00, 2.370e+00, 6.600e-01, 1.410e+00, 1.960e+00, 1.810e+00,
       2.080e+00, 3.900e-01, 2.300e-01, 6.900e-01, 8.400e-01, 2.250e+00,
       5.200e-01, 1.160e+00, 1.010e+00, 6.300e-01, 7.000e-01, 2.820e+00,
       9.000e-01, 1.700e-01, 2.490e+00, 1.190e+00, 3.000e-01, 1.200e-01,
       5.700e-01, 1.600e-01, 4.720e+00, 1.400e+00, 1.260e+00, 1.640e+00,
       1.600e+00, 9.200e-01, 2.000e-01, 1.280e+00, 5.400e-01, 6.200e-01,
       1.500e-01, 5.300e-01, 1.730e+00, 5.000e-02, 1.540e+00, 2.800e-01,
       3.400e+00, 1.570e+00, 1.050e+00, 7.100e-01, 1.100e-01, 2.700e-01,
       1.230e+00, 8.700e-01, 2.090e+00, 6.000e-01, 

### Cleaning dataset

In [3]:
data_df.describe(include="all")

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
count,48895.0,48879,48895.0,48874,48895,48895,48895.0,48895.0,48895,48895.0,48895.0,48895.0,38843,38843.0,48895.0,48895.0
unique,,47905,,11452,5,221,,,3,,,,1764,,,
top,,Hillside Hotel,,Michael,Manhattan,Williamsburg,,,Entire home/apt,,,,2019-06-23,,,
freq,,18,,417,21661,3920,,,25409,,,,1413,,,
mean,19017140.0,,67620010.0,,,,40.728949,-73.95217,,152.720687,7.029962,23.274466,,1.373221,7.143982,112.781327
std,10983110.0,,78610970.0,,,,0.05453,0.046157,,240.15417,20.51055,44.550582,,1.680442,32.952519,131.622289
min,2539.0,,2438.0,,,,40.49979,-74.24442,,0.0,1.0,0.0,,0.01,1.0,0.0
25%,9471945.0,,7822033.0,,,,40.6901,-73.98307,,69.0,1.0,1.0,,0.19,1.0,0.0
50%,19677280.0,,30793820.0,,,,40.72307,-73.95568,,106.0,3.0,5.0,,0.72,1.0,45.0
75%,29152180.0,,107434400.0,,,,40.763115,-73.936275,,175.0,5.0,24.0,,2.02,2.0,227.0


In [4]:
data_df.drop(columns=["id"],inplace=True)
data_df.drop(columns=["host_id"],inplace=True)

In [74]:
data_df["last_review"].unique()

array(['2018-10-19', '2019-05-21', nan, ..., '2017-12-23', '2018-01-29',
       '2018-03-29'], dtype=object)

In [12]:
data_df["reviews_per_month"].unique()

array([2.100e-01, 3.800e-01,       nan, 4.640e+00, 1.000e-01, 5.900e-01,
       4.000e-01, 3.470e+00, 9.900e-01, 1.330e+00, 4.300e-01, 1.500e+00,
       1.340e+00, 9.100e-01, 2.200e-01, 1.200e+00, 1.720e+00, 2.120e+00,
       4.440e+00, 7.000e-02, 1.090e+00, 3.700e-01, 6.100e-01, 7.300e-01,
       1.370e+00, 4.900e-01, 1.110e+00, 2.400e-01, 2.040e+00, 1.420e+00,
       1.650e+00, 2.370e+00, 6.600e-01, 1.410e+00, 1.960e+00, 1.810e+00,
       2.080e+00, 3.900e-01, 2.300e-01, 6.900e-01, 8.400e-01, 2.250e+00,
       5.200e-01, 1.160e+00, 1.010e+00, 6.300e-01, 7.000e-01, 2.820e+00,
       9.000e-01, 1.700e-01, 2.490e+00, 1.190e+00, 3.000e-01, 1.200e-01,
       5.700e-01, 1.600e-01, 4.720e+00, 1.400e+00, 1.260e+00, 1.640e+00,
       1.600e+00, 9.200e-01, 2.000e-01, 1.280e+00, 5.400e-01, 6.200e-01,
       1.500e-01, 5.300e-01, 1.730e+00, 5.000e-02, 1.540e+00, 2.800e-01,
       3.400e+00, 1.570e+00, 1.050e+00, 7.100e-01, 1.100e-01, 2.700e-01,
       1.230e+00, 8.700e-01, 2.090e+00, 6.000e-01, 

In [59]:
data_df["name"].unique()

array(['Clean & quiet apt home by the park', 'Skylit Midtown Castle',
       'THE VILLAGE OF HARLEM....NEW YORK !', ...,
       'Sunny Studio at Historical Neighborhood',
       '43rd St. Time Square-cozy single bed',
       "Trendy duplex in the very heart of Hell's Kitchen"], dtype=object)

### Data selection

In [5]:
data_sel=data_df.sample(frac=0.05, replace=True, random_state=1, axis=0)

In [6]:
data_sel

Unnamed: 0,name,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
33003,"Bliss-Private room close to train, ROOFTOP ter...",Viviane,Brooklyn,Prospect-Lefferts Gardens,40.66357,-73.94509,Private room,60,1,59,2019-07-01,4.65,3,342
12172,4 Bedroom Downtown Brooklyn,Aaron,Brooklyn,Clinton Hill,40.68460,-73.96536,Entire home/apt,330,2,84,2019-06-23,2.03,1,327
5192,Williamsburg Apt. w/ private patio,Sofia,Brooklyn,Williamsburg,40.71623,-73.96415,Private room,115,1,78,2019-07-01,1.31,1,8
32511,West Village Pied-à-Terre,Robert,Manhattan,West Village,40.73308,-74.00456,Entire home/apt,157,2,4,2018-09-09,0.30,1,0
43723,Dreamy Private Bedroom in Prime NYC Location (...,Eyal,Manhattan,Upper West Side,40.80037,-73.95940,Private room,60,30,0,,,32,311
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5314,"2 bedroom with yard, 4 mins to LGA",Telmo,Queens,East Elmhurst,40.76287,-73.88367,Entire home/apt,120,3,102,2019-06-04,1.73,1,300
7585,Lovely Brooklyn Room & Private Bath,Rachel,Brooklyn,Gowanus,40.68202,-73.98151,Private room,125,2,40,2019-06-13,0.77,1,295
26425,"Sunny, spacious room in Bedstuy",Tiffany,Brooklyn,Bedford-Stuyvesant,40.68873,-73.93881,Private room,58,1,118,2019-06-20,5.48,1,21
5381,Private room,Deborah,Bronx,Longwood,40.81321,-73.90259,Private room,100,3,2,2019-01-02,0.22,2,207


### Split dataset

In [7]:
y=data_sel["price"]

In [8]:
data_sel.drop(columns=["price"],inplace=True)

In [9]:
X=data_sel

In [10]:
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.20, random_state=42)

In [11]:
X_train

Unnamed: 0,name,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
48321,Stream-Pressed Paradise (Laundry Room Setup),Rita,Staten Island,Concord,40.60110,-74.07830,Shared room,2,0,,,4,82
23847,Spacious and Bright 1.5 Bedroom by Prospect Park,Jonathan,Brooklyn,Kensington,40.64676,-73.97403,Entire home/apt,13,6,2019-02-22,0.25,1,37
30490,"Great Home&Host, next to 1 train",Dee,Manhattan,Harlem,40.82325,-73.95243,Private room,14,12,2018-12-01,0.82,4,0
19213,LUXURY SUTTON PLACE RESIDENCY~DOORMAN/GYM/ELEV...,Ruchi,Manhattan,Midtown,40.75936,-73.96293,Entire home/apt,30,0,,,49,364
42371,"Cozy, spacious room in Bed-Stuy",Kelly,Brooklyn,Bedford-Stuyvesant,40.68845,-73.94390,Private room,1,7,2019-06-10,2.56,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19438,"Private Studio, Oceanside beautiful and safe area",Gennady + Laura,Brooklyn,Manhattan Beach,40.57821,-73.94122,Entire home/apt,4,91,2019-06-28,2.82,2,215
35451,Fun Leffert's Gardens Suite,Jonathan,Brooklyn,Prospect-Lefferts Gardens,40.66068,-73.95040,Private room,1,31,2019-05-09,3.06,1,0
36422,Clinton Hill 1 Bedroom Brooklyn heart of it all!,Thomas,Brooklyn,Clinton Hill,40.68313,-73.96742,Entire home/apt,5,3,2019-04-11,0.33,2,89
40424,Jfk crash pad 1-2persons in SHARED space,Lakshmee,Queens,Jamaica,40.66715,-73.78346,Shared room,1,65,2019-07-07,10.60,8,320


In [21]:
y_train

48321     30
23847     93
30490     46
19213    250
42371     55
        ... 
19438     55
35451     99
36422    200
40424     39
3122     550
Name: price, Length: 1956, dtype: int64

## Machine Learning Alg.

In [13]:
data_sel.dtypes

name                               object
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

In [12]:
numeric_features=["latitude","longitude","minimum_nights","number_of_reviews","reviews_per_month","calculated_host_listings_count","availability_365"]
nominal_features=["name","host_name","neighbourhood_group","neighbourhood","room_type","last_review"]

In [13]:
#AICI FACE SI PARTE DE CLEANING DATASET
preprocessor = ColumnTransformer([     
    ("num", Pipeline([("imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),                        
                      ("scaler", StandardScaler())]), numeric_features),     
    ("nom", Pipeline([("imputer", SimpleImputer(missing_values=np.nan, strategy="most_frequent")),                       
                      ("binarizer", OneHotEncoder(handle_unknown="ignore"))]), nominal_features)],     
    remainder="drop")

In [14]:
preprocessor.fit(X_train)
X_train=preprocessor.transform(X_train)

In [15]:
X_test=preprocessor.transform(X_test)

### Linear Regression

In [16]:
linreg=LinearRegression()
linreg.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [17]:
scores=cross_val_score(linreg,X_train,y_train,scoring="r2", cv=5)
np.mean(scores)

0.13237856670158235

In [18]:
y_pred=linreg.predict(X_test)

In [19]:
r2_score(y_test,y_pred)

0.042633992782138086

### Decision Tree

In [20]:
X1=data_sel.copy()

In [21]:
X1.drop(columns=["neighbourhood_group"],inplace=True)
X1.drop(columns=["neighbourhood"],inplace=True)
X1.drop(columns=["room_type"],inplace=True)
X1.drop(columns=["last_review"],inplace=True)
X1.drop(columns=["name"],inplace=True)
X1.drop(columns=["host_name"],inplace=True)
X_train1,X_test1,y_train1,y_test1=train_test_split(X1,y, test_size=0.20, random_state=42)
preprocessor1 = ColumnTransformer([     
    ("num", Pipeline([("imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),                        
                      ("scaler", StandardScaler())]), numeric_features)],
    remainder="passthrough"
    )
preprocessor1.fit(X_train1)
X_train1=preprocessor1.transform(X_train1)
X_test1=preprocessor1.transform(X_test1)

In [22]:
clf=tree.DecisionTreeRegressor()
clf.fit(X_train1,y_train1)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [23]:
scores=cross_val_score(clf,X_train1,y_train1,scoring="r2", cv=5)
np.mean(scores)

-0.9837772288576939

In [24]:
clf_depth=tree.DecisionTreeRegressor(max_depth=3)
clf_depth.fit(X_train1,y_train1)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=3,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [25]:
scores=cross_val_score(clf_depth,X_train1,y_train1,scoring="r2", cv=5)
np.mean(scores)

0.12191035804007921

In [67]:
y_pred=clf_depth.predict(X_test1)
r2_score(y_test1, y_pred)

0.1594205675889433

In [26]:
dot_data = tree.export_graphviz(clf_depth, out_file = "tree2.dot",  feature_names=X1.columns)
graph2 = graphviz.Source(dot_data)
!dot -Tpng tree2.dot -o tree2.png

### Support Vector Machine

In [27]:
regr = svm.SVR()
regr.fit(X_train,y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [28]:
scores=cross_val_score(regr,X_train,y_train,scoring="r2", cv=5)
np.mean(scores)

0.06668643952748284

In [29]:
regr1 = svm.SVR(kernel='linear',C=100)
regr1.fit(X_train,y_train)

SVR(C=100, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [30]:
scores=cross_val_score(regr1,X_train,y_train,scoring="r2", cv=5)
np.mean(scores)

0.2981083052719263

In [31]:
y_pred=regr1.predict(X_test)

In [32]:
r2_score(y_test, y_pred)

0.43674000148232883

### K-Nearest-Neighbour

In [33]:
neigh = KNeighborsRegressor(n_neighbors=2)
neigh.fit(X_train,y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                    weights='uniform')

In [34]:
scores=cross_val_score(neigh,X_train,y_train,scoring="r2", cv=5)
np.mean(scores)

-0.12237906999362638

In [35]:
neigh1 = KNeighborsRegressor(n_neighbors=10)
neigh1.fit(X_train,y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                    weights='uniform')

In [36]:
scores=cross_val_score(neigh1,X_train,y_train,scoring="r2", cv=5)
np.mean(scores)

0.28029316717340896

In [37]:
y_pred=neigh1.predict(X_test)

In [38]:
r2_score(y_test, y_pred)

0.22588109214802599

### Multi-Layer Perceptron

In [39]:
mlp = MLPRegressor(random_state=1, max_iter=100)
mlp.fit(X_train, y_train)



MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(100,), learning_rate='constant',
             learning_rate_init=0.001, max_fun=15000, max_iter=100,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=1, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

In [40]:
scores=cross_val_score(mlp,X_train,y_train,scoring="r2", cv=5)
np.mean(scores)



0.20943439431848718

In [41]:
mlp1 = MLPRegressor(random_state=1, max_iter=100,hidden_layer_sizes=(32,32,32))
mlp1.fit(X_train, y_train)



MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(32, 32, 32), learning_rate='constant',
             learning_rate_init=0.001, max_fun=15000, max_iter=100,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=1, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

In [43]:
scores=cross_val_score(mlp1,X_train,y_train,scoring="r2", cv=5)
np.mean(scores)



0.04122786261813431

In [104]:
y_pred=mlp.predict(X_test)

In [105]:
r2_score(y_test, y_pred)

0.3092284911538161

### Ensemble Learning:Voting Regressor

In [44]:
ereg = VotingRegressor(estimators=[('gb', linreg), ('rf', regr), ('lr', neigh)])
ereg = ereg.fit(X_train, y_train)  

In [45]:
scores=cross_val_score(ereg,X_train,y_train,scoring="r2", cv=5)
np.mean(scores)

0.23172616584956734

In [46]:
y_pred=ereg.predict(X_test)

In [47]:
r2_score(y_test, y_pred)

0.2951452836455678