In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib as plt
from pickle import dump
from sklearn.metrics import accuracy_score
from sklearn import preprocessing, model_selection, ensemble, linear_model

In [2]:
df = pd.read_csv("../data/AB_NYC_2019.csv")
print("Dimensions of the data: " + str(df.shape[0]) + " and " + str(df.shape[1])) 

Dimensions of the data: 48895 and 16


# DATA ANALYSIS
### TODO : Visualization

In [3]:
df.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,38843.0,48895.0,48895.0
mean,19017140.0,67620010.0,40.728949,-73.95217,152.720687,7.029962,23.274466,1.373221,7.143982,112.781327
std,10983110.0,78610970.0,0.05453,0.046157,240.15417,20.51055,44.550582,1.680442,32.952519,131.622289
min,2539.0,2438.0,40.49979,-74.24442,0.0,1.0,0.0,0.01,1.0,0.0
25%,9471945.0,7822033.0,40.6901,-73.98307,69.0,1.0,1.0,0.19,1.0,0.0
50%,19677280.0,30793820.0,40.72307,-73.95568,106.0,3.0,5.0,0.72,1.0,45.0
75%,29152180.0,107434400.0,40.763115,-73.936275,175.0,5.0,24.0,2.02,2.0,227.0
max,36487240.0,274321300.0,40.91306,-73.71299,10000.0,1250.0,629.0,58.5,327.0,365.0


In [4]:
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

In [5]:
df.dtypes
#Dependent variable: price

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

# FEATURE ENGINEERING
### ~~TODO: Last review can be modified to 3 sub category ~~
### ~~TODO: New feature about how old is the last review <-- other features gives enough data about it~~
### TODO: Handle NaN values

In [6]:
#Id, name, host_id, host_name are not important
df = df.drop(["id","name","host_id","host_name","latitude","longitude"],axis=1)


In [7]:
type_set=set()
for i in range(len(df.dtypes)):
    type_set.add(df.dtypes[i])
print(type_set)

{dtype('int64'), dtype('float64'), dtype('O')}


In [8]:
df.dropna(inplace=True)

In [9]:
df.head()

Unnamed: 0,neighbourhood_group,neighbourhood,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,Kensington,Private room,149,1,9,2018-10-19,0.21,6,365
1,Manhattan,Midtown,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
3,Brooklyn,Clinton Hill,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,Manhattan,East Harlem,Entire home/apt,80,10,9,2018-11-19,0.1,1,0
5,Manhattan,Murray Hill,Entire home/apt,200,3,74,2019-06-22,0.59,1,129


In [10]:
# If the number of reviews is 0, the review related columns will be NaN 
print(df[df["number_of_reviews"]==0]["last_review"].value_counts())
print(df[df["number_of_reviews"]==0]["reviews_per_month"].value_counts())

Series([], Name: last_review, dtype: int64)
Series([], Name: reviews_per_month, dtype: int64)


In [11]:
df["year"] = df["last_review"].str[:4]
df["year"] = df["year"].astype("int32")
df.head()

Unnamed: 0,neighbourhood_group,neighbourhood,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,year
0,Brooklyn,Kensington,Private room,149,1,9,2018-10-19,0.21,6,365,2018
1,Manhattan,Midtown,Entire home/apt,225,1,45,2019-05-21,0.38,2,355,2019
3,Brooklyn,Clinton Hill,Entire home/apt,89,1,270,2019-07-05,4.64,1,194,2019
4,Manhattan,East Harlem,Entire home/apt,80,10,9,2018-11-19,0.1,1,0,2018
5,Manhattan,Murray Hill,Entire home/apt,200,3,74,2019-06-22,0.59,1,129,2019


In [12]:
# Init a new column
df["part_of_year"] = "Q"

month = df["last_review"].str[5:7]

q1=["01","02","03"]
q2=["04","05","06"]
q3=["07","08","09"]
q4=["10","11","12"]

q = [q1,q2,q3,q4]

# Update the column
for i in range(len(q)):
    df.loc[month.isin(q[i]), "part_of_year"] = "Q{}".format(i+1)
    
# Create review category    
df["review_bin"] = pd.qcut(df["number_of_reviews"],
                          q=[0,.2,.4,.6,.8,1])

In [13]:
df.shape

(38843, 13)

In [14]:
columns_to_change=["neighbourhood",'neighbourhood_group',"room_type","part_of_year","review_bin"]
columns_to_drop = ["last_review","calculated_host_listings_count",
                   "availability_365","reviews_per_month","number_of_reviews"]
columns_to_change.extend(columns_to_drop)

df.drop(columns_to_drop,axis=1,inplace=True, errors="ignore")

for i in range(len(columns_to_change)):
    try:
        df = pd.concat([df,pd.get_dummies(df[columns_to_change[i]], prefix=columns_to_change[i])],axis=1)
        print("Column {} has been removed from the df.".format(columns_to_change[i]))
    except KeyError:
        print("Column {} is not in the df (anymore).".format(columns_to_change[i]))

df.drop(columns_to_change,axis=1,errors='ignore', inplace=True)

df.head()

Column neighbourhood has been removed from the df.
Column neighbourhood_group has been removed from the df.
Column room_type has been removed from the df.
Column part_of_year has been removed from the df.
Column review_bin has been removed from the df.
Column last_review is not in the df (anymore).
Column calculated_host_listings_count is not in the df (anymore).
Column availability_365 is not in the df (anymore).
Column reviews_per_month is not in the df (anymore).
Column number_of_reviews is not in the df (anymore).


Unnamed: 0,price,minimum_nights,year,neighbourhood_Allerton,neighbourhood_Arden Heights,neighbourhood_Arrochar,neighbourhood_Arverne,neighbourhood_Astoria,neighbourhood_Bath Beach,neighbourhood_Battery Park City,...,room_type_Shared room,part_of_year_Q1,part_of_year_Q2,part_of_year_Q3,part_of_year_Q4,"review_bin_(0.999, 2.0]","review_bin_(2.0, 6.0]","review_bin_(6.0, 15.0]","review_bin_(15.0, 44.0]","review_bin_(44.0, 629.0]"
0,149,1,2018,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
1,225,1,2019,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,89,1,2019,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
4,80,10,2018,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
5,200,3,2019,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [15]:
df.shape

(38843, 238)

In [16]:
df.columns

Index(['price', 'minimum_nights', 'year', 'neighbourhood_Allerton',
       'neighbourhood_Arden Heights', 'neighbourhood_Arrochar',
       'neighbourhood_Arverne', 'neighbourhood_Astoria',
       'neighbourhood_Bath Beach', 'neighbourhood_Battery Park City',
       ...
       'room_type_Shared room', 'part_of_year_Q1', 'part_of_year_Q2',
       'part_of_year_Q3', 'part_of_year_Q4', 'review_bin_(0.999, 2.0]',
       'review_bin_(2.0, 6.0]', 'review_bin_(6.0, 15.0]',
       'review_bin_(15.0, 44.0]', 'review_bin_(44.0, 629.0]'],
      dtype='object', length=238)

In [17]:
df.head()

Unnamed: 0,price,minimum_nights,year,neighbourhood_Allerton,neighbourhood_Arden Heights,neighbourhood_Arrochar,neighbourhood_Arverne,neighbourhood_Astoria,neighbourhood_Bath Beach,neighbourhood_Battery Park City,...,room_type_Shared room,part_of_year_Q1,part_of_year_Q2,part_of_year_Q3,part_of_year_Q4,"review_bin_(0.999, 2.0]","review_bin_(2.0, 6.0]","review_bin_(6.0, 15.0]","review_bin_(15.0, 44.0]","review_bin_(44.0, 629.0]"
0,149,1,2018,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
1,225,1,2019,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,89,1,2019,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
4,80,10,2018,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
5,200,3,2019,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [18]:
type_set=set()
for i in range(len(df.dtypes)):
    type_set.add(df.dtypes[i])
print(type_set)

{dtype('int64'), dtype('int32'), dtype('uint8')}


In [19]:
x_data = df.drop("price",axis=1)
y_data = df["price"]
print(x_data.shape)
print(y_data.shape)

X_train, X_holdout, y_train, y_holdout = model_selection.train_test_split(x_data, 
                                                                          y_data,
                                                                          test_size=0.2,
                                                                          random_state=42)

print(X_train.shape)

(38843, 237)
(38843,)
(31074, 237)


In [20]:
names = X_train.columns
scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=names)

# save the scaler


In [21]:
dump(scaler, open('scaler.pkl', 'wb'))

In [22]:
y_train.shape

(31074,)

In [23]:
X_train_scaled.shape

(31074, 237)

In [24]:
scores = []

# estimators = np.arange(10, 200, 10)
# scores = []
# for n in estimators:
#     model.set_params(n_estimators=n)

regressor = ensemble.RandomForestRegressor(n_estimators=100,random_state=42,verbose=1)

model = regressor.fit(X_train,y_train)

# cv = model_selection.KFold(n_splits=5,shuffle=False)

# for train_index, test_index in cv.split(X_train_scaled):
#     print("Train Index: ", train_index, "\n")
#     print("Test Index: ", test_index)

#     X_train, X_test = X_train_scaled.iloc[train_index], X_train_scaled.iloc[test_index]
#     Y_train = y_train.iloc[train_index]
#     Y_test = y_train.iloc[test_index]

    
#     model = regressor.fit(X_train, y_train)
    
#     scores.append(regressor.score(X_test, y_test))
    
#     print(regressor.score(X_test, y_test))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   56.0s finished


In [25]:
model.score(X_holdout, y_holdout)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.4s finished


0.037166647673508635

In [26]:
reg = linear_model.LinearRegression().fit(X_train,y_train)

In [27]:
reg.score(X_holdout,y_holdout)

-191834696127807.5