In [2]:
import pandas as pd
import numpy as np

In [114]:
df = pd.read_csv("Airbnb_NYC_2019.csv")

In [115]:
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [116]:
df.shape

(48895, 16)

### Dealing with missing values

In [117]:
# check missing values 
df.isna().sum()

id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

we can drop missing values for host_id, neighbor_hood group since they are relatively small amount compared to the total amount of datasets. 
As for last_review and reviews_per_month, we need to first check the porportion in the total dataset to see the percentage and decide to drop or to impute. 

In [118]:
df.isna().sum()/df.shape[0]

id                                0.000000
name                              0.000327
host_id                           0.000000
host_name                         0.000429
neighbourhood_group               0.000000
neighbourhood                     0.000000
latitude                          0.000000
longitude                         0.000000
room_type                         0.000000
price                             0.000000
minimum_nights                    0.000000
number_of_reviews                 0.000000
last_review                       0.205583
reviews_per_month                 0.205583
calculated_host_listings_count    0.000000
availability_365                  0.000000
dtype: float64

In [119]:
# check data values for last_review
df[['last_review']].head()

Unnamed: 0,last_review
0,2018-10-19
1,2019-05-21
2,
3,2019-07-05
4,2018-11-19


In [120]:
df[['reviews_per_month']].head()

Unnamed: 0,reviews_per_month
0,0.21
1,0.38
2,
3,4.64
4,0.1


The empty values for name, host_name, and last reviews can be dropped, since they seem non-menaingful to impute. We can replace the empty values for reviews per month with 0 values, becuase this means there is no review per month. 

In [121]:
df.dropna(subset = ['name', 'host_name', 'last_review'], inplace = True)

In [122]:
df['reviews_per_month'] = df['reviews_per_month'].fillna(0.00)

In [123]:
#check missing values again 
df.isna().sum()

id                                0
name                              0
host_id                           0
host_name                         0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
last_review                       0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

In [124]:
df.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,38821.0,38821.0,38821.0,38821.0,38821.0,38821.0,38821.0,38821.0,38821.0,38821.0
mean,18100810.0,64245820.0,40.728129,-73.951149,142.332526,5.86922,29.290255,1.373229,5.166611,114.886299
std,10693720.0,75897520.0,0.054991,0.046693,196.994756,17.389026,48.1829,1.680328,26.302954,129.52995
min,2539.0,2438.0,40.50641,-74.24442,0.0,1.0,1.0,0.01,1.0,0.0
25%,8721444.0,7029525.0,40.68864,-73.98246,69.0,1.0,3.0,0.19,1.0,0.0
50%,18872860.0,28370920.0,40.72171,-73.95481,101.0,2.0,9.0,0.72,1.0,55.0
75%,27567460.0,101890500.0,40.76299,-73.93502,170.0,4.0,33.0,2.02,2.0,229.0
max,36455810.0,273841700.0,40.91306,-73.71299,10000.0,1250.0,629.0,58.5,327.0,365.0


what are some components that need to take into considerations for house price?

geography, minimum_nights, number of reviews, reviews per month, calculated_host_listings_count	availability_365. Thus we can exclude id, host_id from our considerations for training data. 

In [125]:
#generate training data 
# drop unrelated information
# neighborhood has the same information as latitude and longitude, thus enighborhood can be dropped
train_df = df.drop(['id','name', 'host_name', 'host_id','neighbourhood','last_review' ], axis=1)

In [126]:
train_df.head()

Unnamed: 0,neighbourhood_group,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,40.64749,-73.97237,Private room,149,1,9,0.21,6,365
1,Manhattan,40.75362,-73.98377,Entire home/apt,225,1,45,0.38,2,355
3,Brooklyn,40.68514,-73.95976,Entire home/apt,89,1,270,4.64,1,194
4,Manhattan,40.79851,-73.94399,Entire home/apt,80,10,9,0.1,1,0
5,Manhattan,40.74767,-73.975,Entire home/apt,200,3,74,0.59,1,129


In [127]:
#check number of unique values in each columne to decide what processing technique to use 
train_df.nunique()

neighbourhood_group                   5
latitude                          17436
longitude                         13639
room_type                             3
price                               581
minimum_nights                       89
number_of_reviews                   393
reviews_per_month                   937
calculated_host_listings_count       47
availability_365                    366
dtype: int64

In [128]:
df.dtypes

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

In [129]:
df['room_type'].unique()

array(['Private room', 'Entire home/apt', 'Shared room'], dtype=object)

Based on the number of unique values and data type for each column. We can apply the following encoding method for text preprocessing: 

1. one hot encoding for neighbor group
2. create grouping for latitude and longitude first? then encode?
3.label encode for room type since size matters
4. conduct normalization/ standardization for all continuous data

In [130]:
#exclude label
train_df.drop(['price'], axis = 1, inplace= True)

In [131]:
train_df.head()

Unnamed: 0,neighbourhood_group,latitude,longitude,room_type,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,40.64749,-73.97237,Private room,1,9,0.21,6,365
1,Manhattan,40.75362,-73.98377,Entire home/apt,1,45,0.38,2,355
3,Brooklyn,40.68514,-73.95976,Entire home/apt,1,270,4.64,1,194
4,Manhattan,40.79851,-73.94399,Entire home/apt,10,9,0.1,1,0
5,Manhattan,40.74767,-73.975,Entire home/apt,3,74,0.59,1,129


### Encoding categorical data

In [39]:
pip install python-geohash

Collecting python-geohash
  Downloading python-geohash-0.8.5.tar.gz (17 kB)
Building wheels for collected packages: python-geohash
  Building wheel for python-geohash (setup.py) ... [?25ldone
[?25h  Created wheel for python-geohash: filename=python_geohash-0.8.5-cp37-cp37m-macosx_10_7_x86_64.whl size=17781 sha256=d0af826f6de750d9657c4ded3de20309c594cc93076c7e5ec35dfe0e24e7e948
  Stored in directory: /Users/cookiepoon/Library/Caches/pip/wheels/ea/62/7a/e8b943f1d8025cd93a93928a162319e56843301c8c06610ffe
Successfully built python-geohash
Installing collected packages: python-geohash
Successfully installed python-geohash-0.8.5
Note: you may need to restart the kernel to use updated packages.


In [132]:
# create geohash code for geographical data 
import geohash as gh
train_df['geohash']=train_df.apply(lambda x: gh.encode(x['latitude'], x['longitude'], precision=5), axis=1)

In [133]:
train_df.head()

Unnamed: 0,neighbourhood_group,latitude,longitude,room_type,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,geohash
0,Brooklyn,40.64749,-73.97237,Private room,1,9,0.21,6,365,dr5rh
1,Manhattan,40.75362,-73.98377,Entire home/apt,1,45,0.38,2,355,dr5ru
3,Brooklyn,40.68514,-73.95976,Entire home/apt,1,270,4.64,1,194,dr5rm
4,Manhattan,40.79851,-73.94399,Entire home/apt,10,9,0.1,1,0,dr72j
5,Manhattan,40.74767,-73.975,Entire home/apt,3,74,0.59,1,129,dr5ru


In [134]:
#drop latltitude longitude
train_df.drop(['latitude', 'longitude'], axis = 1, inplace= True)

In [135]:
train_df.head()

Unnamed: 0,neighbourhood_group,room_type,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,geohash
0,Brooklyn,Private room,1,9,0.21,6,365,dr5rh
1,Manhattan,Entire home/apt,1,45,0.38,2,355,dr5ru
3,Brooklyn,Entire home/apt,1,270,4.64,1,194,dr5rm
4,Manhattan,Entire home/apt,10,9,0.1,1,0,dr72j
5,Manhattan,Entire home/apt,3,74,0.59,1,129,dr5ru


In [136]:
train_df.geohash.nunique()
# there are 60 unique geographical location, should apply target encoding later

60

In [None]:
# one hot encode neighbourhood_group


In [None]:
# label encode based on room type: 'Shared room'<'Private room'<'Entire home/apt'


In [137]:
X = train_df.values.tolist()
y = df['price'].tolist()

In [138]:
#train test split 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42)
print(len(X_train))
print(len(X_test))
print(len(y_train))
print(len(y_test))

26010
12811
26010
12811


In [139]:
#create train and test dataframe for target encoding later
df_train = pd.DataFrame(X_train)
df_test = pd.DataFrame(X_test)

In [140]:
df_train.iloc[:,7]

0        dr5rt
1        dr5x9
2        dr5rt
3        dr72m
4        dr5rv
         ...  
26005    dr5rm
26006    dr5ru
26007    dr5xc
26008    dr5rv
26009    dr5ru
Name: 7, Length: 26010, dtype: object

In [59]:
pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.2.2-py2.py3-none-any.whl (80 kB)
[K     |████████████████████████████████| 80 kB 2.2 MB/s eta 0:00:01
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2
Note: you may need to restart the kernel to use updated packages.


In [141]:
# target encode on geolocations, since the amount of unique values are large
# if we look at price as a target, each row with the unique value of geolocation would be replaced with the average price for the house
import category_encoders as ce

encoder = ce.TargetEncoder(cols=[7], smoothing=0, return_df=True)

df_train['coded_geo'] = encoder.fit_transform(df_train.iloc[:,7], y_train)
df_test['coded_geo'] = encoder.transform(df_test.iloc[:,7])

  elif pd.api.types.is_categorical(cols):


In [142]:
df_train.drop(df_train_x.columns[7], axis=1, inplace= True)
df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,Brooklyn,Private room,1,71,5.37,2,51,dr5rt
1,Queens,Shared room,1,11,0.51,2,125,dr5x9
2,Brooklyn,Private room,2,24,0.98,1,1,dr5rt
3,Manhattan,Entire home/apt,30,1,0.03,2,244,dr72m
4,Queens,Entire home/apt,1,56,2.73,1,126,dr5rv


In [143]:
df_test.drop(df_test.columns[7], axis=1, inplace= True)
df_test.head()

Unnamed: 0,0,1,2,3,4,5,6,coded_geo
0,Brooklyn,Entire home/apt,2,31,0.54,2,12,121.544755
1,Brooklyn,Entire home/apt,3,10,0.65,5,156,111.745178
2,Brooklyn,Entire home/apt,2,147,1.89,1,27,111.745178
3,Brooklyn,Entire home/apt,3,92,1.45,1,248,111.745178
4,Brooklyn,Entire home/apt,3,64,1.38,1,296,111.745178


In [144]:
# concatenate train and test dataframes again for normalization or stanadardization
df_train['price'] = y_train
df_test['price'] = y_test
df_whole = pd.concat([df_train, df_test])

### Normalization and Standardization

Normalization is good to use when you know that the distribution of your data does not follow a Gaussian distribution. This can be useful in algorithms that do not assume any distribution of the data like K-Nearest Neighbors and Neural Networks.

Standardization, on the other hand, can be helpful in cases where the data follows a Gaussian distribution. However, this does not have to be necessarily true. Also, unlike normalization, standardization does not have a bounding range. So, even if you have outliers in your data, they will not be affected by standardization.

In [None]:
# apply standarization or normalization on continuous values based on the data distribution

In [None]:
# after nromalization and standardization, split the data into train and test with the same proportion as before 

In [None]:
# correlation plot to decide variables 