In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from IPython.display import display
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [3]:
# New York City Airbnb Open Data
data = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv")

In [4]:
features = ['neighbourhood_group',
'room_type',
'latitude',
'longitude',
'price',
'minimum_nights',
'number_of_reviews',
'reviews_per_month',
'calculated_host_listings_count',
'availability_365']

In [5]:
df_full = data[features].fillna(0)

In [6]:
df_full.head().T

Unnamed: 0,0,1,2,3,4
neighbourhood_group,Brooklyn,Manhattan,Manhattan,Brooklyn,Manhattan
room_type,Private room,Entire home/apt,Private room,Entire home/apt,Entire home/apt
latitude,40.64749,40.75362,40.80902,40.68514,40.79851
longitude,-73.97237,-73.98377,-73.9419,-73.95976,-73.94399
price,149,225,150,89,80
minimum_nights,1,1,3,1,10
number_of_reviews,9,45,0,270,9
reviews_per_month,0.21,0.38,0.0,4.64,0.1
calculated_host_listings_count,6,2,1,1,1
availability_365,365,355,365,194,0


In [7]:
# Question 1
df_full['neighbourhood_group'].mode()

0    Manhattan
dtype: object

In [8]:
df_full.groupby(['neighbourhood_group']).count()

Unnamed: 0_level_0,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
neighbourhood_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bronx,1091,1091,1091,1091,1091,1091,1091,1091,1091
Brooklyn,20104,20104,20104,20104,20104,20104,20104,20104,20104
Manhattan,21661,21661,21661,21661,21661,21661,21661,21661,21661
Queens,5666,5666,5666,5666,5666,5666,5666,5666,5666
Staten Island,373,373,373,373,373,373,373,373,373


In [9]:
# Data prep
df_full_train, df_test = train_test_split(df_full, test_size=0.2, random_state = 42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [10]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [11]:
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

del df_train['price']
del df_val['price']
del df_test['price']

In [32]:
# Question 2
numerical = ['latitude',
'longitude',
'minimum_nights',
'number_of_reviews',
'reviews_per_month',
'calculated_host_listings_count',
'availability_365']
df_corr = pd.DataFrame()
df_train_corr = df_train[numerical]
for col in df_train_corr.columns:
    df_corr[col] = df_train_corr.corrwith(df_train_corr[col])
df_corr

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.080301,0.027441,-0.006246,-0.007159,0.019375,-0.005891
longitude,0.080301,1.0,-0.06066,0.055084,0.134642,-0.117041,0.083666
minimum_nights,0.027441,-0.06066,1.0,-0.07602,-0.120703,0.118647,0.138901
number_of_reviews,-0.006246,0.055084,-0.07602,1.0,0.590374,-0.073167,0.174477
reviews_per_month,-0.007159,0.134642,-0.120703,0.590374,1.0,-0.048767,0.165376
calculated_host_listings_count,0.019375,-0.117041,0.118647,-0.073167,-0.048767,1.0,0.225913
availability_365,-0.005891,0.083666,0.138901,0.174477,0.165376,0.225913,1.0


In [13]:
# Question 3
avg_price = 152
above_avg_train = (y_train >= avg_price).astype(int)
above_avg_val = (y_val >= avg_price).astype(int)
above_avg_test = (y_test >= avg_price).astype(int)

In [14]:
categorical = ['neighbourhood_group',
'room_type']
score_neighbour = mutual_info_score(above_avg_train, df_train[categorical[0]])
score_roomtype = mutual_info_score(above_avg_train, df_train[categorical[1]])
print(round(score_neighbour, 2))
print(round(score_roomtype, 2))

0.05
0.14


In [53]:
# Question 4
dv = DictVectorizer(sparse=False)
train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [54]:
model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42, max_iter = 5000)

In [55]:
model.fit(X_train, above_avg_train)

LogisticRegression(max_iter=5000, random_state=42)

In [56]:
y_pred = model.predict_proba(X_val)[:, 1]

In [57]:
avg_price_pred = (y_pred >= 0.5)

In [61]:
score_original = (above_avg_val == avg_price_pred).mean()

In [63]:
# Question 5
columns = numerical + categorical

for i in range(0, len(columns)):
    c = columns.copy()
    removed_col = c.pop(i)
    
    # Train model without i-th element
    dv = DictVectorizer(sparse=False)
    train_dict = df_train[c].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    val_dict = df_val[c].to_dict(orient='records')
    X_val = dv.transform(val_dict)
    
    model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42, max_iter = 5000)
    model.fit(X_train, above_avg_train)
    
    y_pred = model.predict_proba(X_val)[:, 1]
    avg_price_pred = (y_pred >= 0.5)
    
    score = (above_avg_val == avg_price_pred).mean()
    score_diff = round(score_original - score, 6)
    print("Removed: {}".format(removed_col))
    print("Score: {}".format(score_diff))

Removed: latitude
Score: -0.058084
Removed: longitude
Score: -0.058288
Removed: minimum_nights
Score: -0.062174
Removed: number_of_reviews
Score: -0.062788
Removed: reviews_per_month
Score: -0.062072
Removed: calculated_host_listings_count
Score: -0.060947
Removed: availability_365
Score: -0.052562
Removed: neighbourhood_group
Score: -0.022293
Removed: room_type
Score: 0.0


In [70]:
# Question 6

# Data prep (copy from earlier)
df_full_train, df_test = train_test_split(df_full, test_size=0.2, random_state = 42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

del df_train['price']
del df_val['price']
del df_test['price']

y_train = np.log1p(y_train)
y_val = np.log1p(y_val)
y_test = np.log1p(y_test)

In [78]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [72]:
dv = DictVectorizer(sparse=False)
train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [85]:
alpha_list = [0, 0.01, 0.1, 1, 10]
for a in alpha_list:
    clf = Ridge(alpha=a)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    rmse = mean_squared_error(y_pred, y_val)
    print("Alpha: {} \t RMSE: {}".format(a, round(rmse, 4)))

Alpha: 0 	 RMSE: 0.2472
Alpha: 0.01 	 RMSE: 0.2471
Alpha: 0.1 	 RMSE: 0.2471
Alpha: 1 	 RMSE: 0.2471
Alpha: 10 	 RMSE: 0.2479
