In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression, Ridge, ridge_regression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import train_test_split as tts

%matplotlib inline

In [2]:
# Load the raw CSV; the path is relative, so the file must be in the working directory.
data = pd.read_csv("AB_NYC_2019.csv")

In [3]:
# Work on an independent copy holding only the columns used in this notebook
# (two categorical features, seven numeric features and the price).
new_data = data.loc[:, [
    'neighbourhood_group',
    'room_type',
    'latitude',
    'longitude',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
    'price',
]].copy()

In [4]:
# Preview the first rows of the selected columns.
new_data.head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,price
0,Brooklyn,Private room,40.64749,-73.97237,1,9,0.21,6,365,149
1,Manhattan,Entire home/apt,40.75362,-73.98377,1,45,0.38,2,355,225
2,Manhattan,Private room,40.80902,-73.9419,3,0,,1,365,150
3,Brooklyn,Entire home/apt,40.68514,-73.95976,1,270,4.64,1,194,89
4,Manhattan,Entire home/apt,40.79851,-73.94399,10,9,0.1,1,0,80


# Question 1

In [5]:
# Most frequent neighbourhood_group value; mode() returns a Series, so the answer is at position 0.
new_data['neighbourhood_group'].mode()

0    Manhattan
dtype: object

The most common neighbourhood group is Manhattan.

# Question 2

In [6]:
# Correlate the numeric columns only. The frame also holds string columns
# (neighbourhood_group, room_type), and DataFrame.corr() raises on those in
# pandas >= 2.0 instead of silently skipping them.
new_data.select_dtypes(include="number").corr()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,price
latitude,1.0,0.084788,0.024869,-0.015389,-0.010142,0.019517,-0.010983,0.033939
longitude,0.084788,1.0,-0.062747,0.059094,0.145948,-0.114713,0.082731,-0.150019
minimum_nights,0.024869,-0.062747,1.0,-0.080116,-0.121702,0.12796,0.144303,0.042799
number_of_reviews,-0.015389,0.059094,-0.080116,1.0,0.549868,-0.072376,0.172028,-0.047954
reviews_per_month,-0.010142,0.145948,-0.121702,0.549868,1.0,-0.009421,0.185791,-0.030608
calculated_host_listings_count,0.019517,-0.114713,0.12796,-0.072376,-0.009421,1.0,0.225701,0.057472
availability_365,-0.010983,0.082731,0.144303,0.172028,0.185791,0.225701,1.0,0.081829
price,0.033939,-0.150019,0.042799,-0.047954,-0.030608,0.057472,0.081829,1.0


In [7]:
# Absolute pairwise correlations of the numeric columns only; the string
# columns would make DataFrame.corr() raise on pandas >= 2.0.
c = new_data.select_dtypes(include="number").corr().abs()

# Flatten the matrix into (row, column) -> value pairs and rank them from
# strongest to weakest. The diagonal self-correlations (1.0) are still
# included and therefore come first.
s = c.unstack()
so = s.sort_values(kind="quicksort", ascending = False)

In [8]:
# Skip the first 8 entries: they are the 1.0 self-correlations on the
# diagonal, one for each of the 8 numeric columns.
so.iloc[8:]

number_of_reviews               reviews_per_month                 0.549868
reviews_per_month               number_of_reviews                 0.549868
availability_365                calculated_host_listings_count    0.225701
calculated_host_listings_count  availability_365                  0.225701
availability_365                reviews_per_month                 0.185791
reviews_per_month               availability_365                  0.185791
number_of_reviews               availability_365                  0.172028
availability_365                number_of_reviews                 0.172028
longitude                       price                             0.150019
price                           longitude                         0.150019
reviews_per_month               longitude                         0.145948
longitude                       reviews_per_month                 0.145948
minimum_nights                  availability_365                  0.144303
availability_365         

The most correlated pair is `number_of_reviews` and `reviews_per_month`, with a correlation of about 0.55.

Nothing technically qualifies as a strong correlation, as no pair has an absolute correlation above 0.6.

In [9]:
# Inspect the raw price column that the binary target is derived from.
new_data['price']

0        149
1        225
2        150
3         89
4         80
        ... 
48890     70
48891     40
48892    115
48893     55
48894     90
Name: price, Length: 48895, dtype: int64

In [10]:
# Binary target: 1 when the price is at or above the mean price, else 0.
# The comparison is vectorised so the mean is computed once, not once per row
# as it would be inside a per-element lambda.
new_data['above_average'] = (new_data['price'] >= new_data['price'].mean()).astype('int64')

A new column, `above_average`, holds a binary outcome for price: 1 if the price is at or above the mean price, 0 otherwise.

# Question 3

In [11]:
# Check the column dtypes to tell the categorical (object) features from the numeric ones.
new_data.dtypes

neighbourhood_group                object
room_type                          object
latitude                          float64
longitude                         float64
minimum_nights                      int64
number_of_reviews                   int64
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
price                               int64
above_average                       int64
dtype: object

In [12]:
# Features are everything except the raw price and the target derived from
# it; missing values are replaced with 0.
X1 = new_data.drop(columns=["price", "above_average"]).fillna(0)
y1 = new_data["above_average"].copy()

# Keep 60% of the rows for training, then cut the remaining 40% in half so
# the test and validation sets each hold 20%.
X_train1, X_test1, y_train1, y_test1 = tts(X1, y1, test_size=0.4, random_state=42)
X_test1, X_val1, y_test1, y_val1 = tts(X_test1, y_test1, test_size=0.5, random_state=42)

In [13]:
# Mutual information between neighbourhood_group and the binary target, on the training split only.
round(mutual_info_score(X_train1['neighbourhood_group'], y_train1), 2)

0.05

In [14]:
# Mutual information between room_type and the binary target, on the training split only.
round(mutual_info_score(X_train1['room_type'], y_train1), 2)

0.14

Room type has the higher mutual information score with the binarized price variable (0.14 vs. 0.05 for neighbourhood group).

# Question 4

In [15]:
# One-hot encode the categorical features after dropping the price columns
# and filling gaps with 0; drop_first removes one level per category.
X = pd.get_dummies(
    new_data.drop(columns=["price", "above_average"]).fillna(0),
    drop_first=True,
)
y = new_data["above_average"].copy()

# 60% train, then the remaining 40% is halved into test and validation.
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.4, random_state=42)
X_test, X_val, y_test, y_val = tts(X_test, y_test, test_size=0.5, random_state=42)

In [16]:
# Baseline classifier trained on the full one-hot encoded feature set.
model = LogisticRegression(
    solver='lbfgs',
    C=1.0,
    random_state=42,
    max_iter=10000,
)
model.fit(X_train, y_train)

LogisticRegression(max_iter=10000, random_state=42)

In [17]:
# Validation accuracy of the baseline model, rounded to 4 decimals; it is
# the reference value for the feature-elimination comparison.
accuracy_og = round(
    accuracy_score(y_true=y_val, y_pred=model.predict(X_val)),
    4,
)
accuracy_og

0.7992

# Question 5

In [18]:
# Show each training column next to its fitted coefficient (3 decimals).
for feature_name, weight in zip(X_train.columns, model.coef_[0].round(3)):
    print((feature_name, weight))

('latitude', -6.265)
('longitude', -3.438)
('minimum_nights', -0.012)
('number_of_reviews', -0.004)
('reviews_per_month', -0.027)
('calculated_host_listings_count', 0.004)
('availability_365', 0.003)
('neighbourhood_group_Brooklyn', 0.166)
('neighbourhood_group_Manhattan', 1.653)
('neighbourhood_group_Queens', 0.042)
('neighbourhood_group_Staten Island', -1.507)
('room_type_Private room', -2.762)
('room_type_Shared room', -2.992)


In [19]:
# Rebuild the one-hot encoded feature matrix directly from new_data.
# get_dummies must encode X2 itself; encoding the X left over from an
# earlier cell would make the two lines above it dead code and tie this
# cell to hidden notebook state.
X2 = new_data.drop(["price", "above_average"], axis = 1).copy()
X2.fillna(0, inplace = True)
X2 = pd.get_dummies(X2, drop_first = True)
y2 = new_data["above_average"].copy()

In [20]:
# Validation accuracy obtained when a single column is left out, keyed by
# the name of the omitted column.
col_accuracy = {}

# The row split depends only on the number of rows and the seed, so it is
# the same for every reduced feature set: split once, then drop the column
# from the already-split frames instead of re-splitting on each iteration.
X_train3, X_test3, y_train3, y_test3 = tts(X2, y2, test_size = 0.4, random_state = 42)
X_test3, X_val3, y_test3, y_val3 = tts(X_test3, y_test3, test_size = 0.5, random_state = 42)

# NOTE: X2 is already one-hot encoded, so each dummy column is removed on
# its own; a categorical feature is never dropped as a whole here.
for col in X2.columns:
    model3 = LogisticRegression(solver = 'lbfgs', C = 1.0, random_state = 42, max_iter = 10000)
    model3.fit(X_train3.drop(columns = col), y_train3)
    col_accuracy[col] = round(accuracy_score(y_val3, model3.predict(X_val3.drop(columns = col))), 4)

In [21]:
# Validation accuracy with each column left out in turn.
col_accuracy

{'latitude': 0.7943,
 'longitude': 0.794,
 'minimum_nights': 0.8007,
 'number_of_reviews': 0.8002,
 'reviews_per_month': 0.7994,
 'calculated_host_listings_count': 0.7989,
 'availability_365': 0.7894,
 'neighbourhood_group_Brooklyn': 0.799,
 'neighbourhood_group_Manhattan': 0.8006,
 'neighbourhood_group_Queens': 0.7995,
 'neighbourhood_group_Staten Island': 0.7992,
 'room_type_Private room': 0.7361,
 'room_type_Shared room': 0.793}

In [22]:
# Difference between the baseline accuracy and the accuracy without each
# column; a positive value means the model got worse when it was removed.
for key, reduced_accuracy in col_accuracy.items():
    print(f'{key}: {accuracy_og - reduced_accuracy}')

latitude: 0.0049000000000000155
longitude: 0.005199999999999982
minimum_nights: -0.0014999999999999458
number_of_reviews: -0.0010000000000000009
reviews_per_month: -0.00019999999999997797
calculated_host_listings_count: 0.00029999999999996696
availability_365: 0.009800000000000031
neighbourhood_group_Brooklyn: 0.00019999999999997797
neighbourhood_group_Manhattan: -0.0013999999999999568
neighbourhood_group_Queens: -0.00029999999999996696
neighbourhood_group_Staten Island: 0.0
room_type_Private room: 0.06310000000000004
room_type_Shared room: 0.006199999999999983


The smallest difference between the original accuracy and the accuracy after eliminating a feature occurs when the neighbourhood group variables are dropped (for example, 0.0 for `neighbourhood_group_Staten Island`).

# Question 6

In [23]:
def RMSE(y, y_pred):
    """Return the root mean squared error between targets and predictions.

    Both inputs are converted to float arrays first, so the comparison is
    positional. This avoids pandas aligning two Series by index label (which
    yields NaN for labels that do not match) and lets plain lists be used.

    Parameters
    ----------
    y : array-like
        True target values.
    y_pred : array-like
        Predicted values, in the same order as ``y``.

    Returns
    -------
    numpy.float64
        The RMSE rounded to 5 decimal places.
    """
    y_true = np.asarray(y, dtype=float)
    y_hat = np.asarray(y_pred, dtype=float)
    return round(np.mean((y_hat - y_true)**2)**0.5, 5)

In [24]:
# Regularization strengths (alpha) to compare for the ridge model.
alphas = [0, 0.01, 0.1, 1, 10]

In [25]:
# Fresh selection of the same columns for the regression task, with
# missing values replaced by 0 (fillna returns a new frame).
new_data2 = data.loc[:, [
    'neighbourhood_group',
    'room_type',
    'latitude',
    'longitude',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
    'price',
]].fillna(0)

In [26]:
# One-hot encoded features, and a log1p-transformed price as the
# regression target.
X5 = pd.get_dummies(new_data2.drop(columns=["price"]), drop_first=True)
y5 = np.log1p(new_data2["price"])

# 60% train, then the remaining 40% is halved into test and validation.
X_train5, X_test5, y_train5, y_test5 = tts(X5, y5, test_size=0.4, random_state=42)
X_test5, X_val5, y_test5, y_val5 = tts(X_test5, y_test5, test_size=0.5, random_state=42)

In [27]:
# Validation RMSE (on the log1p price scale) for each regularization strength.
alpha_comp = {}

for alph in alphas:
    # Use the Ridge estimator instead of ridge_regression(return_intercept=True).
    # That option is meant for sparse input and switches the solver to the
    # stochastic 'sag', which was run here without a seed on unscaled features.
    # Ridge fits the intercept itself and uses a deterministic solver on this
    # dense data.
    ridge_model = Ridge(alpha = alph)
    ridge_model.fit(X_train5, y_train5)
    y_pred_val5 = ridge_model.predict(X_val5)
    alpha_comp[alph] = RMSE(y_val5, y_pred_val5)

In [28]:
# Validation RMSE for each alpha.
alpha_comp

{0: 0.52799, 0.01: 0.52799, 0.1: 0.52799, 1: 0.52799, 10: 0.52803}

Everything aside from alpha = 10 seems to be equally good.