### Import Modules & Set-Up

In [84]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
# from sklearn.utils import shuffle

%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [85]:
df = pd.read_csv('AB_NYC_2019.txt')
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [86]:
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

### Features

For the rest of the homework, you'll need to use the features from the previous homework plus two additional ones, 'neighbourhood_group' and 'room_type'. The whole feature set is therefore as follows:

* 'neighbourhood_group',
* 'room_type',
* 'latitude',
* 'longitude',
* 'price',
* 'minimum_nights',
* 'number_of_reviews',
* 'reviews_per_month',
* 'calculated_host_listings_count',
* 'availability_365'

Select only them and fill in the missing values with 0.

In [87]:
# Columns kept for the analysis: the model features plus the 'price' target.
cols = [
    'neighbourhood_group',
    'room_type',
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

# Restrict the frame to those columns, then replace every missing value with 0.
df = df[cols].copy()
df = df.fillna(0)
df.head()
# Separate copy of the cleaned frame, taken before any splitting.
df_orig = df.copy()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,Private room,40.64749,-73.97237,149,1,9,0.21,6,365
1,Manhattan,Entire home/apt,40.75362,-73.98377,225,1,45,0.38,2,355
2,Manhattan,Private room,40.80902,-73.9419,150,3,0,0.0,1,365
3,Brooklyn,Entire home/apt,40.68514,-73.95976,89,1,270,4.64,1,194
4,Manhattan,Entire home/apt,40.79851,-73.94399,80,10,9,0.1,1,0


In [88]:
# Sanity check after fillna(0): show any row that still holds a missing value.
# Both displays should come back empty.
df.loc[df.isnull().any(axis='columns')]
df.loc[df.isna().any(axis='columns')]

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365


Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365


### Question 1

What is the most frequent observation (mode) for the column 'neighbourhood_group'?

In [89]:
# Most frequent neighbourhood group.
df['neighbourhood_group'].mode()
# Cross-check against the full frequency table (most frequent first).
df['neighbourhood_group'].value_counts()

0    Manhattan
dtype: object

Manhattan        21661
Brooklyn         20104
Queens            5666
Bronx             1091
Staten Island      373
Name: neighbourhood_group, dtype: int64

Manhattan is the mode value for neighbourhood group, which is to be expected.

### Split the data

* Split your data in train/val/test sets, with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
* Make sure that the target value ('price') is not in your dataframe.


In [90]:
# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# (i.e. 20% of the whole) as validation. Same seed for both calls.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

# Keep the raw price Series of every split for later (the Ridge section reads them).
train_price = df_train.price
val_price = df_val.price
test_price = df_test.price

# Raw target arrays.
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

# Snapshot of the training rows that still includes 'price'.
df_train_orig = df_train.copy()

# The target must not stay among the features of ANY split, test included.
del df_train['price']
del df_val['price']
del df_test['price']

# Checking size
len(df_train), len(df_val), len(df_test)
len(df_train) + len(df_val) + len(df_test) == len(df)

(29337, 9779, 9779)

True

### Question 2

* Create the correlation matrix for the numerical features of your train dataset.
* In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
* What are the two features that have the biggest correlation in this dataset?

In [91]:
# df_train still contains the two string-valued categorical columns. Select the
# numeric columns explicitly so .corr() does not depend on pandas silently
# dropping non-numeric data (newer pandas raises instead).
corr = df_train.select_dtypes(include='number').corr()
corr

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.080301,0.027441,-0.006246,-0.007159,0.019375,-0.005891
longitude,0.080301,1.0,-0.06066,0.055084,0.134642,-0.117041,0.083666
minimum_nights,0.027441,-0.06066,1.0,-0.07602,-0.120703,0.118647,0.138901
number_of_reviews,-0.006246,0.055084,-0.07602,1.0,0.590374,-0.073167,0.174477
reviews_per_month,-0.007159,0.134642,-0.120703,0.590374,1.0,-0.048767,0.165376
calculated_host_listings_count,0.019375,-0.117041,0.118647,-0.073167,-0.048767,1.0,0.225913
availability_365,-0.005891,0.083666,0.138901,0.174477,0.165376,0.225913,1.0


In [92]:
# Diverging palette used to colour the correlation table.
cmap = sns.diverging_palette(5, 250, as_cmap=True)

def magnify():
    """Build the CSS table styles for the correlation display.

    Returns a list of ``{"selector": ..., "props": [...]}`` dicts: a small
    header font, zero cell padding, and enlarged text for hovered headers
    and hovered cells.
    """
    header = {"selector": "th", "props": [("font-size", "7pt")]}
    cell = {"selector": "td", "props": [('padding', "0em 0em")]}
    header_hover = {"selector": "th:hover", "props": [("font-size", "12pt")]}
    cell_hover = {
        "selector": "tr:hover td:hover",
        "props": [('max-width', '200px'), ('font-size', '12pt')],
    }
    return [header, cell, header_hover, cell_hover]

# Render the correlation matrix as a colour-graded, hover-magnified table.
# Styler.set_precision() is deprecated (and removed in later pandas releases);
# Styler.format(precision=...) is the supported way to set display precision.
corr.style.background_gradient(cmap, axis=1)\
    .set_properties(**{'max-width': '80px', 'font-size': '10pt'})\
    .set_caption("Hover to magnify")\
    .format(precision=2)\
    .set_table_styles(magnify())

  corr.style.background_gradient(cmap, axis=1)\


Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.08,0.03,-0.01,-0.01,0.02,-0.01
longitude,0.08,1.0,-0.06,0.06,0.13,-0.12,0.08
minimum_nights,0.03,-0.06,1.0,-0.08,-0.12,0.12,0.14
number_of_reviews,-0.01,0.06,-0.08,1.0,0.59,-0.07,0.17
reviews_per_month,-0.01,0.13,-0.12,0.59,1.0,-0.05,0.17
calculated_host_listings_count,0.02,-0.12,0.12,-0.07,-0.05,1.0,0.23
availability_365,-0.01,0.08,0.14,0.17,0.17,0.23,1.0


Therefore `reviews_per_month` and `number_of_reviews` are the most highly correlated pair of features (correlation ≈ 0.59).

### Make price binary
We need to turn the price variable from numeric into binary.
Let's create a variable above_average which is 1 if the price is above (or equal to) 152, and 0 otherwise.

In [93]:
y_train.mean()
np.median(y_train)
df.price.mean()

# Binary target: 1 where the training price is at or above the 152 cutoff given in
# the task, 0 otherwise. The comparison runs element-wise on the whole array.
above_average = (y_train >= 152).astype(int)
# Checking:

list(zip(above_average[:20], y_train[:20]))
df_train_orig.head(20)

154.12012134846782

106.0

152.7206871868289

[(0, 99),
 (0, 57),
 (0, 70),
 (0, 130),
 (0, 110),
 (0, 140),
 (0, 63),
 (0, 85),
 (0, 85),
 (0, 40),
 (0, 69),
 (0, 105),
 (0, 109),
 (0, 49),
 (1, 160),
 (1, 200),
 (0, 95),
 (0, 59),
 (1, 210),
 (0, 80)]

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
13575,Brooklyn,Entire home/apt,40.7276,-73.94495,99,3,29,0.7,13,50
48476,Manhattan,Private room,40.70847,-74.00498,57,1,0,0.0,1,7
44499,Bronx,Entire home/apt,40.83149,-73.92766,70,40,0,0.0,1,0
17382,Brooklyn,Entire home/apt,40.66448,-73.99407,130,2,3,0.08,1,0
14638,Manhattan,Private room,40.74118,-74.00012,110,1,48,1.8,2,67
42752,Manhattan,Entire home/apt,40.75183,-73.99633,140,30,0,0.0,1,167
35909,Brooklyn,Private room,40.69855,-73.91394,63,3,2,0.31,1,365
42678,Manhattan,Entire home/apt,40.82734,-73.94718,85,4,3,1.13,1,6
31337,Brooklyn,Private room,40.70473,-73.92069,85,3,33,2.47,1,7
44629,Queens,Private room,40.76176,-73.76511,40,31,0,0.0,1,69


### Question 3

* Calculate the mutual information score with the (binarized) price for the two categorical variables that we have. Use the training set only.
* Which of these two variables has bigger score?
* Round it to 2 decimal digits using round(score, 2)

In [94]:
def mutual_info_price_score(series):
    """Mutual information between ``series`` and the binarized price.

    The column is scored against the notebook-level ``above_average`` labels,
    which are built from the training prices, so ``series`` is expected to be
    a training-set column.
    """
    score = mutual_info_score(series, above_average)
    return score

In [95]:
categorical = [
'neighbourhood_group',
'room_type']

df_train[categorical].head()

Unnamed: 0,neighbourhood_group,room_type
13575,Brooklyn,Entire home/apt
48476,Manhattan,Private room
44499,Bronx,Entire home/apt
17382,Brooklyn,Entire home/apt
14638,Manhattan,Private room


In [96]:
# Score every categorical column against the binarized price (training set only).
mi = pd.Series({
    column: mutual_info_price_score(df_train[column])
    for column in categorical
})
mi
round(mi['room_type'], 2)

neighbourhood_group    0.046506
room_type              0.143226
dtype: float64

0.14

room_type has the greater mutual information score of 0.14

### Question 4

* Now let's train a logistic regression
* Remember that we have two categorical variables in the data. Include them using one-hot encoding.
* Fit the model on the training dataset.
    * To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    * model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.


#### Training the Model

In [97]:
# Last check of training data set:
df_train.head(15)

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
13575,Brooklyn,Entire home/apt,40.7276,-73.94495,3,29,0.7,13,50
48476,Manhattan,Private room,40.70847,-74.00498,1,0,0.0,1,7
44499,Bronx,Entire home/apt,40.83149,-73.92766,40,0,0.0,1,0
17382,Brooklyn,Entire home/apt,40.66448,-73.99407,2,3,0.08,1,0
14638,Manhattan,Private room,40.74118,-74.00012,1,48,1.8,2,67
42752,Manhattan,Entire home/apt,40.75183,-73.99633,30,0,0.0,1,167
35909,Brooklyn,Private room,40.69855,-73.91394,3,2,0.31,1,365
42678,Manhattan,Entire home/apt,40.82734,-73.94718,4,3,1.13,1,6
31337,Brooklyn,Private room,40.70473,-73.92069,3,33,2.47,1,7
44629,Queens,Private room,40.76176,-73.76511,31,0,0.0,1,69


In [98]:
# One dict per training row; DictVectorizer one-hot encodes the string-valued
# entries and passes numeric ones through.
train_dict = df_train.to_dict(orient='records')
train_dict[0]
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installations.
feature_names = [
    str(name)
    for name in (
        dv.get_feature_names_out()
        if hasattr(dv, 'get_feature_names_out')
        else dv.get_feature_names()
    )
]
feature_names

{'neighbourhood_group': 'Brooklyn',
 'room_type': 'Entire home/apt',
 'latitude': 40.7276,
 'longitude': -73.94495,
 'minimum_nights': 3,
 'number_of_reviews': 29,
 'reviews_per_month': 0.7,
 'calculated_host_listings_count': 13,
 'availability_365': 50}

In [100]:
X_train = dv.transform(train_dict)
X_train.shape



(29337, 15)

In [101]:
len(X_train)
len(above_average)

29337

29337

In [102]:
# NOTE(review): the task text above asks for solver='lbfgs', but this cell fits with
# solver='liblinear'; every figure reported below comes from the liblinear fit.
# Confirm which solver is intended before comparing against reference answers.
model = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
# The target is the binarized training price (above_average), not the raw price.
model.fit(X_train, above_average)
model.intercept_
model.coef_[0].round(3)

LogisticRegression(random_state=42, solver='liblinear')

array([-0.09158457])

array([ 3.000e-03,  4.000e-03, -5.818e+00, -3.166e+00, -1.100e-02,
       -8.200e-02,  1.250e-01,  1.576e+00, -2.900e-02, -1.681e+00,
       -3.000e-03, -4.200e-02,  1.957e+00, -8.200e-01, -1.228e+00])

In [103]:
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)


In [104]:
model.predict(X_train)
# Soft predictions
model.predict_proba(X_train)
model.predict_proba(X_val)

array([0, 0, 0, ..., 0, 0, 1])

array([[0.68113862, 0.31886138],
       [0.85954909, 0.14045091],
       [0.89255644, 0.10744356],
       ...,
       [0.90364463, 0.09635537],
       [0.98220344, 0.01779656],
       [0.39466238, 0.60533762]])

array([[0.97121242, 0.02878758],
       [0.40418132, 0.59581868],
       [0.5737665 , 0.4262335 ],
       ...,
       [0.88581296, 0.11418704],
       [0.96542264, 0.03457736],
       [0.47158458, 0.52841542]])

In [105]:
# Hard prediction
y_pred = model.predict_proba(X_val)[:, 1]
above_avg_decision = (y_pred >= 0.5)
above_avg_decision
len(above_avg_decision)

array([False,  True, False, ..., False, False,  True])

9779

#### Determining Accuracy

In [106]:
# Binarize the validation target from the saved raw prices (val_price) rather than
# from y_val itself: `y_val = f(y_val)` silently turns every label into 0 when the
# cell is executed a second time, because the 0/1 labels are all below 152.
y_val = np.where(val_price.values >= 152, 1, 0)
y_val
above_avg_decision.astype(int)
# Share of above-average listings in the training set (class balance).
above_average.mean()
# Accuracy = fraction of validation rows where the hard prediction matches the label.
model_accuracy = (y_val == above_avg_decision).mean()
model_accuracy.round(2)

array([0, 0, 1, ..., 0, 0, 0])

array([0, 1, 0, ..., 0, 0, 1])

0.3058254081876129

0.79

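Therefore the model has ~79% accuracy. Note that only ~31% of the training listings are above average, so a baseline that always predicts "not above average" would already be right about 69% of the time; the model improves on that baseline by roughly 10 percentage points.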

### Question 5

* We have 9 features: 7 numerical features and 2 categorical.
* Let's find the least useful one using the feature elimination technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
* Which of the following features has the smallest difference?
    * neighbourhood_group
    * room_type
    * number_of_reviews
    * reviews_per_month
* note: the difference doesn't have to be positive

#### Display coefficients

In [107]:
# Pair each encoded feature name with its fitted coefficient.
# get_feature_names() was removed in scikit-learn 1.2; use the *_out variant when present.
coef_names = [
    str(name)
    for name in (
        dv.get_feature_names_out()
        if hasattr(dv, 'get_feature_names_out')
        else dv.get_feature_names()
    )
]
dict(zip(coef_names, model.coef_[0].round(3)))

{'availability_365': 0.003,
 'calculated_host_listings_count': 0.004,
 'latitude': -5.818,
 'longitude': -3.166,
 'minimum_nights': -0.011,
 'neighbourhood_group=Bronx': -0.082,
 'neighbourhood_group=Brooklyn': 0.125,
 'neighbourhood_group=Manhattan': 1.576,
 'neighbourhood_group=Queens': -0.029,
 'neighbourhood_group=Staten Island': -1.681,
 'number_of_reviews': -0.003,
 'reviews_per_month': -0.042,
 'room_type=Entire home/apt': 1.957,
 'room_type=Private room': -0.82,
 'room_type=Shared room': -1.228}

#### Feature Elimination

In [108]:
# First we encapsulate model training in a function
def train_log_regression(train, val, y_train=None):
    """Fit a one-hot-encoded logistic regression and classify the validation rows.

    Parameters
    ----------
    train : DataFrame
        Training features (no target column).
    val : DataFrame
        Validation features. It may carry more columns than ``train``; per the
        scikit-learn docs, DictVectorizer ignores features it did not see in fit.
    y_train : array-like, optional
        Binary training target aligned row-for-row with ``train``. When omitted,
        the notebook-level ``above_average`` array is used, as before.

    Returns
    -------
    numpy.ndarray of bool
        True where the predicted probability of the positive class is >= 0.5.
    """
    if y_train is None:
        # Backward-compatible default: the binarized training price defined earlier.
        y_train = above_average

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train.to_dict(orient='records'))

    model = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
    model.fit(X_train, y_train)

    X_val = dv.transform(val.to_dict(orient='records'))
    # Column 1 of predict_proba is the probability of the positive class.
    y_pred = model.predict_proba(X_val)[:, 1]
    return y_pred >= 0.5


In [109]:
# Then a function for accuracy
def check_accuracy(val, decision):
    """Return the fraction of positions where ``val`` and ``decision`` agree.

    The two inputs are compared element-wise with ``==`` and the mean of the
    resulting boolean array is returned.
    """
    hits = (val == decision)
    return hits.mean()

In [110]:
# Create results dict and initialise with all features
# Results dict, seeded with the validation accuracy of the model that uses every feature.
accuracy_dict = {
    'all': check_accuracy(y_val, train_log_regression(df_train, df_val)),
}
accuracy_dict

{'all': 0.7907761529808774}

In [111]:
# Check features:
# Feature names = the selected columns without the 'price' target.
features = [column for column in cols if column != 'price']
features


['neighbourhood_group',
 'room_type',
 'latitude',
 'longitude',
 'minimum_nights',
 'number_of_reviews',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365']

In [112]:
# Leave-one-feature-out: retrain without each feature in turn and record the
# validation accuracy under '<feature>_removed'. df_val is passed with all of its
# columns; only df_train is narrowed.
for f in features:
    sel_cols = [name for name in features if name != f]
    accuracy_dict[f"{f}_removed"] = check_accuracy(
        y_val,
        train_log_regression(df_train[sel_cols], df_val),
    )

accuracy_dict

{'all': 0.7907761529808774,
 'neighbourhood_group_removed': 0.7498721750690255,
 'room_type_removed': 0.7299314858369976,
 'latitude_removed': 0.7868902750792515,
 'longitude_removed': 0.7869925350240311,
 'minimum_nights_removed': 0.7914919725943348,
 'number_of_reviews_removed': 0.7914919725943348,
 'reviews_per_month_removed': 0.7907761529808774,
 'calculated_host_listings_count_removed': 0.7896512935883014,
 'availability_365_removed': 0.7813682380611514}

In [113]:
# Compare with original accuracy
# Absolute gap between each recorded accuracy and the full model's accuracy
# (the sign of the difference is discarded here).
diffs_dict = dict(
    (label, abs(accuracy - model_accuracy))
    for label, accuracy in accuracy_dict.items()
)
diffs_dict
    

{'all': 0.0,
 'neighbourhood_group_removed': 0.04090397791185196,
 'room_type_removed': 0.060844667143879816,
 'latitude_removed': 0.003885877901625978,
 'longitude_removed': 0.0037836179568463413,
 'minimum_nights_removed': 0.0007158196134573469,
 'number_of_reviews_removed': 0.0007158196134573469,
 'reviews_per_month_removed': 0.0,
 'calculated_host_listings_count_removed': 0.0011248593925760053,
 'availability_365_removed': 0.009407914919726035}

In [114]:
# Drop the full-model entry so it cannot win the comparison; pop() displays the
# removed value (or None when the entry is already gone).
diffs_dict.pop('all', None)
# Smallest and largest accuracy gap as (label, gap) pairs.
min(diffs_dict.items(), key=lambda entry: entry[1])
max(diffs_dict.items(), key=lambda entry: entry[1])


0.0

('reviews_per_month_removed', 0.0)

('room_type_removed', 0.060844667143879816)

The feature which had the smallest difference is reviews_per_month (it actually made no difference to the accuracy when removed).

### Question 6

* For this question, we'll see how to use a linear regression model from Scikit-Learn
* We'll need to use the original column 'price'. Apply the logarithmic transformation to this column.
* Fit the Ridge regression model on the training data.
* This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]
* Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.

If there are multiple options, select the smallest alpha.

In [115]:
from sklearn.linear_model import Ridge

In [116]:
# Log-transform the raw prices of each split. log1p(x) = log(1 + x), so a price
# of 0 would map to 0 rather than -inf.
y_train_log = np.log1p(train_price.to_numpy())
y_val_log = np.log1p(val_price.to_numpy())
y_test_log = np.log1p(test_price.to_numpy())

#### Train Model

In [117]:
# Checking with alpha of 0.1
model = Ridge(alpha=0.1)
model.fit(X_train, y_train_log)
model
y_ridge_pred = model.predict(X_val)
y_ridge_pred

Ridge(alpha=0.1)

Ridge(alpha=0.1)

array([4.1142478 , 5.21378968, 5.01017209, ..., 4.55940994, 4.19400898,
       5.14469281])

In [118]:
def rmse(y, y_pred):
    """Root-mean-squared error between targets ``y`` and predictions ``y_pred``."""
    residuals = y - y_pred
    return np.sqrt((residuals ** 2).mean())

In [119]:
# Fit one Ridge model per candidate alpha on the log-price target and record the
# validation RMSE, rounded to 3 decimals, keyed by alpha.
# NOTE(review): alpha=0 disables regularisation; scikit-learn's documentation advises
# against alpha=0 with Ridge for numerical reasons, so treat that entry with care.
ridge_rmses = {}
alpha_list = [0, 0.01, 0.1, 1, 10]
for a in alpha_list:
    model = Ridge(alpha=a)
    model.fit(X_train, y_train_log)
    y_ridge_pred = model.predict(X_val)
    rmse_score = rmse(y_val_log, y_ridge_pred).round(3)
    ridge_rmses[a] = rmse_score
ridge_rmses

Ridge(alpha=0)

Ridge(alpha=0.01)

Ridge(alpha=0.1)

Ridge(alpha=1)

Ridge(alpha=10)

{0: 0.497, 0.01: 0.497, 0.1: 0.497, 1: 0.497, 10: 0.498}

Therefore, to 3 d.p., the best validation RMSE is 0.497, which is shared by alpha = 0, 0.01, 0.1 and 1; choosing the smallest of these gives alpha = 0.