In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('../data/AB_NYC_2019.csv')
len(df)

48895

In [3]:
homework_cols = ['latitude','longitude','price','minimum_nights','number_of_reviews','reviews_per_month',
'calculated_host_listings_count','availability_365']
df = df[homework_cols]

In [4]:
df.shape

(48895, 8)

# Q1

Find a feature with missing values. How many missing values does it have?

In [5]:
df.isna().sum()

latitude                              0
longitude                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 8 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   latitude                        48895 non-null  float64
 1   longitude                       48895 non-null  float64
 2   price                           48895 non-null  int64  
 3   minimum_nights                  48895 non-null  int64  
 4   number_of_reviews               48895 non-null  int64  
 5   reviews_per_month               38843 non-null  float64
 6   calculated_host_listings_count  48895 non-null  int64  
 7   availability_365                48895 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 3.0 MB


# Q2

What's the median (50% percentile) for variable 'minimum_nights'?

In [7]:
df['minimum_nights'].median()

3.0

- Shuffle the initial dataset, use seed 42.
- Split your data in train/val/test sets, with 60%/20%/20% distribution.
- Make sure that the target value ('price') is not in your dataframe.
- Apply the log transformation to the price variable using the np.log1p() function.

In [8]:
np.random.seed(42)

n = len(df)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

In [9]:
y_train_orig = df_train.price.values
y_val_orig = df_val.price.values
y_test_orig = df_test.price.values

y_train = np.log1p(df_train.price.values)
y_val = np.log1p(df_val.price.values)
y_test = np.log1p(df_test.price.values)

del df_train['price']
del df_val['price']
del df_test['price']

# Q3

- We need to deal with missing values for the column from Q1.
- We have two options: fill it with 0 or with the mean of this variable.
- Try both options. For each, train a linear regression model without regularization using the code from the lesssons.
- For computing the mean, use the training only!
- Use the validation dataset to evaluate the models and compare the RMSE of each option.
- Round the RMSE scores to 2 decimal digits using round(score, 2)
- Which option gives better RMSE?

In [10]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29337 entries, 879 to 29114
Data columns (total 7 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   latitude                        29337 non-null  float64
 1   longitude                       29337 non-null  float64
 2   minimum_nights                  29337 non-null  int64  
 3   number_of_reviews               29337 non-null  int64  
 4   reviews_per_month               23339 non-null  float64
 5   calculated_host_listings_count  29337 non-null  int64  
 6   availability_365                29337 non-null  int64  
dtypes: float64(3), int64(4)
memory usage: 1.8 MB


In [11]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [12]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [13]:
reviews_per_month_mean = df_train['reviews_per_month'].mean()
reviews_per_month_mean

1.3685492094776983

In [14]:
df_train.head()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
879,40.64354,-73.97777,3,62,0.71,1,189
44383,40.70666,-73.90779,21,0,,1,73
15394,40.76116,-73.99016,2,17,0.43,1,0
43230,40.70763,-74.0105,2,5,1.88,327,272
16332,40.79658,-73.93287,2,30,0.8,1,30


In [15]:
df_train.fillna({'reviews_per_month':0}).head()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
879,40.64354,-73.97777,3,62,0.71,1,189
44383,40.70666,-73.90779,21,0,0.0,1,73
15394,40.76116,-73.99016,2,17,0.43,1,0
43230,40.70763,-74.0105,2,5,1.88,327,272
16332,40.79658,-73.93287,2,30,0.8,1,30


In [16]:
df_train.fillna({'reviews_per_month':reviews_per_month_mean}).head()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
879,40.64354,-73.97777,3,62,0.71,1,189
44383,40.70666,-73.90779,21,0,1.368549,1,73
15394,40.76116,-73.99016,2,17,0.43,1,0
43230,40.70763,-74.0105,2,5,1.88,327,272
16332,40.79658,-73.93287,2,30,0.8,1,30


In [17]:
X_train_mean = df_train.fillna({'reviews_per_month':reviews_per_month_mean})
X_train_zero = df_train.fillna({'reviews_per_month':0})
w_0_mean, w_mean = train_linear_regression(X_train_mean, y_train)
w_0_zero, w_zero = train_linear_regression(X_train_zero, y_train)

In [18]:
X_val_mean = df_val.fillna({'reviews_per_month':reviews_per_month_mean})
X_val_zero = df_val.fillna({'reviews_per_month':0})
y_pred_mean = w_0_mean + X_val_mean.dot(w_mean)
y_pred_zero = w_0_zero + X_val_mean.dot(w_zero)

In [19]:
rmse_mean = rmse(y_val, y_pred_mean)
rmse_zero = rmse(y_val, y_pred_zero)
print("rmse_mean:",round(rmse_mean,2))
print("rmse_zero:",round(rmse_zero,2))

rmse_mean: 0.64
rmse_zero: 0.64


In [20]:
print(y_pred_mean[:10].values)
print(y_pred_zero[:10].values)

[4.56404917 4.47586466 4.72706726 5.06651567 3.72361944 4.92676964
 5.06439302 4.56156357 5.12961035 4.71497667]
[4.57853271 4.48375564 4.73586366 5.06898417 3.74125331 4.91527899
 5.07459426 4.57846823 5.13605295 4.70475063]


Answer: similar

# Q4

- Now let's train a regularized linear regression.
- For this question, fill the NAs with 0.
- Try different values of r from this list: [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10].
- Use RMSE to evaluate the model on the validation dataset.
- Round the RMSE scores to 2 decimal digits.
- Which r gives the best RMSE?

In [21]:
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [22]:
for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w_0, w = train_linear_regression_reg(X_train_zero, y_train, r=r)

    y_pred = w_0 + X_val_zero.dot(w)
    print('val', round(rmse(y_val, y_pred),2))

val 0.64
val 0.64
val 0.64
val 0.64
val 0.66
val 0.68
val 0.68
val 0.68
val 0.68


Answer is 0

# Q5

- We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
- Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
- For each seed, do the train/validation/test split with 60%/20%/20% distribution.
- Fill the missing values with 0 and train a model without regularization.
- For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
- What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
- Round the result to 3 decimal digits (round(std, 3))

In [23]:
rmses = []
for seed in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    np.random.seed(seed)
    n = len(df)

    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - (n_val + n_test)

    idx = np.arange(n)
    np.random.shuffle(idx)

    df_shuffled = df.iloc[idx]

    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
    df_test = df_shuffled.iloc[n_train+n_val:].copy()
    y_train_orig = df_train.price.values
    y_val_orig = df_val.price.values
    y_test_orig = df_test.price.values

    y_train = np.log1p(df_train.price.values)
    y_val = np.log1p(df_val.price.values)
    y_test = np.log1p(df_test.price.values)

    del df_train['price']
    del df_val['price']
    del df_test['price']
    X_train_zero = df_train.fillna({'reviews_per_month':0})
    w_0_zero, w_zero = train_linear_regression(X_train_zero, y_train)
    X_val_zero = df_val.fillna({'reviews_per_month':0})
    y_pred_zero = w_0_zero + X_val_zero.dot(w_zero)
    rmse_zero = rmse(y_val, y_pred_zero)
    print("seed:", seed, "rmse:",rmse_zero)
    rmses.append(rmse_zero)
#print(rmses)
print("std:", round(np.std(rmses),3))

seed: 0 rmse: 0.654977996098256
seed: 1 rmse: 0.6462523685596203
seed: 2 rmse: 0.6476558176507924
seed: 3 rmse: 0.6375145070225107
seed: 4 rmse: 0.6445809083240496
seed: 5 rmse: 0.6305809996021178
seed: 6 rmse: 0.6297851916035567
seed: 7 rmse: 0.650618426693912
seed: 8 rmse: 0.6489780353511724
seed: 9 rmse: 0.6437565168320449
std: 0.008


# Q6

- Split the dataset like previously, use seed 9.
- Combine train and validation datasets.
- Train a model with r=0.001.
- What's the RMSE on the test dataset?

In [25]:
np.random.seed(9)
n = len(df)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()
df_trainval = df_shuffled.iloc[:n_train+n_val].copy()

y_train_orig = df_train.price.values
y_val_orig = df_val.price.values
y_test_orig = df_test.price.values
y_trainval_orig = df_trainval.price.values

y_train = np.log1p(df_train.price.values)
y_val = np.log1p(df_val.price.values)
y_test = np.log1p(df_test.price.values)
y_trainval = np.log1p(df_trainval.price.values)

del df_train['price']
del df_val['price']
del df_test['price']
del df_trainval['price']

X_train_zero = df_trainval.fillna({'reviews_per_month':0})
w_0_zero, w_zero = train_linear_regression_reg(X_train_zero, y_trainval, 0.001)

X_test_zero = df_test.fillna({'reviews_per_month':0})
y_pred_zero = w_0_zero + X_test_zero.dot(w_zero)
rmse_zero = rmse(y_test, y_pred_zero)
print("seed:", seed, "rmse:",rmse_zero)

seed: 9 rmse: 0.6452771348364293
