In [2]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.metrics import accuracy_score
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import SVG
from graphviz import Source
from IPython.display import display
from IPython.display import HTML
style = "<style>svg{width: 40% !important; height: 40% !important;} </style>"
HTML(style)

## Question1

In [3]:
common_df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv')

In [4]:
features = ['latitude','longitude','price','minimum_nights','number_of_reviews','reviews_per_month','calculated_host_listings_count','availability_365']

In [5]:
df = common_df[features]

In [6]:
df.isna().sum()

latitude                              0
longitude                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

## Question2

In [7]:
median = df['minimum_nights'].median()

In [8]:
median

3.0

In [9]:
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

In [10]:
n

48895

In [11]:
n_val, n_test, n_train

(9779, 9779, 29337)

In [12]:
df_train = df.iloc[:n_train]
df_val = df.iloc[n_train:n_train+n_val]
df_test = df.iloc[n_train+n_val:]

In [13]:
len(df_train), len(df_val), len(df_test)

(29337, 9779, 9779)

In [14]:
random_set = np.arange(n)

In [15]:
np.random.seed(42)

In [16]:
np.random.shuffle(random_set)

In [17]:
df_train = df.iloc[random_set[:n_train]]
df_val = df.iloc[random_set[n_train:n_train+n_val]]
df_test = df.iloc[random_set[n_train+n_val:]]

In [18]:
len(df_train), len(df_val), len(df_test)

(29337, 9779, 9779)

In [19]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [20]:
y_train = np.log1p(df_train.price.values)
y_val = np.log1p(df_val.price.values)
y_test = np.log1p(df_test.price.values)

In [21]:
del df_train['price']
del df_val['price']
del df_test['price']

## Question3

In [22]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [23]:
def prepare_X(df):
    df = df.copy()
    df_num = df[base]
    df_num = df_num.fillna(0)
    X = df_num.values

    return X

In [24]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [25]:
X_train = df_train.fillna(0).values

w0, w = train_linear_regression(X_train, y_train)

y_pred_fill_zero= w0 + X_train.dot(w)

In [26]:
y_pred_fill_zero

array([4.7827747 , 4.45947456, 4.89702083, ..., 4.82048871, 4.86944871,
       4.61612559])

In [27]:
rmse_fill_zero = rmse(y_train, y_pred_fill_zero)

In [28]:
X_train = df_train.fillna(df.mean()).values

w0, w = train_linear_regression(X_train, y_train)

y_pred_fill_mean = w0 + X_train.dot(w)

In [29]:
rmse_fill_mean =rmse(y_train, y_pred_fill_mean)

In [30]:
round(rmse_fill_mean, 2)

0.64

In [31]:
round(rmse_fill_zero, 2)

0.64

## Question4

In [32]:
for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    X_train = prepare_X(df_train)
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)

    X_val = prepare_X(df_val)
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)
    
    print(r, w0, score)

NameError: name 'base' is not defined

In [33]:
def train_linear_regression_reg(X, y, r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])

    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [34]:
base = ['latitude','longitude','minimum_nights','number_of_reviews','reviews_per_month','calculated_host_listings_count','availability_365']

## Question 5

In [35]:
using_df = common_df[features]

seed_array = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

res_score = np.zeros(len(seed_array), dtype='float32')

for index, s in np.ndenumerate(seed_array):
    np.random.seed(s)
    np.random.shuffle(random_set)
    
    new_df_train = using_df.iloc[random_set[:n_train]]
    new_df_val = using_df.iloc[random_set[n_train:n_train+n_val]]
    new_df_test = using_df.iloc[random_set[n_train+n_val:]]
    
    new_df_train = new_df_train.reset_index(drop=True)
    new_df_val = new_df_val.reset_index(drop=True)
    new_df_test = new_df_test.reset_index(drop=True)
    
    
    new_y_train = np.log1p(new_df_train.price.values)
    new_y_val = np.log1p(new_df_val.price.values)
    new_y_test = np.log1p(new_df_test.price.values)
   
    del new_df_train['price']
    del new_df_val['price']
    del new_df_test['price']
    
    new_X_train = prepare_X(new_df_train)
    
    new_w0, new_w = train_linear_regression(new_X_train, new_y_train)
    
    new_X_val = prepare_X(new_df_val)
    
    new_y_pred = new_w0 + new_X_val.dot(new_w)
    
    score =rmse(new_y_val, new_y_pred)
    res_score[index]= score
    
    
print(res_score)
res = round(np.std(res_score),3)
print(res)
    

[0.6442382  0.63990843 0.643185   0.63572544 0.64450485 0.6580019
 0.6478876  0.6430618  0.64940655 0.6548455 ]
0.006


## Question6

In [47]:
using_df = common_df[features]

seed = 9

np.random.seed(seed)
np.random.shuffle(random_set)
    
q6_df_train = using_df.iloc[random_set[:n_train]]
q6_df_val = using_df.iloc[random_set[n_train:n_train+n_val]]
q6_df_test = using_df.iloc[random_set[n_train+n_val:]]
df_total = pd.concat([q6_df_train, q6_df_val], axis=0)

q6_df_test = q6_df_test.reset_index(drop=True)

y_total = np.log1p(df_total.pop('price'))
q6_y_test = np.log1p(q6_df_test.pop('price'))

X_common = prepare_X(df_total)
X_test = prepare_X(q6_df_test)

w0, w = train_linear_regression_reg(X_common, y_total, r=0.001)


y_pred = w0 + X_test.dot(w)
score = rmse(q6_y_test, y_pred)   
round(score,2)   

0.65