In [34]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

## Question1

In [2]:
data = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv")

In [3]:
features= ['neighbourhood_group','room_type','latitude','longitude','price','minimum_nights','number_of_reviews','reviews_per_month','calculated_host_listings_count','availability_365']

In [4]:
df = data[features]

In [5]:
df=df.fillna(0)

In [6]:
df['neighbourhood_group'].mode(dropna=False)

0    Manhattan
dtype: object

In [7]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [8]:
len(df_train), len(df_val), len(df_test)

(29337, 9779, 9779)

In [9]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [10]:
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

del df_train['price']
del df_val['price']
del df_test['price']

## Question2

In [11]:
corr_df = df.corr()

In [12]:
corr_df.unstack().sort_values(ascending=False).head(len(features))

latitude                        latitude                          1.000000
longitude                       longitude                         1.000000
calculated_host_listings_count  calculated_host_listings_count    1.000000
reviews_per_month               reviews_per_month                 1.000000
number_of_reviews               number_of_reviews                 1.000000
minimum_nights                  minimum_nights                    1.000000
price                           price                             1.000000
availability_365                availability_365                  1.000000
reviews_per_month               number_of_reviews                 0.589407
number_of_reviews               reviews_per_month                 0.589407
dtype: float64

In [105]:
above_average = (df_full_train.price <= 152).astype(int)

In [106]:
above_average

32645    0
23615    1
31183    1
29260    1
7275     1
        ..
11284    1
44732    1
38158    1
860      1
15795    1
Name: price, Length: 39116, dtype: int64

## Question3

In [107]:
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

In [108]:
categorical_columns

['neighbourhood_group', 'room_type']

In [109]:
def mutual_info_score_to_price(x):
    return mutual_info_score(x, df_full_train.price)


In [110]:
mi = round(df_full_train[categorical_columns].apply(mutual_info_score_to_price),2)
mi.sort_values(ascending=False)

room_type              0.31
neighbourhood_group    0.11
dtype: float64

## Question4

In [136]:
numerical = list(df.dtypes[df.dtypes != 'object'].index)

In [137]:
numerical.remove("price")

In [113]:
numerical

['latitude',
 'longitude',
 'minimum_nights',
 'number_of_reviews',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365']

In [124]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical_columns+numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical_columns+numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

y_train_binary = (y_train <= 152).astype(int)
y_val_binary = (y_val <= 152).astype(int)

In [125]:
model = LogisticRegression(solver="liblinear", C=1.0, random_state=42)

In [126]:
model.fit(X_train, y_train_binary)

LogisticRegression(random_state=42, solver='liblinear')

In [127]:
model.intercept_[0]

0.09546761934671312

In [128]:
model.coef_[0].round(3)

array([-3.000e-03, -3.000e-03,  5.833e+00,  3.175e+00,  1.200e-02,
        7.600e-02, -1.320e-01, -1.587e+00,  1.600e-02,  1.722e+00,
        3.000e-03,  4.200e-02, -1.944e+00,  8.350e-01,  1.205e+00])

In [129]:
y_pred = model.predict_proba(X_val)[:, 1]

In [130]:
price_decision = (y_pred >= 0.5)

In [132]:
common_acc = round((y_val_binary == price_decision).mean(),2)

In [133]:
common_acc

0.79

## Question5

In [166]:
no_columns = df_val.columns      
less_diff=10000000000
base  = df_train.columns
res = ""


for n in no_columns:
    
    subset_base = list(filter(lambda x: ( x != n), base)) 
    df_train_subset = df_train[subset_base]
    df_val_subset = df_val[subset_base]
    
    dictionary_train_base_subset = df_train_subset.to_dict(orient='records')
    X_train_subset = dv.transform(dictionary_train_base_subset)
    
    dictionary_val_base_subset = df_val_subset = df_val[subset_base].to_dict(orient='records')
    X_val_subset = dv.transform(dictionary_val_base_subset)
    
    model = LogisticRegression(solver="liblinear", C=1.0, random_state=42)
    
    model.fit(X_train_subset, y_train_binary)
    
    y_pred = model.predict_proba(X_val_subset)[:, 1]
    acc = (y_val_binary == price_decision).mean()
    if less_diff > (common_acc - acc):
        less_diff = (common_acc - acc)
        res = n
        
print(less_diff)
print(res)

-0.000264853256979225
neighbourhood_group


## Question 6

In [180]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [168]:
from sklearn.linear_model import Ridge

In [182]:
log_price_train = np.log1p(y_train)
log_price_val = np.log1p(y_val)

In [183]:
alpha_indexes = [0, 0.01, 0.1, 1, 10]

In [187]:
for n in alpha_indexes:
    model = Ridge(alpha=n)
    model.fit(X_train, log_price_train)
    log_y_pred = linridge.predict(X_val)
    res = rmse(log_price_val, log_y_pred)
    print(n, round(res,3))

0 0.498
0.01 0.498
0.1 0.498
1 0.498
10 0.498
