In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('../linear_regression_models/AB_NYC_2019.csv')

In [3]:
base = ['neighbourhood_group',
       'room_type', 'latitude', 'longitude', 'price',
       'minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365']

In [4]:
df_base = df[base]

In [5]:
df_base.isnull().sum()

neighbourhood_group                   0
room_type                             0
latitude                              0
longitude                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [6]:
df_base = df_base.fillna(0)

In [7]:
# Let's make data look nicer

df_base.columns = df_base.columns.str.lower().str.replace(" ", "_")

cols = list(df_base.dtypes[df_base.dtypes == 'object'].index)

for col in cols:
    df_base[col] = df_base[col].str.lower().str.replace(" ", "_")

In [8]:
df_base.head(3)

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,brooklyn,private_room,40.64749,-73.97237,149,1,9,0.21,6,365
1,manhattan,entire_home/apt,40.75362,-73.98377,225,1,45,0.38,2,355
2,manhattan,private_room,40.80902,-73.9419,150,3,0,0.0,1,365


# Q1

In [9]:
df_base['neighbourhood_group'].value_counts()

manhattan        21661
brooklyn         20104
queens            5666
bronx             1091
staten_island      373
Name: neighbourhood_group, dtype: int64

In [10]:
len(df_base)

48895

In [11]:
full_train, df_test = train_test_split(df_base, test_size=0.2, random_state=42)
df_train, df_valid = train_test_split(full_train, test_size=0.25, random_state=42)

In [12]:
len(full_train), len(df_test)

(39116, 9779)

In [13]:
len(df_train), len(df_valid)

(29337, 9779)

In [14]:
y_full_train = full_train['price'].values
y_test = df_test['price'].values
y_train = df_train['price'].values
y_valid = df_valid['price'].values

In [15]:
full_train = full_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_valid = df_valid.reset_index(drop=True)

In [16]:
del full_train['price']
del df_test['price']
del df_train['price']
del df_valid['price']

In [17]:
y_full_train = (y_full_train >= 152).astype(int)
y_test = (y_test >= 152).astype(int)
y_train = (y_train >= 152).astype(int)
y_valid = (y_valid >= 152).astype(int)

# Q2

In [18]:
full_train.dtypes

neighbourhood_group                object
room_type                          object
latitude                          float64
longitude                         float64
minimum_nights                      int64
number_of_reviews                   int64
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

In [19]:
full_train.nunique()

neighbourhood_group                   5
room_type                             3
latitude                          17462
longitude                         13527
minimum_nights                      100
number_of_reviews                   376
reviews_per_month                   903
calculated_host_listings_count       47
availability_365                    366
dtype: int64

In [20]:
numerical = ['latitude', 'longitude', 'minimum_nights',
            'number_of_reviews', 'reviews_per_month',
            'calculated_host_listings_count', 'availability_365']

categorical = ['neighbourhood_group', 'room_type']

In [21]:
# correlation matrix
matrix = df_base[numerical].corr().to_dict()

In [22]:
def give_highest(matrix):
    for key, inner_dict in matrix.items():
        del inner_dict[key]
        result = dict(sorted(inner_dict.items(), key=lambda item: item[1], reverse=True))
        matrix[key] = result
    
    return matrix

In [23]:
give_highest(matrix)

{'latitude': {'longitude': 0.08478836838942543,
  'minimum_nights': 0.024869274138726128,
  'calculated_host_listings_count': 0.019517351185378132,
  'availability_365': -0.010983458290208541,
  'number_of_reviews': -0.015388804497945684,
  'reviews_per_month': -0.01875772712330614},
 'longitude': {'reviews_per_month': 0.1385161659552922,
  'latitude': 0.08478836838942543,
  'availability_365': 0.08273074786310534,
  'number_of_reviews': 0.05909428794877501,
  'minimum_nights': -0.06274711429076898,
  'calculated_host_listings_count': -0.11471279117178322},
 'minimum_nights': {'availability_365': 0.14430306319924938,
  'calculated_host_listings_count': 0.1279596294349121,
  'latitude': 0.024869274138726128,
  'longitude': -0.06274711429076898,
  'number_of_reviews': -0.08011606824164533,
  'reviews_per_month': -0.1249049651159733},
 'number_of_reviews': {'reviews_per_month': 0.5894072970835077,
  'availability_365': 0.17202758146293173,
  'longitude': 0.05909428794877501,
  'latitude':

In [25]:
df_base[numerical].corr().unstack().sort_values(ascending=False)

latitude                        latitude                          1.000000
longitude                       longitude                         1.000000
calculated_host_listings_count  calculated_host_listings_count    1.000000
reviews_per_month               reviews_per_month                 1.000000
minimum_nights                  minimum_nights                    1.000000
number_of_reviews               number_of_reviews                 1.000000
availability_365                availability_365                  1.000000
number_of_reviews               reviews_per_month                 0.589407
reviews_per_month               number_of_reviews                 0.589407
availability_365                calculated_host_listings_count    0.225701
calculated_host_listings_count  availability_365                  0.225701
availability_365                number_of_reviews                 0.172028
number_of_reviews               availability_365                  0.172028
availability_365         

<h3>Turn price into binary</h3>

In [216]:
above_average = (df_base.price >= 152).astype(int)

In [217]:
len(above_average)

48895

In [218]:
new_df = pd.DataFrame()
new_df['binary'] = above_average
new_df['price'] = df_base.price

In [367]:
new_df.head(5)

Unnamed: 0,binary,price
0,0,149
1,1,225
2,0,150
3,0,89
4,0,80


# Q3

In [220]:
from sklearn.metrics import mutual_info_score

In [221]:
def calc_mutual(series):
    return mutual_info_score(series, above_average)

In [222]:
result = df_base[categorical].apply(calc_mutual)

In [223]:
round(result.sort_values(ascending=False).to_frame('Mut. info'), 2)

Unnamed: 0,Mut. info
room_type,0.14
neighbourhood_group,0.05


# Q4

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

In [29]:
train_dict = df_train[categorical + numerical].to_dict(orient='records')
valid_dict = df_valid[categorical + numerical].to_dict(orient='records')

In [30]:
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)

In [31]:
dv.get_feature_names()

['availability_365',
 'calculated_host_listings_count',
 'latitude',
 'longitude',
 'minimum_nights',
 'neighbourhood_group=bronx',
 'neighbourhood_group=brooklyn',
 'neighbourhood_group=manhattan',
 'neighbourhood_group=queens',
 'neighbourhood_group=staten_island',
 'number_of_reviews',
 'reviews_per_month',
 'room_type=entire_home/apt',
 'room_type=private_room',
 'room_type=shared_room']

In [32]:
model = LogisticRegression(solver='liblinear', dual=False, max_iter=2000,
                           C=1.0, random_state=42)

In [33]:
model.fit(X_train, y_train)

LogisticRegression(max_iter=2000, random_state=42, solver='liblinear')

In [34]:
model.intercept_

array([-0.09158457])

In [35]:
model.coef_.round(3)

array([[ 3.000e-03,  4.000e-03, -5.818e+00, -3.166e+00, -1.100e-02,
        -8.200e-02,  1.250e-01,  1.576e+00, -2.900e-02, -1.681e+00,
        -3.000e-03, -4.200e-02,  1.957e+00, -8.200e-01, -1.228e+00]])

In [36]:
X_valid = dv.transform(valid_dict)

In [38]:
soft_predictions = model.predict_proba(X_valid)[:, 1]

In [39]:
soft_predictions

array([0.02878758, 0.59581868, 0.4262335 , ..., 0.11418704, 0.03457736,
       0.52841542])

In [40]:
dict(zip(dv.get_feature_names(), model.coef_[0].round(2)))

{'availability_365': 0.0,
 'calculated_host_listings_count': 0.0,
 'latitude': -5.82,
 'longitude': -3.17,
 'minimum_nights': -0.01,
 'neighbourhood_group=bronx': -0.08,
 'neighbourhood_group=brooklyn': 0.13,
 'neighbourhood_group=manhattan': 1.58,
 'neighbourhood_group=queens': -0.03,
 'neighbourhood_group=staten_island': -1.68,
 'number_of_reviews': -0.0,
 'reviews_per_month': -0.04,
 'room_type=entire_home/apt': 1.96,
 'room_type=private_room': -0.82,
 'room_type=shared_room': -1.23}

In [41]:
decision_threshold = (soft_predictions >= 0.5)

In [42]:
decision_threshold

array([False,  True, False, ..., False, False,  True])

In [238]:
round((decision_threshold == y_valid).mean(),2)

0.79

In [239]:
df_temp = pd.DataFrame()
df_temp['soft pred.'] = soft_predictions
df_temp['pred.'] = decision_threshold.astype(int)
df_temp['real data'] = y_valid
df_temp['correct'] = df_temp['pred.'] == df_temp['real data']

In [240]:
df_temp

Unnamed: 0,soft pred.,pred.,real data,correct
0,0.028788,0,0,True
1,0.595819,1,0,False
2,0.426234,0,1,False
3,0.074923,0,0,True
4,0.811784,1,1,True
...,...,...,...,...
9774,0.637550,1,1,True
9775,0.010139,0,0,True
9776,0.114187,0,0,True
9777,0.034577,0,0,True


# Q5

In [315]:
class FeatureElimination:
    def __init__(self, initial_acc, train_df, valid_df, y_train, y_valid):
        self.initial_acc = initial_acc
        self.train_df = train_df
        self.valid_df = valid_df
        self.y_tr = y_train
        self.y_val = y_valid
        self.categorical = ['neighbourhood_group', 'room_type']
        self.numerical = ['latitude', 'longitude', 'minimum_nights',
            'number_of_reviews', 'reviews_per_month',
            'calculated_host_listings_count', 'availability_365']
        self.dv = DictVectorizer(sparse=False)
        self.model = LogisticRegression(solver='liblinear', dual=False, max_iter=2000,
                                        C=1.0, random_state=42)
    
    
    def get_result(self):
        all_features = self.categorical + self.numerical
        levels = {}
        
        for i in range(len(all_features)):
            curr_drop = all_features[i]
            
            features = self._remove_element(all_features, i)
            X_train, X_valid = self._transform_data(self.train_df, self.valid_df, features)
            
            self._train_model(X_train, self.y_tr)
            soft_predictions = self._predict_data(X_valid)
            curr_acc = self._get_accuracy(soft_predictions)
            
            levels[curr_drop] = (round(curr_acc, 2), self.initial_acc - curr_acc)
        
        return levels    
    
    def _remove_element(self, features, idx):
        if idx == 0:
            features = features[1:]
        elif idx == len(features) - 1:
            features = features[:-1]
        else:
            features = features[:idx] + features[idx + 1:]
        
        return features
    
    def _transform_data(self, train, valid, features):
        train_dict = train[features].to_dict(orient='records')
        valid_dict = valid[features].to_dict(orient='records')
        
        X_train = dv.fit_transform(train_dict)
        X_valid = dv.transform(valid_dict)
        
        return X_train, X_valid
    
    def _train_model(self, X, y):
        self.model.fit(X, y)
    
    def _predict_data(self, X):
        return self.model.predict_proba(X)[:, 1]
    
    def _get_accuracy(self, data, threshold=0.5):
        price_level = (data >= threshold)
        result = (price_level == self.y_val).mean()
        return result


In [316]:
feature_elim = FeatureElimination(0.79, df_train, df_valid, y_train, y_valid)
feature_elim.get_result()

{'neighbourhood_group': (0.75, 0.040127824930974554),
 'room_type': (0.73, 0.06006851416300241),
 'latitude': (0.79, 0.003109724920748569),
 'longitude': (0.79, 0.003007464975968932),
 'minimum_nights': (0.79, -0.0014919725943347562),
 'number_of_reviews': (0.79, -0.0014919725943347562),
 'reviews_per_month': (0.79, -0.0007761529808774092),
 'calculated_host_listings_count': (0.79, 0.00034870641169859606),
 'availability_365': (0.78, 0.008631761938848626)}

# Q6

In [324]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

In [329]:
df_base.price = np.log1p(df_base.price)

In [330]:
full_train, df_test = train_test_split(df_base, test_size=0.2, random_state=5)
df_train, df_valid = train_test_split(full_train, test_size=0.25, random_state=5)

In [331]:
y_full_train = full_train.price.values
y_test = df_test.price.values
y_train = df_train.price.values
y_valid = df_valid.price.values

In [332]:
del full_train['price']
del df_test['price']
del df_train['price']
del df_valid['price']

In [334]:
df_base.isnull().sum()

neighbourhood_group               0
room_type                         0
latitude                          0
longitude                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

In [353]:
dict_train = df_train.to_dict(orient='records')
dict_valid = df_valid.to_dict(orient='records')

In [354]:
dv = DictVectorizer(sparse=False)

In [355]:
X_train = dv.fit_transform(dict_train)
X_valid = dv.transform(dict_valid)

In [364]:
from sklearn.metrics import mean_squared_error

def train_predict(a=0):
    model = Ridge(alpha=a)
    model.fit(X_train, y_train)
    
    pred = model.predict(X_valid)  
    rmse = np.sqrt(mean_squared_error(y_valid, pred))
    return round(rmse, 7)

In [365]:
results = {}

for i in [0, 0.01, 0.1, 1, 10]:
    results[i] = train_predict(a=i)

  return linalg.solve(A, Xy, sym_pos=True,


In [366]:
results

{0: 0.4863137, 0.01: 0.4863136, 0.1: 0.486313, 1: 0.4863199, 10: 0.4870211}