In [3]:
# !wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge

In [2]:
df = pd.read_csv('data/airbnb3.csv')

In [3]:
df.head().T

Unnamed: 0,0,1,2,3,4
id,2539,2595,3647,3831,5022
name,Clean & quiet apt home by the park,Skylit Midtown Castle,THE VILLAGE OF HARLEM....NEW YORK !,Cozy Entire Floor of Brownstone,Entire Apt: Spacious Studio/Loft by central park
host_id,2787,2845,4632,4869,7192
host_name,John,Jennifer,Elisabeth,LisaRoxanne,Laura
neighbourhood_group,Brooklyn,Manhattan,Manhattan,Brooklyn,Manhattan
neighbourhood,Kensington,Midtown,Harlem,Clinton Hill,East Harlem
latitude,40.64749,40.75362,40.80902,40.68514,40.79851
longitude,-73.97237,-73.98377,-73.9419,-73.95976,-73.94399
room_type,Private room,Entire home/apt,Private room,Entire home/apt,Entire home/apt
price,149,225,150,89,80


In [4]:
df.dtypes

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

In [5]:
df = df[['neighbourhood_group','room_type','latitude','longitude','price',
 'minimum_nights','number_of_reviews','reviews_per_month','calculated_host_listings_count',
 'availability_365']]

In [6]:
df.fillna(0,inplace=True)

In [7]:
df.head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,Private room,40.64749,-73.97237,149,1,9,0.21,6,365
1,Manhattan,Entire home/apt,40.75362,-73.98377,225,1,45,0.38,2,355
2,Manhattan,Private room,40.80902,-73.9419,150,3,0,0.0,1,365
3,Brooklyn,Entire home/apt,40.68514,-73.95976,89,1,270,4.64,1,194
4,Manhattan,Entire home/apt,40.79851,-73.94399,80,10,9,0.1,1,0


### Ques : 01 

* What is the most frequent observation (mode) for the column 'neighbourhood_group'?



In [9]:
df['neighbourhood_group'].mode()

0    Manhattan
dtype: object

* Split the data

In [10]:
y = df['price'].values
X = df
del X['price']


In [11]:
X.columns

Index(['neighbourhood_group', 'room_type', 'latitude', 'longitude',
       'minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'],
      dtype='object')

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [13]:
len(X_train), len(X_test), len(X_val)

(31292, 9779, 7824)

In [14]:
len(y_train), len(y_test), len(y_val)

(31292, 9779, 7824)

In [15]:
numerical = ['minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365']

categorical=['neighbourhood_group', 'room_type']

In [16]:
X_train_df = pd.DataFrame(X_train, columns=['neighbourhood_group', 'room_type', 'latitude', 'longitude',
       'minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'])

In [17]:
X_train_df.head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,Entire home/apt,40.71754,-73.95906,2,15,1.14,1,0
1,Manhattan,Entire home/apt,40.78784,-73.94998,5,88,2.23,1,326
2,Manhattan,Entire home/apt,40.7358,-73.9889,1,68,2.67,1,324
3,Brooklyn,Private room,40.70921,-73.94144,1,2,0.07,1,0
4,Manhattan,Entire home/apt,40.7241,-73.98959,7,9,0.35,1,0


In [18]:
df_new_numerical=X_train_df[['minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365']]

In [19]:
cols=df_new_numerical.columns
df_new_numerical[cols] = pd.to_numeric(df_new_numerical[cols].stack(), errors='coerce').unstack()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


### Correlation Matrix
Quest : 03

In [20]:
df_new_numerical.corr(method ='pearson')

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
minimum_nights,1.0,-0.074459,-0.118436,0.114218,0.136383
number_of_reviews,-0.074459,1.0,0.591234,-0.072782,0.174931
reviews_per_month,-0.118436,0.591234,1.0,-0.047711,0.166007
calculated_host_listings_count,0.114218,-0.072782,-0.047711,1.0,0.226329
availability_365,0.136383,0.174931,0.166007,0.226329,1.0


In [21]:
y_above_average_train=np.where((y_train>=152),1, 0)

In [22]:
y_above_average_val=np.where((y_val>=152),1, 0)

### Mutual Information

In [23]:
round(mutual_info_score(X_train_df.neighbourhood_group, y_above_average), 2)

NameError: name 'y_above_average' is not defined

In [24]:
round(mutual_info_score(X_train_df.room_type, y_above_average),2)

NameError: name 'y_above_average' is not defined

In [25]:
train_dicts = X_train_df.to_dict(orient='records')
X_train_dicts = DictVectorizer(sparse=False).fit_transform(train_dicts)


In [26]:
val_df=pd.DataFrame(X_val, columns=['neighbourhood_group', 'room_type', 'latitude', 'longitude',
       'minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'])

val_dicts = val_df.to_dict(orient='records')
val_dicts = DictVectorizer(sparse=False).fit_transform(val_dicts)


In [27]:
model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
model.fit(X_train_dicts, y_above_average_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=42)

In [28]:
y_pred=model.predict_proba(val_dicts)[:,1]
decisions=(y_pred >= 0.5)
val_df[decisions].head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
1,Brooklyn,Entire home/apt,40.68498,-73.96618,14,4,0.11,2,343
4,Manhattan,Entire home/apt,40.76075,-73.99893,30,0,0.0,18,365
6,Manhattan,Entire home/apt,40.73243,-74.00932,4,1,0.03,1,0
7,Manhattan,Entire home/apt,40.8063,-73.96268,7,2,0.05,1,0
9,Manhattan,Entire home/apt,40.80861,-73.94574,3,123,1.73,3,248


In [29]:
original_accuracy=round((y_above_average_val == decisions).mean(),2)
original_accuracy

0.79

### Quest : 5

* read data
* drop feature
* split
* train
* accuracy



In [30]:
def read_data():
    df = pd.read_csv('data/airbnb3.csv')
    return df


def hot_coding(y):
    return np.where((y>=152),1, 0)

def encoding(X, lst):
    df=pd.DataFrame(X, columns=lst)
    X_dicts = df.to_dict(orient='records')
    return DictVectorizer(sparse=False).fit_transform(X_dicts)

    
def split_data(X, y):
    return train_test_split(X, y, test_size=0.2, random_state=42)

def train(X, y):
    model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
    return model.fit(X, y)

def predict_accuracy(model, val, y):
    df_pred = pd.DataFrame()
    df_pred['probability'] = model.predict_proba(val)[:,1]
    df_pred['prediction'] = ((model.predict_proba(val)[:,1]) >= 0.5).astype(int)
    df_pred['actual'] = y
    df_pred['correct'] = df_pred.prediction == df_pred.actual
    return df_pred.correct.mean()

#### Drop *neighbourhood_group*

In [31]:
df_neighbourhood_group = read_data()

df_neighbourhood_group = df_neighbourhood_group[['neighbourhood_group','room_type','latitude','longitude','price',
 'minimum_nights','number_of_reviews','reviews_per_month','calculated_host_listings_count',
 'availability_365']]

df_neighbourhood_group.fillna(0,inplace=True)
y = df_neighbourhood_group['price'].values
y = hot_coding(y)
X = df_neighbourhood_group
del X['price']
del X['neighbourhood_group']

# X = encoding(X)


X_train, X_test, y_train, y_test = split_data(X, y)
X_train, X_val, y_train, y_val = split_data(X_train, y_train)

# encoding , convert train and val back to pandas
lst = ['room_type', 'latitude', 'longitude',
       'minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365']


X_train = encoding(X_train, lst)
X_val = encoding(X_val, lst)


model = train(X_train, y_train)

accuracy_neighbourhood_group = predict_accuracy(model, X_val, y_val)
original_accuracy - accuracy_neighbourhood_group

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.03744376278118611

#### Drop *room_type*

In [33]:
df_room_type = read_data()

df_room_type = df_room_type[['neighbourhood_group','room_type','latitude','longitude','price',
 'minimum_nights','number_of_reviews','reviews_per_month','calculated_host_listings_count',
 'availability_365']]

df_room_type.fillna(0,inplace=True)
y = df_room_type['price'].values
y = hot_coding(y)
X = df_room_type
del X['price']
del X['room_type']

# X = encoding(X)


X_train, X_test, y_train, y_test = split_data(X, y)
X_train, X_val, y_train, y_val = split_data(X_train, y_train)


# encoding , convert train and val back to pandas
lst = ['neighbourhood_group', 'latitude', 'longitude',
       'minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365']


X_train = encoding(X_train, lst)
X_val = encoding(X_val, lst)


model = train(X_train, y_train)

accuracy_room_type = predict_accuracy(model, X_val, y_val)
original_accuracy - accuracy_room_type

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.07425357873210636

#### Drop *number_of_reviews*

In [35]:
df_number_of_reviews = read_data()

df_number_of_reviews = df_number_of_reviews[['neighbourhood_group','room_type','latitude','longitude','price',
 'minimum_nights','number_of_reviews','reviews_per_month','calculated_host_listings_count',
 'availability_365']]

df_number_of_reviews.fillna(0,inplace=True)
y = df_number_of_reviews['price'].values
y = hot_coding(y)
X = df_number_of_reviews
del X['price']
del X['number_of_reviews']

# X = encoding(X)


X_train, X_test, y_train, y_test = split_data(X, y)
X_train, X_val, y_train, y_val = split_data(X_train, y_train)

# encoding , convert train and val back to pandas
lst = ['neighbourhood_group', 'room_type', 'latitude', 'longitude',
       'minimum_nights', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365']


X_train = encoding(X_train, lst)
X_val = encoding(X_val, lst)


model = train(X_train, y_train)

accuracy_neighbourhood_group = predict_accuracy(model, X_val, y_val)
original_accuracy - accuracy_neighbourhood_group

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.00293456032719841

#### Drop *reviews_per_month*

In [37]:
df_reviews_per_month = read_data()

df_reviews_per_month = df_reviews_per_month[['neighbourhood_group','room_type','latitude','longitude','price',
 'minimum_nights','number_of_reviews','reviews_per_month','calculated_host_listings_count',
 'availability_365']]

df_reviews_per_month.fillna(0,inplace=True)
y = df_reviews_per_month['price'].values
y = hot_coding(y)
X = df_reviews_per_month
del X['price']
del X['reviews_per_month']

# X = encoding(X)


X_train, X_test, y_train, y_test = split_data(X, y)
X_train, X_val, y_train, y_val = split_data(X_train, y_train)

# encoding , convert train and val back to pandas
lst = ['neighbourhood_group', 'room_type', 'latitude', 'longitude',
       'minimum_nights', 'number_of_reviews',
       'calculated_host_listings_count', 'availability_365']


X_train = encoding(X_train, lst)
X_val = encoding(X_val, lst)


model = train(X_train, y_train)

accuracy_neighbourhood_group = predict_accuracy(model, X_val, y_val)
original_accuracy - accuracy_neighbourhood_group

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.004340490797546059


### Linear regression

In [38]:
def divData(df, seed):
    n = len(df)
    n_val = int(n*0.2)
    n_test = int(n*0.2)
    n_train = n-n_val-n_test
    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)
    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:n_train+n_val]]
    df_test = df.iloc[idx[n_train+n_val:]]
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)
    y_train = np.log1p(df_train['price']).values
    y_val = np.log1p(df_val['price']).values
    y_test = np.log1p(df_test['price']).values
    del df_train['price']
    del df_val['price']
    del df_test['price']
    
    return df_train, df_val, df_test, y_train, y_val, y_test

In [66]:
def rmse(y, y_pred):
    error = y - y_pred
    se = error ** 2
    mse = se.mean()
    return np.sqrt(mse)

False

In [63]:
df = read_data()
df = df[['neighbourhood_group','room_type','latitude','longitude','price',
 'minimum_nights','number_of_reviews','reviews_per_month','calculated_host_listings_count',
 'availability_365']]

df = df.fillna(0)
# df = df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

df_train, df_val, df_test, y_train, y_val, y_test = divData(df, 42)


df.fillna(0,inplace=True)
y = df['price'].values
y = np.log1p(y)
X = df
del X['price']



X_train, X_test, y_train, y_test = split_data(X, y)
X_train, X_val, y_train, y_val = split_data(X_train, y_train)

# encoding , convert train and val back to pandas
lst = ['neighbourhood_group','room_type','latitude','longitude','price',
 'minimum_nights','number_of_reviews','reviews_per_month','calculated_host_listings_count',
 'availability_365']


X_train = encoding(X_train, lst)
X_val = encoding(X_val, lst)

X_train = np.nan_to_num(X_train, 0)
X_val = np.nan_to_num(X_val, 0)
model = Ridge(alpha=0.1)
    
model.fit(X_train, y_train)
prediction = model.predict(X_val)

In [46]:
alphas = [0, 0.01, 0.1, 1, 10]

array([4.1116714 , 5.2125734 , 5.0062276 , ..., 3.96704047, 5.26803039,
       4.8199891 ])

In [68]:
for alpha in alphas:
    df = read_data()
    df = df[['neighbourhood_group','room_type','latitude','longitude','price',
     'minimum_nights','number_of_reviews','reviews_per_month','calculated_host_listings_count',
     'availability_365']]

    df.fillna(0,inplace=True)
    y = df['price'].values
    y = np.log1p(y)
    X = df
    del X['price']



    X_train, X_test, y_train, y_test = split_data(X, y)
    X_train, X_val, y_train, y_val = split_data(X_train, y_train)

    # encoding , convert train and val back to pandas
    lst = ['neighbourhood_group','room_type','latitude','longitude','price',
     'minimum_nights','number_of_reviews','reviews_per_month','calculated_host_listings_count',
     'availability_365']


    X_train = encoding(df_train, lst)
    X_val = encoding(df_val, lst)
    
    X_train = np.nan_to_num(X_train, 0)
    X_val = np.nan_to_num(X_val, 0)
    
    model = Ridge(alpha=alpha)
    
    model.fit(X_train, y_train)
    prediction = model.predict(X_val)
    
    
    print(f'Alpha: {alpha}, RMSE: {round(rmse(prediction, y_val),3)}')
    
    print('')

ValueError: Found input variables with inconsistent numbers of samples: [29337, 31292]