# **Part1: Feature Engineer**

Part 1.1: Read data

In [22]:
import os
from google.colab import drive
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

In [23]:
# mount google drive
# Mount Google Drive
drive.mount('/content/drive')

# Navigate to your Google Drive home directory
os.chdir('/content/drive/MyDrive')

# Now you can use os.getcwd() to see your current directory
print(os.getcwd())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive


In [24]:
# import data
filename = os.path.join(os.getcwd(), "Break Through AI ", "Code Practise", "data", "airbnbListingsData.csv")
# convert to pandas dataframe
df = pd.read_csv(filename, header = 0)

In [25]:
# dive into data
print(df.shape)
df.head()

(28022, 50)


Unnamed: 0,name,description,neighborhood_overview,host_name,host_location,host_about,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,n_host_verifications
0,Skylit Midtown Castle,"Beautiful, spacious skylit studio in the heart...",Centrally located in the heart of Manhattan ju...,Jennifer,"New York, New York, United States",A New Yorker since 2000! My passion is creatin...,0.8,0.17,True,8.0,...,4.79,4.86,4.41,False,3,3,0,0,0.33,9
1,"Whole flr w/private bdrm, bath & kitchen(pls r...","Enjoy 500 s.f. top floor in 1899 brownstone, w...",Just the right mix of urban center and local n...,LisaRoxanne,"New York, New York, United States",Laid-back Native New Yorker (formerly bi-coast...,0.09,0.69,True,1.0,...,4.8,4.71,4.64,False,1,1,0,0,4.86,6
2,"Spacious Brooklyn Duplex, Patio + Garden",We welcome you to stay in our lovely 2 br dupl...,,Rebecca,"Brooklyn, New York, United States","Rebecca is an artist/designer, and Henoch is i...",1.0,0.25,True,1.0,...,5.0,4.5,5.0,False,1,1,0,0,0.02,3
3,Large Furnished Room Near B'way,Please don’t expect the luxury here just a bas...,"Theater district, many restaurants around here.",Shunichi,"New York, New York, United States",I used to work for a financial industry but no...,1.0,1.0,True,1.0,...,4.42,4.87,4.36,False,1,0,1,0,3.68,4
4,Cozy Clean Guest Room - Family Apt,"Our best guests are seeking a safe, clean, spa...",Our neighborhood is full of restaurants and ca...,MaryEllen,"New York, New York, United States",Welcome to family life with my oldest two away...,,,True,1.0,...,4.95,4.94,4.92,False,1,0,1,0,0.87,7


### Part 1.2: One hot encoding

In [26]:
# one-hot encoding
from sklearn.preprocessing import OneHotEncoder
to_encode = list(df.select_dtypes(include=['object']).columns)
df[to_encode].nunique()

name                            27386
description                     25952
neighborhood_overview           15800
host_name                        7566
host_location                    1364
host_about                      11962
neighbourhood_group_cleansed        5
room_type                           4
amenities                       25020
dtype: int64

In [27]:
# remove columns with too many different values
columns_to_remove=['name', 'description', 'neighborhood_overview', 'host_about', 'amenities', 'host_name', 'host_location']
df.drop(columns_to_remove, axis=1, inplace=True)
to_encode = list(df.select_dtypes(include=['object']).columns)
df[to_encode].nunique()

neighbourhood_group_cleansed    5
room_type                       4
dtype: int64

In [28]:
# Create the encoder:
encoder = OneHotEncoder(handle_unknown="error", sparse=False)

# Apply the encoder:
df_enc = pd.DataFrame(encoder.fit_transform(df[to_encode]))

# Reinstate the original column names:
df_enc.columns = encoder.get_feature_names_out(to_encode)



In [29]:
df_enc.head()

Unnamed: 0,neighbourhood_group_cleansed_Bronx,neighbourhood_group_cleansed_Brooklyn,neighbourhood_group_cleansed_Manhattan,neighbourhood_group_cleansed_Queens,neighbourhood_group_cleansed_Staten Island,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room
0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [30]:
df = df.join(df_enc)
df.drop(to_encode, axis=1, inplace=True)
print(df.shape)
df.head()

(28022, 50)


Unnamed: 0,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,accommodates,bathrooms,bedrooms,...,n_host_verifications,neighbourhood_group_cleansed_Bronx,neighbourhood_group_cleansed_Brooklyn,neighbourhood_group_cleansed_Manhattan,neighbourhood_group_cleansed_Queens,neighbourhood_group_cleansed_Staten Island,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room
0,0.8,0.17,True,8.0,8.0,True,True,1,1.0,,...,9,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.09,0.69,True,1.0,1.0,True,True,3,1.0,1.0,...,6,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,0.25,True,1.0,1.0,True,True,4,1.5,2.0,...,3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.0,1.0,True,1.0,1.0,True,True,2,1.0,1.0,...,4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,,,True,1.0,1.0,True,True,1,1.0,1.0,...,7,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


### Part 1.3: Replace outliers

In [31]:
# replace outliers of price
import scipy.stats as stats
df['label_price'] = stats.mstats.winsorize(df['price'], limits=[0.01, 0.01])
len(np.unique(df['label_price'] - df['price']))

28

###Part 1.4: Replace missing values with means

In [32]:
# replace missing values with means
nan_count = np.sum(df.isnull(), axis=0)
nan_detect= nan_count != 0
is_numeric_dtype = (df.dtypes=='int64') | (df.dtypes=='float64')
calculate_ave = nan_detect & is_numeric_dtype
calculate_ave_cols = df.columns[calculate_ave]
print(calculate_ave_cols)


Index(['host_response_rate', 'host_acceptance_rate', 'bedrooms', 'beds'], dtype='object')


In [33]:
to_impute_selected = ['host_response_rate', 'host_acceptance_rate', 'bedrooms', 'beds']
for col in to_impute_selected:
  col_average = df[col].mean()
  df[col].fillna(value=col_average, inplace=True)
for colname in to_impute_selected:
  print("{} missing values count :{}".format(colname, np.sum(df[colname].isnull(), axis = 0)))

host_response_rate missing values count :0
host_acceptance_rate missing values count :0
bedrooms missing values count :0
beds missing values count :0


###Part 1.5: Identify and delete features with small correlation with the label

In [34]:
# Identify Features With the Highest Correlation With the Label
corrs = df.corr()['label_price']
corrs_sorted = corrs.sort_values(ascending=True)
corrs_sorted

room_type_Private room                         -0.364792
neighbourhood_group_cleansed_Queens            -0.135177
neighbourhood_group_cleansed_Brooklyn          -0.113415
minimum_nights                                 -0.082053
neighbourhood_group_cleansed_Bronx             -0.073648
minimum_minimum_nights                         -0.073076
room_type_Shared room                          -0.050696
calculated_host_listings_count_shared_rooms    -0.046807
calculated_host_listings_count_private_rooms   -0.045742
neighbourhood_group_cleansed_Staten Island     -0.037704
number_of_reviews                              -0.033141
n_host_verifications                           -0.025404
minimum_nights_avg_ntm                         -0.010942
maximum_minimum_nights                         -0.007278
review_scores_value                            -0.004888
review_scores_checkin                          -0.003423
maximum_nights                                 -0.001023
review_scores_communication    

In [35]:
# delete colums without big influence on price
small_influence = corrs_sorted < 0.01

delete_cols = list(df.columns[small_influence])

# delete influence is NaN
delete_cols.extend(['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified'])
delete_cols

['host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_listings_count',
 'host_total_listings_count',
 'host_has_profile_pic',
 'host_identity_verified',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'price',
 'minimum_nights',
 'maximum_nights',
 'minimum_minimum_nights',
 'maximum_minimum_nights',
 'minimum_maximum_nights',
 'maximum_maximum_nights',
 'minimum_nights_avg_ntm',
 'host_is_superhost',
 'host_has_profile_pic',
 'host_identity_verified']

In [36]:
df.drop(delete_cols, axis=1, inplace=True)

# **Part2: Train Linear Model**

In [37]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_curve

In [38]:
y = df['label_price']
X = df.drop(columns=['label_price'], axis=1)
print(X.shape)
print(y.shape)

(28022, 31)
(28022,)


In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)
X_train.head()

Unnamed: 0,maximum_nights_avg_ntm,has_availability,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,...,n_host_verifications,neighbourhood_group_cleansed_Bronx,neighbourhood_group_cleansed_Brooklyn,neighbourhood_group_cleansed_Manhattan,neighbourhood_group_cleansed_Queens,neighbourhood_group_cleansed_Staten Island,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room
16860,28.0,True,18,46,50,50,5,3,3,5.0,...,6,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
17993,31.0,True,0,0,0,0,4,0,0,5.0,...,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
5214,1125.0,True,5,23,45,251,225,40,4,4.79,...,7,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2220,1125.0,False,0,0,0,0,10,0,0,4.9,...,4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
16547,1125.0,True,0,0,0,0,16,2,0,4.56,...,8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


###Part 2.1: Grid Search

In [40]:
from sklearn.svm import l1_min_c
C_set = l1_min_c(X_train, y_train, loss="log") * np.logspace(0, 7, 16)
max_iter_set = [i for i in range(500,2000,500)]
param_grid = dict(C=C_set, max_iter=max_iter_set)

In [None]:
print('Running Grid Search...')
lor_model = LogisticRegression()
grid = GridSearchCV(lor_model, param_grid, cv=5)
grid_search = grid.fit(X_train, y_train)
print("Done")

Running Grid Search...




In [None]:
C_best = grid_search.best_params_['C']
max_iter_best = grid_search.best_params_['max_iter']

###Part 2.3: Fit best model

In [None]:
logistic_model_best_params = LogisticRegresion(C = C_best, max_iter = max_iter_best)
logistic_model_best_params.fit(X_train, y_train)
y_logistic_model_best_params_pred = logistic_model_best_params.predict(X_test)

###Part 2.4: Evaluate

In [None]:
logistic_model_best_params_rmse = mean_squared_error(y_test, y_logistic_model_best_params_pred, squared = False)
logistic_model_best_params_r2 = r2_score(y_test, y_logistic_model_best_params_pred)

print('[RF] Root Mean Squared Error: {0}'.format(logistic_model_best_params_rmse))
print('[RF] R2: {0}'.format(logistic_model_best_params_r2))

#**Part 3: Try and compare single model and ensemble models**

###Part 3.1: Linear model

In [None]:
from sklearn.linear_model import LinearRegression
# 1. Create the model object below and assign to variable 'lr_model'
lr_model = LinearRegression()

# 2. Fit the model to the training data below
lr_model.fit(X_train, y_train)

# Make predictions on the test data
y_lr_pred = lr_model.predict(X_test)

lr_rmse = mean_squared_error(y_test, y_lr_pred, squared=False)
lr_r2 = r2_score(y_test, y_lr_pred)

print('[LR] Root Mean Squared Error: {0}'.format(lr_rmse))
print('[LR] R2: {0}'.format(lr_r2))

###Part 3.2: Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
 # Grid Search
print("Grid searching ...")
dt_param_grid = {'max_depth':[i for i in range (4,8,2)], 'min_samples_leaf':[i for i in range (30, 50, 5)]}
dt_model = DecisionTreeRegressor()
dt_grid = GridSearchCV(dt_model, param_grid, cv=5)
dt_grid_search = dt_grid.fit(X_train, y_train)
print("Finish grid search")

dt_best_max_depth = dt_grid_search.best_params_['max_depth']
dt_best_min_samples_leaf = dt_grid_search.best_params['min_samples_leaf']
dt_best_model = DecisionTreeRegressor(max_depth = dt_best_max_depth, min_samples_leaf = dt_best_min_samples_leaf)
print("Training dt_best_model")
dt_best_model.fit(X_train, y_train)
y_dt_pred = dt_best_model.predict(X_test)

dt_rmse = mean_squared_error(y_test, y_dt_pred, squared=False)
dt_r2 = r2_score(y_test, y_dt_pred)

print('[LR] Root Mean Squared Error: {0}'.format(dt_rmse))
print('[LR] R2: {0}'.format(dt_r2))

###Part 3.3: Stacking model

In [None]:
from sklearn.ensemble import StackingRegressor
estimators = [("DT", DecisionTreeRegressor()),
              ("RF", RandomForestRegressor()),
              ("GBDT", GradientBoostingRegressor()),
              ("LR", LinearRegression())
             ]
stacking_model = StackingRegressor(estimators = estimators, cv = 5, passthrough=False)
print('Performing Cross-Validation...')

rmse_avg = np.mean(-1*cross_val_score(stacking_model, X_train, y_train, cv = 3, scoring = 'neg_root_mean_squared_error'))

print('End')
print('average score: {}'.format(rmse_avg))

In [None]:
params = {
    "DT__max_depth": [2, 4, 8],
    "GBDT__n_estimators":[100,300]
}

stack_grid = GridSearchCV(stacking, params, cv=3, verbose=4, scoring='neg_root_mean_squared_error', refit=True, n_jobs=-1)
stack_grid.fit(X_train, y_train)
print(stack_grid.best_params_)
rf_grid.cv_results_['mean_test_score']


print("best parameters:", rf_grid.best_params_)

rmse_stack_cv = -1*rf_grid.best_score_
print("[STACK] RMSE for the best model is : {:.2f}".format(rmse_stack_cv))