In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import seaborn as sns
import time
from sklearn import metrics


In [None]:
# feature engineering
# Train and test are stacked so every encoding below is applied to both with
# the same mapping; the "train" flag is used to split them apart again later.
train_df = pd.read_csv('train.csv')
train_df["train"] = 1
test_df = pd.read_csv('test.csv')
test_df["train"] = 0
df = pd.concat([train_df,test_df],sort = False)

# numeric encoding
le = preprocessing.LabelEncoder()
data = df.copy()
# encoding categorical to numeric (the encoder is re-fitted for every column)
for col in ["Host_response_time", "Host_is_superhost",
            "Host_has_profile_pic", "Host_identity_verified"]:
    data[col] = le.fit_transform(df[col])
# NOTE(review): LabelEncoder numbers classes in sorted order; if Month holds
# month names the codes are alphabetical, not calendar order -- confirm.
data["Month"] = le.fit_transform(df["Month"]) + 1
# Strip everything that is not a digit or letter (currency sign, thousands
# separator, decimal point) and divide by 100 to restore the decimal point.
# NOTE(review): assumes every price is written with exactly two decimals.
data["Price"] = pd.to_numeric(data["Price"].str.replace(r'[^\dA-Za-z]', '', regex=True)) / 100

# one-hot encoding
dummy_df = pd.get_dummies(df[["Neighbourhood","Room_type"]],columns=["Neighbourhood","Room_type"])
data = pd.concat([data,dummy_df],axis = 1)
data = data.drop(columns=["Neighbourhood","Room_type","Property_type"])

bath_text = data["Bathrooms_text"]
# create column with number of bathrooms: "half" becomes 0.5, then letters,
# commas and hyphens are removed so only the number is left
data["Bathroom_number"] = pd.to_numeric(
    bath_text.str.lower()
    .str.replace("half", "0.5", regex=False)
    .str.replace(r"[a-zA-Z,-]", '', regex=True)
)
# column indicating if this room has private bath (1.0 / 0.0).
# The match is case-insensitive so "Private half-bath" is counted too, and
# rows with no bathroom text stay NaN so the imputer below fills them.
has_private_bath = bath_text.str.contains("private", case=False, na=False).astype(float)
data["Have_Private_Bath"] = has_private_bath.where(bath_text.notna())
data = data.drop(columns=["Bathrooms_text"])

# replace true/false with 1/0; any value other than "t"/"f" becomes NaN and
# is filled by the imputer below
data["Instant_bookable"] = pd.to_numeric(data["Instant_bookable"].map({"t": 1, "f": 0}))

# Get back the original train and test
train = data.loc[data["train"] == 1].drop(columns = ["id","train"])
test = data.loc[data["train"] == 0].drop(columns = ["id","train"])

# impute missing data with mean; the means are learned from the training
# rows only and then applied to both sets
train_data = train.drop(columns = ["Decision"])
test_data = test.drop(columns=["Decision"])
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(train_data)
# The imputer returns plain arrays, so the rebuilt frames have positional
# (integer) column labels and a fresh 0..n-1 index.
train_data = pd.DataFrame(imp_mean.transform(train_data))
test_data = pd.DataFrame(imp_mean.transform(test_data))
# make sure all columns are imputed
print(train_data.isna().any().sum()," columns of train has NaN")
print(test_data.isna().any().sum()," columns of test has NaN")



In [None]:
# correlations among predictors
# set figure size first: it only affects figures created afterwards, so it
# has to run before the heatmap is drawn
sns.set(rc = {'figure.figsize':(5,5)})
# train_df is the raw frame and still holds text columns; correlate only the
# numeric/boolean ones (recent pandas raises on strings instead of skipping them)
predictors = train_df.drop(columns = ["id","Decision","train"])
corr = predictors.select_dtypes(include=["number", "bool"]).corr()
# create heatmap on a fixed -1..1 colour scale centred on 0
ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0, cmap="GnBu", square=True)
# tilt the x tick labels so long column names do not overlap
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);




In [None]:
# Hold out 20% of the labelled (imputed) training rows for evaluation.
# The fixed random_state keeps the split identical between runs.
X, y = train_data, np.array(train["Decision"])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=671,
)

In [None]:

# Standardise the features: the scaler is fitted on the training split only
# and the same transformation is then applied to the held-out split.
scaler = StandardScaler()
scale_train = scaler.fit_transform(X_train)
scale_test = scaler.transform(X_test)

# SVM: timed grid search over the regularisation strength C and the kernel
start = time.time()
param_grid = {
    'C': [0.1, 1, 5, 10, 50, 100],
    'kernel': ['rbf', 'poly', 'sigmoid'],
}
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, refit=True, verbose=2)
grid.fit(scale_train, y_train)
end = time.time()
print("running time is",end-start,"s")

In [None]:
# this is for visualization included in report
# hyperparameter only includes C; one curve per fixed kernel
#
# The searches below use their own variable (`kernel_grid`) on purpose: the
# following cell reports the best SVM through `grid`, so rebinding `grid`
# here would make that cell report on the last kernel plotted instead.
c_values = [0.1,1,5,10,50,100]
fig, ax = plt.subplots()
for kernel in ("rbf", "poly"):
    # refit is not needed: only the cross-validation scores are plotted
    kernel_grid = GridSearchCV(SVC(kernel = kernel),{'C': c_values},refit=False,verbose=2)
    kernel_grid.fit(scale_train,y_train)
    # mean_test_score is the search's score (accuracy for a classifier with
    # the default scoring), i.e. higher is better -- it is not a loss
    ax.plot(c_values, kernel_grid.cv_results_["mean_test_score"], marker="o", label=kernel)
# C spans three orders of magnitude, so use a log axis to keep points apart
ax.set_xscale("log")
ax.set_xlabel("C")
ax.set_ylabel("Mean CV accuracy")
ax.set_title("SVM CV accuracy by kernel")
ax.legend(title="kernel");

In [None]:
# CV results for best svm: report the winning parameters and CV score of the
# search currently bound to `grid`, then score it on the held-out split.
for summary in (grid.best_params_, grid.best_score_):
    print(summary)
best_prediction = grid.predict(scale_test)
accuracy_score(y_true=y_test, y_pred=best_prediction)

In [None]:
# ROC for Best SVM
# NOTE(review): C=10 / rbf are hard-coded, presumably copied from an earlier
# grid-search run -- re-check against the printed best_params_.
best_svm = SVC(C = 10,kernel='rbf').fit(scale_train,y_train)
# metrics.plot_roc_curve was removed from scikit-learn; prefer its
# replacement and fall back only on old versions that lack it.
if hasattr(metrics.RocCurveDisplay, "from_estimator"):
    metrics.RocCurveDisplay.from_estimator(best_svm, scale_test, y_test)
else:
    metrics.plot_roc_curve(best_svm, scale_test, y_test)

In [None]:
# xgboost -- first tuning pass: number of trees and tree depth.
# `start` is read again by the later cell that prints the running time.
start = time.time()
param_grid = {
    'n_estimators': [20, 50, 100, 150, 200, 250, 300],
    'max_depth': range(1, 30, 2),
}
xg_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric="logloss",
    use_label_encoder=False,
)
grid = GridSearchCV(estimator=xg_clf, param_grid=param_grid, refit=True, verbose=2)
grid.fit(X_train, y_train)

In [None]:
# winning values for the two parameters searched above
tuned_params = grid.best_params_
print(tuned_params)

In [None]:
# set results above as fixed values
# keep tuning the other 3 parameters
# NOTE(review): n_estimators=150 / max_depth=7 are hard-coded, presumably
# copied from the previous search's printed best_params_ -- re-check.
xg_clf = xgb.XGBClassifier(objective ='binary:logistic',
                           eval_metric = "logloss",
                           n_estimators = 150,
                           max_depth = 7,
                           use_label_encoder=False)
# Candidate values are rounded because np.arange accumulates float error
# (e.g. 0.060000000000000005), which would otherwise show up in best_params_.
# The L2 penalty is tuned through the estimator's own `reg_lambda` argument,
# the same name used where the final model is built, rather than the
# booster-level alias `lambda`.
param_grid = {'learning_rate' : np.round(np.arange(0.01,0.2,0.01), 2),
              'min_child_weight': range(1,10,1),
              'reg_lambda' : np.round(np.arange(0.1,1.1,0.1), 1)
             }
grid = GridSearchCV(xg_clf,param_grid,refit=True,verbose=2)
grid.fit(X_train,y_train)
# `start` is not reset in this cell, so the elapsed time is measured from
# wherever it was last set
end = time.time()
print("running time is",end-start,"s")

In [None]:
# winning parameters and mean CV score of the search currently bound to `grid`
for summary in (grid.best_params_, grid.best_score_):
    print(summary)

In [None]:
# best estimate xgboost
# NOTE(review): the hyper-parameters are hard-coded, presumably copied from
# the grid-search output above -- re-check if the searches are re-run.
best_xgb = xgb.XGBClassifier(objective ='binary:logistic',
                             eval_metric = "logloss",n_estimators = 150,
                           max_depth = 7,reg_lambda = 0.2,learning_rate = 0.17,min_child_weight = 1,
                           use_label_encoder=False)
best_xgb.fit(X_train,y_train)
preds = best_xgb.predict(X_test)
# print explicitly: a bare expression in the middle of a cell is not displayed
print("hold-out accuracy:", accuracy_score(y_test,preds))
# plot roc
# metrics.plot_roc_curve was removed from scikit-learn; prefer its
# replacement and fall back only on old versions that lack it.
if hasattr(metrics.RocCurveDisplay, "from_estimator"):
    metrics.RocCurveDisplay.from_estimator(best_xgb, X_test, y_test)
else:
    metrics.plot_roc_curve(best_xgb, X_test, y_test)

In [None]:
# this is for visualization included in report
# fix others; vary n_estimators
n_estimators_values = [20,50,100,150,200,250,300]
param_grid = {'n_estimators' : n_estimators_values}
grid = GridSearchCV(xgb.XGBClassifier(objective ='binary:logistic',
                             eval_metric = "logloss",
                           max_depth = 7,reg_lambda = 0.2,learning_rate = 0.17,min_child_weight = 1,
                           use_label_encoder=False),param_grid,refit=True,verbose=2)
grid.fit(X_train,y_train)

# mean_test_score is the search's score (accuracy for a classifier with the
# default scoring), i.e. higher is better -- it is not a loss
mean_cv_accuracy = grid.cv_results_["mean_test_score"]
fig, ax = plt.subplots()
ax.plot(n_estimators_values, mean_cv_accuracy, marker="o")
# the x axis is the parameter being varied in this cell
ax.set_xlabel("n_estimators")
ax.set_ylabel("Mean CV accuracy")
ax.set_title("CV accuracy vs. n_estimators");

In [None]:
# this is for visualization included in report
# fix others; vary max_depth
max_depth_values = list(range(1,30,2))
param_grid = {'max_depth': max_depth_values}
grid = GridSearchCV(xgb.XGBClassifier(objective ='binary:logistic',
                             eval_metric = "logloss",n_estimators = 150,
                           reg_lambda = 0.2,learning_rate = 0.17,min_child_weight = 1,
                           use_label_encoder=False),param_grid,refit=True,verbose=2)
grid.fit(X_train,y_train)

# mean_test_score is the search's score (accuracy for a classifier with the
# default scoring), i.e. higher is better -- it is not a loss
mean_cv_accuracy = grid.cv_results_["mean_test_score"]
fig, ax = plt.subplots()
ax.plot(max_depth_values, mean_cv_accuracy, marker="o")
# the x axis is the parameter being varied in this cell
ax.set_xlabel("max_depth")
ax.set_ylabel("Mean CV accuracy")
ax.set_title("CV accuracy vs. max_depth");

In [None]:

# random forest: timed grid search over tree depth, forest size and the
# number of features considered per split (seeded for repeatable results)
start = time.time()

param_grid = {
    'max_depth': [10, 20, 30, 40, 50, None],
    'n_estimators': [10, 50, 100, 150, 200],
    'max_features': ['log2', 'sqrt', None],
}
RF_clf = RandomForestClassifier(random_state=671)
grid = GridSearchCV(estimator=RF_clf, param_grid=param_grid, refit=True, verbose=2)
grid.fit(X_train, y_train)

end = time.time()
print("running time is",end-start,"s")

In [None]:
# winning parameters and mean CV score of the search currently bound to `grid`
print(grid.best_params_)
print(grid.best_score_)
pred = grid.predict(X_test)
# print explicitly: a bare expression in the middle of a cell is not
# displayed; arguments are given in (y_true, y_pred) order
print("hold-out accuracy:", accuracy_score(y_test, pred))
# get best random forest
# NOTE(review): the hyper-parameters are hard-coded, presumably copied from
# the printed best_params_ of an earlier run -- re-check.
best_rf = RandomForestClassifier(random_state=671,max_depth = 20,max_features="sqrt",n_estimators = 150).fit(X_train,y_train)
# plot roc
# metrics.plot_roc_curve was removed from scikit-learn; prefer its
# replacement and fall back only on old versions that lack it.
if hasattr(metrics.RocCurveDisplay, "from_estimator"):
    metrics.RocCurveDisplay.from_estimator(best_rf, X_test, y_test)
else:
    metrics.plot_roc_curve(best_rf, X_test, y_test)

