In [14]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import GridSearchCV

In [2]:
# Import Data
df_train = pd.read_csv('./newDataset/train_reduce.csv')
df_test = pd.read_csv('./newDataset/test.csv')
df_label = pd.read_csv('./newDataset/adr_and_is_canceled.csv')
df_is_canceled = df_label['is_canceled']

train_sz = df_train.shape[0]
hotel_data = pd.concat([df_train, df_test])
hotel_data

Unnamed: 0,ID,hotel,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,required_car_parking_spaces,total_of_special_requests
0,0,Resort Hotel,342,2015,July,27,1,0,0,2,...,C,C,3,No Deposit,,,0,Transient,0,0
1,1,City Hotel,257,2015,July,27,1,0,2,1,...,A,A,1,No Deposit,6.0,,0,Transient,0,0
2,2,City Hotel,257,2015,July,27,1,0,2,2,...,A,A,0,No Deposit,6.0,,0,Transient,0,0
3,3,City Hotel,257,2015,July,27,1,0,2,2,...,A,A,0,No Deposit,6.0,,0,Transient,0,0
4,4,City Hotel,257,2015,July,27,1,0,2,2,...,A,A,0,No Deposit,6.0,,0,Transient,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27854,119385,Resort Hotel,108,2017,August,35,31,2,5,2,...,E,E,0,No Deposit,241.0,,0,Transient,0,1
27855,119386,Resort Hotel,194,2017,August,35,31,2,5,2,...,G,G,3,No Deposit,240.0,,0,Transient,1,1
27856,119387,Resort Hotel,17,2017,August,35,31,0,3,2,...,A,A,0,No Deposit,240.0,,0,Transient,0,2
27857,119388,Resort Hotel,191,2017,August,35,31,2,5,2,...,D,D,0,No Deposit,40.0,,0,Contract,0,0


In [3]:
# Creating new feature: `Weekday vs Weekend` 
pd.options.mode.chained_assignment = None
def week_function(feature1, feature2, data_source):
    data_source['weekend_or_weekday'] = 0
    for i in range(0, len(data_source)):
        if feature2.iloc[i] == 0 and feature1.iloc[i] > 0:
            hotel_data['weekend_or_weekday'].iloc[i] = 'stay_just_weekend'
        if feature2.iloc[i] > 0 and feature1.iloc[i] == 0:
            hotel_data['weekend_or_weekday'].iloc[i] = 'stay_just_weekday'
        if feature2.iloc[i] > 0 and feature1.iloc[i] > 0:
            hotel_data['weekend_or_weekday'].iloc[i] = 'stay_both_weekday_and_weekend'
        if feature2.iloc[i] == 0 and feature1.iloc[i] == 0:
            hotel_data['weekend_or_weekday'].iloc[i] = 'undefined_data'

            
week_function(hotel_data['stays_in_weekend_nights'],hotel_data['stays_in_week_nights'], hotel_data)

In [4]:
# Create new feature:`all_children` with merge children and baby features

hotel_data['all_children'] = hotel_data['children'] + hotel_data['babies']

In [5]:
# Fill missing data

hotel_data['children'] =  hotel_data['children'].fillna(0)
hotel_data['all_children'] =  hotel_data['all_children'].fillna(0)
hotel_data['country'] = hotel_data['country'].fillna(hotel_data['country'].mode().index[0])
hotel_data['agent']= hotel_data['agent'].fillna('0')
hotel_data=hotel_data.drop(['company'], axis =1)

In [6]:
# Change data structure

hotel_data['agent']= hotel_data['agent'].astype(int)
hotel_data['country']= hotel_data['country'].astype(str)

In [7]:
#Using Label Encoder method for categorical features

labelencoder = LabelEncoder()
hotel_data['hotel'] = labelencoder.fit_transform(hotel_data['hotel'])
hotel_data['arrival_date_month'] = labelencoder.fit_transform(hotel_data['arrival_date_month'])
hotel_data['meal'] = labelencoder.fit_transform(hotel_data['meal'])
hotel_data['country'] = labelencoder.fit_transform(hotel_data['country'])
hotel_data['market_segment']= labelencoder.fit_transform(hotel_data['market_segment'])
hotel_data['distribution_channel']=labelencoder.fit_transform(hotel_data['distribution_channel'])
hotel_data['is_repeated_guest'] = labelencoder.fit_transform(hotel_data['is_repeated_guest'])
hotel_data['reserved_room_type'] = labelencoder.fit_transform(hotel_data['reserved_room_type'])
hotel_data['assigned_room_type'] = labelencoder.fit_transform(hotel_data['assigned_room_type'])
hotel_data['deposit_type'] = labelencoder.fit_transform(hotel_data['deposit_type'])
hotel_data['agent'] = labelencoder.fit_transform(hotel_data['agent'])
hotel_data['customer_type'] = labelencoder.fit_transform(hotel_data['customer_type'])
hotel_data['weekend_or_weekday'] = labelencoder.fit_transform(hotel_data['weekend_or_weekday'])

In [8]:
#Dropping some features from data

hotel_data.drop(['children'], axis=1, inplace=True)

In [9]:
# Drop `baby` feature from data

hotel_data.drop(['babies'], axis=1, inplace=True)

In [10]:
# Split to train and test

X_train, y_train = hotel_data.iloc[0:train_sz, :], df_is_canceled
X_test = hotel_data.iloc[train_sz:, :]

In [11]:
# Implement standart scaler method

standardScalerX = StandardScaler()
X_train = standardScalerX.fit_transform(X_train)
X_test = standardScalerX.transform(X_test)

In [18]:
# grid search
# Finding parameters for RF model

model_rfc_gs = RandomForestClassifier()
parameters_rfc = {
    'n_estimators' : [40,60,80,100],
    'min_samples_split' : [2,4,6,8],
    'min_samples_leaf': [1,2,4,6]
}

grid_search_rfc = GridSearchCV(estimator=model_rfc_gs, param_grid=parameters_rfc, \
                           cv=5, scoring='f1', verbose=True, n_jobs=-1)
grid_search_rfc.fit(X_train, y_train)
grid_search_rfc.best_params_

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  3.4min finished


{'min_samples_leaf': 6, 'min_samples_split': 2, 'n_estimators': 40}

In [21]:
rf_model = RandomForestClassifier(min_samples_leaf = 6, min_samples_split=2,
                                  n_estimators = 40)

# fit the model
estimator= rf_model.fit(X_train, y_train)

In [22]:
y_pred_train = rf_model.predict(X_train)
y_pred_test = rf_model.predict(X_test)
print(accuracy_score(y_train, y_pred_train))

0.9287891534015797


In [23]:
df_pred = pd.DataFrame(y_pred_test, columns = ['is_canceled'])
df_pred.to_csv("is_canceled_pred.csv", index=False)

In [24]:
sum(y_pred_test == True) / y_pred_test.shape[0]

0.3086255788075667

In [25]:
sum(y_pred_train == True) / y_pred_train.shape[0]

0.3244802307414974