I have not yet run this neural network on a dataset that has been test/train split.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#Import to dataframe
df = pd.read_csv('hotel_bookings.csv')

#There were two hotels in the data set, we will examine only the resort hotel, for increased accuracy.
df = df[df['hotel'] == 'Resort Hotel']

#The dataset is quite large, and I am getting memory errors. I will split it randomnly.
 #df = df.sample(frac=0.05, random_state=13)

#Dropping some data:
df = df.dropna(subset=['country'])

#Filling in some missing data:
df['agent'] = df['agent'].fillna(536.0)
df['company'] = df['company'].fillna(5.0)

#Cleaning Data:

#Getting Cancellation Frequencies for various variables:
df['customer_cancel_score'] = np.round(np.where(df['is_repeated_guest'] > 0, df['previous_bookings_not_canceled']/(df['previous_bookings_not_canceled'] + df['previous_cancellations']), 0), 1)

#Much of the data in this set is categorical, I will make useful numerical data out of it by calculating cancellation
#fre3quencies for each value of these categorical variables.
agent_df = pd.DataFrame(df[['is_canceled','agent']].groupby('agent', as_index=False).agg('mean'))
company_df = pd.DataFrame(df[['is_canceled','company']].groupby('company', as_index=False).agg('mean'))
country_df = pd.DataFrame(df[['is_canceled','country']].groupby('country', as_index=False).agg('mean'))

agent_df = agent_df.rename(index=str, columns={"is_canceled": "agent_cancel_score"})
company_df = company_df.rename(index=str, columns={"is_canceled": "company_cancel_score"})
country_df = country_df.rename(index=str, columns={"is_canceled": "country_cancel_score"})

df = pd.merge(df,agent_df,how='left',on=['agent'])
df = pd.merge(df,company_df,how='left',on=['company'])
df = pd.merge(df,country_df,how='left',on=['country'])

distribution_channel_df = pd.DataFrame(df[['is_canceled','distribution_channel']].groupby('distribution_channel', as_index=False).agg('mean'))
distribution_channel_df = distribution_channel_df.rename(index=str, columns={"is_canceled": "distribution_channel_score"})
df = pd.merge(df,distribution_channel_df,how='left',on=['distribution_channel'])

market_segment_df = pd.DataFrame(df[['is_canceled','market_segment']].groupby('market_segment', as_index=False).agg('mean'))
market_segment_df = market_segment_df.rename(index=str, columns={"is_canceled": "market_segment_score"})
df = pd.merge(df,market_segment_df,how='left',on=['market_segment'])  

meal_df = pd.DataFrame(df[['is_canceled','meal']].groupby('meal', as_index=False).agg('mean'))
meal_df = meal_df.rename(index=str, columns={"is_canceled": "meal_score"})
df = pd.merge(df,meal_df,how='left',on=['meal'])

deposit_type_df = pd.DataFrame(df[['is_canceled','deposit_type']].groupby('deposit_type', as_index=False).agg('mean'))
deposit_type_df = deposit_type_df.rename(index=str, columns={"is_canceled": "deposit_score"})
df = pd.merge(df,deposit_type_df,how='left',on=['deposit_type'])

arrival_date_week_number_df = pd.DataFrame(df[['is_canceled','arrival_date_week_number']].groupby('arrival_date_week_number', as_index=False).agg('mean'))
arrival_date_week_number_df = arrival_date_week_number_df.rename(index=str, columns={"is_canceled": "week_number_score"})
df = pd.merge(df,arrival_date_week_number_df,how='left',on=['arrival_date_week_number']) 

#Addressing some outliers:
df.lead_time = np.sqrt(np.sqrt(df.lead_time))
df = df[df['required_car_parking_spaces']<2]

df = df.reset_index()

#Purging unneeded columns
df_features = df[['is_canceled','agent_cancel_score','deposit_score','country_cancel_score','market_segment_score','lead_time','week_number_score','required_car_parking_spaces']]

In [3]:
df_features.dtypes

is_canceled                      int64
agent_cancel_score             float64
deposit_score                  float64
country_cancel_score           float64
market_segment_score           float64
lead_time                      float64
week_number_score              float64
required_car_parking_spaces      int64
dtype: object

In [4]:
# Loading the data 
X = df_features
Y = df_features.is_canceled

In [5]:
# Alright! We've done our prep, let's build the model.
# Neural networks are hugely computationally intensive.
# This may take several minutes to run.

# Import the model.
from sklearn.neural_network import MLPClassifier

# Establish and fit the model, with a single, 1000 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(1000,))
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(1000,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [6]:
mlp.score(X, Y)

1.0

In [7]:
Y.value_counts()/len(Y)

0    0.720052
1    0.279948
Name: is_canceled, dtype: float64

In [8]:
from sklearn.model_selection import cross_val_score
cross_val_score(mlp, X, Y, cv=5)

array([1., 1., 1., 1., 1.])