# 1. IMPORTS

In [None]:
# Setup plotting
import matplotlib.pyplot as plt

# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases dropped the old names, so use whichever name this
# installation provides (falling back to the default style if neither exists).
_WHITEGRID_STYLES = ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
plt.style.use(next(
    (style for style in _WHITEGRID_STYLES if style in plt.style.available),
    'default',
))

# Set Matplotlib defaults
plt.rc('figure', autolayout=True)
plt.rc('axes', labelweight='bold', labelsize='large',
       titleweight='bold', titlesize=18, titlepad=10)
plt.rc('animation', html='html5')

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

import seaborn as sns
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px


# to ignore warnings

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Function for comparing different approaches
def score_dataset(X, y):
    """Fit a random forest on a stratified 75/25 split and return the validation MAE.

    Parameters
    ----------
    X : feature table accepted by scikit-learn estimators.
    y : target labels; also used to stratify the split.

    Returns
    -------
    float
        ``mean_absolute_error`` between the validation labels and the
        predicted labels (for 0/1 labels this equals the misclassification
        rate).
    """
    # Both the split and the model are seeded so that two calls with different
    # feature sets are scored on the same rows and differ only because of the
    # features, not because of a different random partition.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, stratify=y, train_size=0.75, random_state=0)
    model = RandomForestClassifier(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)


# Palette of hex colours; plotting helpers pick an entry from it at random.
colorarr = [
    '#0592D0', '#Cd7f32', '#E97451', '#Bdb76b', '#954535', '#C2b280',
    '#808000', '#C2b280', '#E4d008', '#9acd32', '#Eedc82', '#E4d96f',
    '#32cd32', '#39ff14', '#00ff7f', '#008080', '#36454f', '#F88379',
    '#Ff4500', '#Ffb347', '#A94064', '#E75480', '#Ffb6c1', '#E5e4e2',
    '#Faf0e6', '#8c92ac', '#Dbd7d2', '#A7a6ba', '#B38b6d',
]

# 2. DATA

## a) Load data

In [None]:
# Read the raw bookings table.
hotel = pd.read_csv('../input/hotels/hotel.csv')

# Work on a copy so `hotel` stays untouched, and split off the target column.
X = hotel.copy()
y = X.pop('is_canceled')

# Replace month names by their calendar number (January -> 1 ... December -> 12).
month_names = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
month_to_number = {month: number
                   for number, month in enumerate(month_names, start=1)}
X['arrival_date_month'] = X['arrival_date_month'].map(month_to_number)

In [None]:
# Display the target column as stored in the raw `hotel` frame.
hotel['is_canceled']

## a) Descriptive Statistics

In [None]:
# Summary statistics of the raw `hotel` frame.
hotel.describe()

Here we do further investigation into the distribution of the target feature as well as other meaningful relationships amongst the features in the data.

## b) Data Cleaning

### i. Missing Values

We check to see if there are any missing values that we need to deal with in the data.

In [None]:
# Dimensions of the feature frame as (num_rows, num_columns)
print(X.shape)

# Count the missing entries per column and show only the columns that have any
missing_val_count_by_column = X.isnull().sum()
has_missing = missing_val_count_by_column > 0
print(missing_val_count_by_column[has_missing])

We see that there are columns with missing values, namely 'company', 'children', 'country', and 'agent'. We will drop these columns for now, but will consider other, more efficient methods of dealing with the missing values for 'children', 'country', and 'agent', since they have a small percentage of missing values and could have high mutual information scores.

In [None]:
# Names of the columns that contain at least one missing value
cols_with_missing = X.columns[X.isnull().any().values].tolist()

# Build a new frame without those columns; X itself is left as it was
reduced_X = X.drop(columns=cols_with_missing)

# Dimensions of the reduced frame as (num_rows, num_columns)
print(reduced_X.shape)

# Re-run the missing-value count on the reduced frame
missing_val_count_by_column = reduced_X.isnull().sum()
print(missing_val_count_by_column[missing_val_count_by_column > 0])

After dropping the columns there are no missing values.

### ii. Categorical & Numerical Features

In [None]:
# Select categorical columns with relatively low cardinality.
# (The two lists used to be swapped: object columns were sent to the numeric
# imputer and int/float columns were one-hot encoded.)
features_cat = [cname for cname in X.columns
                if X[cname].dtype == "object" and X[cname].nunique() < 10]

# Select numerical columns
features_num = [cname for cname in X.columns
                if X[cname].dtype in ['int64', 'float64']]


# Preprocessing for numerical data: fill missing values with a constant
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data: fill with the most frequent value, then
# one-hot encode (categories unseen during fit are encoded as all zeros)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data.
# Columns that appear in neither list are not passed to any transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, features_num),
        ('cat', categorical_transformer, features_cat)
    ])

## c) Exploratory Data Analysis (EDA)

We use mutual information to obtain the feature importance of each of the columns. The columns with a higher mutual information score will remain in the data as they have more predictive ability, the others will be dropped.

In [None]:
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression


X = reduced_X.copy()

def make_mi_scores(X, y, discrete_features, random_state=0):
    """Rank features by their mutual information with a class-label target.

    Parameters
    ----------
    X : DataFrame of numeric (already label-encoded) features.
    y : class labels.
    discrete_features : passed through to scikit-learn; marks which columns
        of ``X`` are to be treated as discrete.
    random_state : seed forwarded to the estimator so that repeated calls
        return the same scores.

    Returns
    -------
    pandas.Series
        Scores named "MI Scores", indexed by column name and sorted from
        highest to lowest.
    """
    # The target is a class label, so the classification estimator is the
    # appropriate one (the regression variant assumes a continuous target).
    mi_scores = mutual_info_classif(
        X, y, discrete_features=discrete_features, random_state=random_state)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

# Integer-encode every object column in place so all features are numeric
for colname in X.select_dtypes("object").columns:
    X[colname] = X[colname].factorize()[0]


# Boolean mask over the columns: True where the column dtype compares equal to int
discrete_features = X.dtypes == int

# Mutual information of each feature with the target, highest first
mi_scores = make_mi_scores(X, y, discrete_features)
mi_scores # show a few features with their MI scores



def plot_mi_scores(scores):
    """Draw a horizontal bar chart of the scores on the current figure.

    ``scores`` is a Series indexed by feature name; bars are ordered from the
    smallest score to the largest and labelled with the index values.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")


# Open a dedicated 8x5 inch figure, then draw the MI ranking on it
plt.figure(figsize=(8, 5), dpi=100)
plot_mi_scores(mi_scores)

From the above plot we can see which features are more important for our model. We can see that 'reservation_status' has a mutual information score greater than 0.5. We might want to investigate this feature to see whether it is highly correlated with the target feature, in which case we will drop it because it constitutes "data leakage".

We now plot the relationships of the target feature with other features. 

In [None]:
## Overall distributions of selected features and of the target

plt.rcParams['figure.figsize'] = (15, 15)
plt.rcParams['figure.dpi'] = 150

# One entry per panel: (values, title, colour, draw_as_density).
# The feature columns were label-encoded earlier, so the x axis shows integer
# codes rather than the original category names.
# The previous titles/comments (nitrogen, humidity, rainfall, ...) belonged to
# a different dataset and did not describe what was plotted.
distribution_panels = [
    (X['reservation_status'], 'Counts of reservation_status', 'greenyellow', False),
    (X['reservation_status_date'], 'Counts of reservation_status_date', 'firebrick', False),
    (X['deposit_type'], 'Counts of deposit_type', 'orange', False),
    (X['reservation_status'], 'Distribution of reservation_status', 'lightcoral', True),
    (X['reservation_status_date'], 'Distribution of reservation_status_date', 'olivedrab', True),
    (X['deposit_type'], 'Distribution of deposit_type', 'crimson', True),
    (y, 'Distribution of is_canceled', 'purple', True),
]

for position, (values, title, color, as_density) in enumerate(distribution_panels, start=1):
    plt.subplot(4, 2, position)
    if as_density:
        # histplot with a KDE overlay on a density scale is the supported
        # replacement for the deprecated sns.distplot
        sns.histplot(values, color=color, kde=True, stat='density')
    else:
        sns.histplot(values, color=color)
    plt.title(title, fontsize=12)
    plt.tight_layout()
    plt.grid()

In [None]:
# Features whose MI score exceeds 0.5
print(mi_scores[mi_scores > 0.5])

# Remove 'reservation_status' from the features. errors='ignore' keeps this
# cell safe to re-run: without it a second execution raises KeyError because
# the column is already gone.
X = X.drop(columns=['reservation_status'], errors='ignore')

In [None]:
# Just create an interactive chart function to make our process easy :AND: I am just lazy....
import random

def intractive_plot(df, feature, name):

    """
    Show the per-class mean of one feature as two side-by-side bar charts.

    The rows of `df` are grouped by 'is_canceled' and `feature` is averaged
    within each group. The left chart shows the (up to) 10 highest means, the
    right chart the (up to) 10 lowest.

    ATTRIBUTE:
    df: original DataFrame; must contain 'is_canceled' and `feature`
    feature: name of the numeric column to average
    name : label used for the figure title and the trace names
    """

    # Only `feature` is ever read from the aggregate, so average just that
    # column. Averaging every column (pivot_table without `values`) fails on
    # recent pandas versions when the frame contains non-numeric columns.
    feature_means = (
        df.groupby('is_canceled')[feature]
        .mean()
        .sort_values(ascending=False)
    )

    # Each slice is re-sorted ascending so bars grow along the y axis
    highest = feature_means[:10].sort_values()
    lowest = feature_means[-10:].sort_values()

    fig = make_subplots(rows = 1, cols = 2)

    fig.add_trace(
        go.Bar(x = highest,
               y = highest.index,
               name = 'Most {} Needed'.format(name),
               marker_color = random.choice(colorarr),
               orientation = 'h',
               text = highest
              ),
        row = 1, col = 1
    )
    fig.add_trace(
        go.Bar(x = lowest,
               y = lowest.index,
               name = 'Least {} Needed'.format(name),
               marker_color = random.choice(colorarr),
               orientation = 'h',
               # label each bar with its own value (this used to reuse the
               # values of the left-hand chart)
               text = lowest
              ),
        row = 1, col = 2
    )

    fig.update_traces(texttemplate = '%{text}', textposition = 'inside')
    fig.update_layout(title_text = name,
                      plot_bgcolor = 'white',
                      font_size = 12,
                      font_color = 'black',
                      height = 500
                     )

    fig.update_xaxes(showgrid = False)
    fig.update_yaxes(showgrid = False)
    fig.show()

In [None]:
# List the column names of the raw `hotel` frame.
hotel.columns

In [None]:
# Per-class mean of 'lead_time'; the third argument ('hotel') becomes the
# figure title and is inserted into the trace names.
intractive_plot(hotel, 'lead_time', 'hotel')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15, 9), dpi=100)

# Correlation is only defined for numeric data. Selecting those columns
# explicitly keeps this working on pandas versions where DataFrame.corr()
# raises on string columns instead of silently skipping them.
numeric_hotel = hotel.select_dtypes(include=['number', 'bool'])

# Draw on the axes created above rather than on whichever axes is current
sns.heatmap(numeric_hotel.corr(), annot = True, cmap = 'Blues', ax = ax)
ax.set(xlabel='features')
ax.set(ylabel='features')
ax.set_title('Correlation between different features', fontsize = 15, c='black')
plt.show()

## d) Feature Engineering

In [None]:
# Select categorical columns with relatively low cardinality.
# (The two lists used to be swapped: object columns were sent to the numeric
# imputer and int/float columns were one-hot encoded.)
features_cat = [cname for cname in X.columns
                if X[cname].dtype == "object" and X[cname].nunique() < 10]

# Select numerical columns
features_num = [cname for cname in X.columns
                if X[cname].dtype in ['int64', 'float64']]


# Preprocessing for numerical data: fill missing values with a constant
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data: fill with the most frequent value, then
# one-hot encode (categories unseen during fit are encoded as all zeros)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data.
# Columns that appear in neither list are not passed to any transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, features_num),
        ('cat', categorical_transformer, features_cat)
    ])

## e) Data Splitting

In [None]:
# stratify - make sure classes are evenly represented across splits
# random_state - fix the partition so every model below is trained and
# validated on the same rows each time the notebook is re-run
X_train, X_valid, y_train, y_valid = \
    train_test_split(X, y, stratify=y, train_size=0.75, random_state=0)

# X_train = preprocessor.fit_transform(X_train)
# X_valid = preprocessor.transform(X_valid)

# Number of feature columns, in the list form Keras expects for input_shape
input_shape = [X_train.shape[1]]

# 3. MODELS

## a) Simple Base Model

### i. Define Model

In [None]:
# Seed the forest so repeated runs build the same trees and the reported
# score is reproducible.
rfc_model = RandomForestClassifier(n_estimators=100, random_state=0)

rfc_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', rfc_model)]
                       )

# Preprocessing of training data, fit model
rfc_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
rfc_preds = rfc_pipeline.predict(X_valid)

# Evaluate the model (MAE between true and predicted labels)
rfc_score = mean_absolute_error(y_valid, rfc_preds)
print("MAE RFC: ", rfc_score)

In [None]:
# Show the random forest's predicted labels for the validation split.
rfc_preds

### ii. Evaluate Model

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the whole pipeline, scored by accuracy
scores = cross_val_score(rfc_pipeline, X, y, cv=5, scoring='accuracy')

# The values are accuracies (scoring='accuracy'), so label them as such
print("Accuracy scores:\n", scores)
print()
print("Average accuracy score (across experiments):")
print(scores.mean())

## b) Model Selection (Classification & Regression Models)

### i. Define Model

In [None]:
from xgboost import XGBClassifier

# Master Parameters:
n_splits = 2 # Cross Validation Splits
scoring = 'accuracy' # Model Selection during Cross-Validation
rstate = 27 # Random State used

# Boosting rounds
num_rounds = 100

xgb_model = XGBClassifier(n_estimators = num_rounds,
                      objective= 'binary:logistic',
                      learning_rate=0.01,
                      random_state=rstate)

xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', xgb_model)
                              ])

# Preprocessing of training data, fit model
xgb_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
xgb_preds = xgb_pipeline.predict(X_valid)

# Evaluate the model (MAE between true and predicted labels).
# The label now names the XGBoost model; it previously said "RFC".
xgb_score = mean_absolute_error(y_valid, xgb_preds)
print("MAE XGB:\n", xgb_score)

In [None]:
# Show the XGBoost pipeline's predicted labels for the validation split.
xgb_preds

### ii. Evaluate Model

In [None]:
# 5-fold cross-validation of the XGBoost pipeline. The metric comes from the
# `scoring` variable and the values are reported exactly as returned.
scores = cross_val_score(xgb_pipeline, X, y, scoring=scoring, cv=5)

print("Accuracy scores:\n", scores, end="\n\n")

print("Average accuracy score (across experiments):", scores.mean(), sep="\n")

## c) Neural Network Model

### i. Define Model

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

# Two hidden blocks (Dense -> BatchNorm -> Dropout) on top of a normalised
# input, ending in a single sigmoid unit that outputs a probability.
model = keras.Sequential([
    layers.BatchNormalization(input_shape=input_shape),

    layers.Dense(256, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(256, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(1, activation='sigmoid'),
])


# Binary cross-entropy is the matching loss for a sigmoid output on a binary
# target. A single 'binary_accuracy' metric is also what the evaluation cell
# reads from the history and unpacks from model.evaluate().
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)


# Stop once validation loss has not improved by at least 0.001 for 5 epochs
# and roll back to the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    patience=5,
    min_delta=0.001,
    restore_best_weights=True,
)

history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    batch_size=512,
    epochs=200,
    callbacks=[early_stopping],
)



### ii. Evaluate Model

In [None]:
history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot(title="Loss")

# The accuracy column is named after the metric string given to compile(),
# so look it up instead of hard-coding one spelling.
accuracy_key = next(
    key for key in ('binary_accuracy', 'accuracy', 'acc')
    if key in history_df.columns
)
history_df.loc[:, [accuracy_key, 'val_' + accuracy_key]].plot(title="Accuracy")


# return_dict=True gives the results by name, so this works no matter how
# many metrics the model was compiled with (positional unpacking into two
# names fails as soon as there is more than one metric).
final_metrics = model.evaluate(X_valid, y_valid, verbose=0, return_dict=True)
final_loss, final_acc = final_metrics['loss'], final_metrics[accuracy_key]
print("Final loss: {0:.4f}, final accuracy: {1:.4f}".format(final_loss, final_acc))

## d) Chosen Model (Best Model)

### i. Define Model

In [None]:
# Keep a handle on the network under the "chosen model" name and compute its
# outputs for the validation split.
chosen_model = model
dl_preds = chosen_model.predict(X_valid)
dl_preds

### ii. Evaluate Model

# D. Results

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

def Metrics_calculator(y_pred, y_test):
    """Return (accuracy, precision, recall, f1) for binary predictions.

    Parameters
    ----------
    y_pred : predicted labels.
    y_test : true labels.

    Notes
    -----
    Assumes a binary 0/1 target with 1 as the positive class - confirm
    against the data. Precision, recall and F1 are computed for that positive
    class. The earlier ``average='micro'`` setting made all three identical
    to accuracy, and ``pos_label='positive'`` does not occur in 0/1 labels.
    """
    # scikit-learn metrics take (y_true, y_pred) in that order; with the
    # arguments reversed, precision and recall swap places.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, pos_label=1)
    recall = recall_score(y_test, y_pred, pos_label=1)
    f1 = f1_score(y_test, y_pred, pos_label=1)

    return accuracy, precision, recall, f1

In [None]:
# Score both classifiers against the same validation labels.
(
    (rfc_accuracy, rfc_precision, rfc_recall, rfc_f1),
    (xgb_accuracy, xgb_precision, xgb_recall, xgb_f1),
) = [Metrics_calculator(preds, y_valid) for preds in (rfc_preds, xgb_preds)]


In [None]:
# dl_accuracy, dl_precision, dl_recall, dl_f1 = Metrics_calculator(dl_preds, y_valid)
from sklearn.metrics import classification_report

# print(classification_report(y_valid, dl_preds, target_names=X_valid.columns))

In [None]:
# Report the four random forest metrics, one per line.
rfc_report = (
    ('Accuracy', rfc_accuracy),
    ('Precision', rfc_precision),
    ('Recall', rfc_recall),
    ('F1', rfc_f1),
)
for metric_name, metric_value in rfc_report:
    print('{} of the Random Forest Model is: '.format(metric_name), metric_value)

In [None]:
## Confusion matrix of the random forest predictions

from sklearn.metrics import confusion_matrix

# Rows are the actual labels, columns the predicted labels
cm = confusion_matrix(y_valid, rfc_preds)
all_sample_title = 'Confusion Matrix - score:' + str(accuracy_score(y_valid, rfc_preds))

plt.figure(figsize=(15, 15))
heatmap_ax = sns.heatmap(cm, annot=True, fmt=".0f", linewidths=.5,
                         square=True, cmap='Blues')
heatmap_ax.set_ylabel('Actual label')
heatmap_ax.set_xlabel('Predicted label')
heatmap_ax.set_title(all_sample_title, size=15)
plt.show()

In [None]:
# These are the XGBoost metrics; the labels previously said "Random Forest".
print('Accuracy of the XGBoost Model is: ', xgb_accuracy)
print('Precision of the XGBoost Model is: ', xgb_precision)
print('Recall of the XGBoost Model is: ', xgb_recall)
print('F1 of the XGBoost Model is: ', xgb_f1)

In [None]:
## Confusion matrix of the XGBoost predictions

from sklearn.metrics import confusion_matrix

# Rows are the actual labels, columns the predicted labels
cm = confusion_matrix(y_valid, xgb_preds)

plt.figure(figsize=(15, 15))
sns.heatmap(
    cm,
    annot=True,
    fmt=".0f",
    linewidths=.5,
    square=True,
    cmap='Blues',
)
plt.xlabel('Predicted label')
plt.ylabel('Actual label')

# Put the validation accuracy into the title
all_sample_title = 'Confusion Matrix - score:' + str(accuracy_score(y_valid, xgb_preds))
plt.title(all_sample_title, size=15)
plt.show()

1. Use different evaluation measures to rank the methods.
2. Plot confusion matrix of results.
3. Investigate Data Leakage i.e reservation_status
4. Model selection.
5. Plot relationships with the target variable.
6. Discuss results
7. Make predictions with Chosen Model.

8. Problem statement & Introduction