In [45]:
# !pip install pydotplus

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import numpy as np
import pandas as pd
import yaml
import pydotplus
import ast

In [47]:
%run ./src/database.py
%run ./src/util.py
%run ./src/data_preprocessing.py
%run ./src/feature_engineering.py
%run ./src/visualisation.py
%run ./src/model_build.py

In [48]:
YAML_FILEPATHNAME = "./config.yaml"
DATA_PATH =  "./data/"
PRE_CRUISE_DB = 0
POST_CRUISE_DB = 1
IS_NOTEBOOK = True

# Read yaml config and data

In [49]:
# Read YAML file
yaml_data = read_yaml(YAML_FILEPATHNAME)
DISPLAY_STUB = yaml_data['display_stub']
TEST_SIZE = yaml_data['test_size']
RANDOM_STATE = yaml_data['random_state']
TARGET_VARIABLE = yaml_data['target_variable']
DB_INFO = yaml_data['databases']
COMPOSITE_FIELD_INFO = yaml_data['composite_fields_to_split']
ID_FIELDS = ast.literal_eval(yaml_data['ID_columns'])
DATETIME_FIELD_INFO = yaml_data['convert_obj_datetime']
NUMERIC_FIELD_INFO = yaml_data['convert_obj_numeric']
MISSING_VAL_THRESHOLD =  yaml_data['pct_missing_threshold']
CONTINUOUS_VARIABLE = ast.literal_eval(yaml_data['continuous_variables'])
DIRTY_DATA_INFO = yaml_data['dirty_data_setting']
VALID_DATA_INFO = yaml_data['valid_data_setting']
NON_NUMERIC_COL = yaml_data['non_numeric_cols']
DATE_YYYY_INFO = yaml_data['convert_date_yyyy']
IMPUTE_MISSING_VALUE_INFO = yaml_data['impute_missing_value']
OHE_FIELDS = ast.literal_eval(yaml_data['one_hot_encode'])
VERBOSE = yaml_data['verbose']
LR_HYPERPARAM = yaml_data['hyperparameters']['lr_param']
DTC_HYPERPARAM = yaml_data['hyperparameters']['dtc_param']
RFC_HYPERPARAM = yaml_data['hyperparameters']['rfc_param']
GBC_HYPERPARAM = yaml_data['hyperparameters']['gbc_param']

In [50]:
# Read Pre_cruise data
df_pre_cruise = db_read(DATA_PATH, DB_INFO[PRE_CRUISE_DB])

In [51]:
# Read Post_cruise data
df_post_cruise = db_read(DATA_PATH, DB_INFO[POST_CRUISE_DB])

In [52]:
# Merge Pre_cruise and Post_cruise to form df_cruise with Index as the key
df_cruise = db_merge_db (df_pre_cruise, df_post_cruise)

# Preprocessing

In [53]:
dp = DataProcessing()

In [54]:
# Replace all np.nan to None
print(df_cruise['Ease of Online booking'][df_cruise['Ease of Online booking'].isna()].head(3))
dp.replace_nan_none(df_cruise)
print(df_cruise['Ease of Online booking'][df_cruise['Ease of Online booking'].isna()].head(3))

index
8    NaN
9    NaN
10   NaN
Name: Ease of Online booking, dtype: float64
index
8     None
9     None
10    None
Name: Ease of Online booking, dtype: object


In [55]:
# Split composite field
dp.split_composite_field(df_cruise, COMPOSITE_FIELD_INFO)
df_cruise.head(5)

Unnamed: 0_level_0,Gender,Date of Birth,Onboard Wifi Service,Embarkation/Disembarkation time convenient,Ease of Online booking,Gate location,Logging,Onboard Dining Service,Online Check-in,Cabin Comfort,...,Cruise Name,Ticket Type,Ext_Intcode_y,WiFi,Dining,Entertainment,Source,Traffic,Distance,Dist_Metrics
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,05/10/1973,A little important,3.0,5.0,3.0,01/01/2023 0:00,Very important,2.0,2.0,...,Blastoise,,LB446RWOOZI,1.0,1,1.0,Direct,Company Website,3567,KM
1,Female,,Not at all important,4.0,1.0,,01/01/2023 0:01,Very important,,4.0,...,Blastoise,Deluxe,LB138HKBECM,,0,1.0,Indirect,Social Media,672,KM
2,Female,22/07/1998,,3.0,0.0,5.0,01/01/2023 0:02,,,5.0,...,IAPRAS,Deluxe,BL713UHBAAN,,0,0.0,Indirect,Search Engine,1167,KM
3,Female,01/05/1970,Very important,4.0,4.0,4.0,01/01/2023 0:05,Somewhat important,4.0,4.0,...,Lapras,Deluxe,LB243DMKCFL,,0,1.0,Direct,Company Website,280,KM
4,Male,07/01/1960,Somewhat important,4.0,2.0,,01/01/2023 0:06,Not at all important,2.0,,...,Lapras,Standard,LB218CFLOBS,,1,,Direct,Company Website,1145,Miles


In [56]:
# Remove IDs columns as in general they will have no predictive power
dp.rm_id_cols(df_cruise, ID_FIELDS)
df_cruise.head(5)

Unnamed: 0_level_0,Gender,Date of Birth,Onboard Wifi Service,Embarkation/Disembarkation time convenient,Ease of Online booking,Gate location,Logging,Onboard Dining Service,Online Check-in,Cabin Comfort,...,Cleanliness,Cruise Name,Ticket Type,WiFi,Dining,Entertainment,Source,Traffic,Distance,Dist_Metrics
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,05/10/1973,A little important,3.0,5.0,3.0,01/01/2023 0:00,Very important,2.0,2.0,...,3.0,Blastoise,,1.0,1,1.0,Direct,Company Website,3567,KM
1,Female,,Not at all important,4.0,1.0,,01/01/2023 0:01,Very important,,4.0,...,4.0,Blastoise,Deluxe,,0,1.0,Indirect,Social Media,672,KM
2,Female,22/07/1998,,3.0,0.0,5.0,01/01/2023 0:02,,,5.0,...,,IAPRAS,Deluxe,,0,0.0,Indirect,Search Engine,1167,KM
3,Female,01/05/1970,Very important,4.0,4.0,4.0,01/01/2023 0:05,Somewhat important,4.0,4.0,...,4.0,Lapras,Deluxe,,0,1.0,Direct,Company Website,280,KM
4,Male,07/01/1960,Somewhat important,4.0,2.0,,01/01/2023 0:06,Not at all important,2.0,,...,,Lapras,Standard,,1,,Direct,Company Website,1145,Miles


In [57]:
# Column pruning due to high missing values.
dp.rm_cols_high_missing(df_cruise,MISSING_VAL_THRESHOLD)
df_cruise.head(5)

Unnamed: 0_level_0,Gender,Date of Birth,Onboard Wifi Service,Embarkation/Disembarkation time convenient,Ease of Online booking,Gate location,Logging,Onboard Dining Service,Online Check-in,Cabin Comfort,...,Port Check-in Service,Onboard Service,Cleanliness,Cruise Name,Ticket Type,Dining,Source,Traffic,Distance,Dist_Metrics
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,05/10/1973,A little important,3.0,5.0,3.0,01/01/2023 0:00,Very important,2.0,2.0,...,4.0,2.0,3.0,Blastoise,,1,Direct,Company Website,3567,KM
1,Female,,Not at all important,4.0,1.0,,01/01/2023 0:01,Very important,,4.0,...,4.0,4.0,4.0,Blastoise,Deluxe,0,Indirect,Social Media,672,KM
2,Female,22/07/1998,,3.0,0.0,5.0,01/01/2023 0:02,,,5.0,...,2.0,3.0,,IAPRAS,Deluxe,0,Indirect,Search Engine,1167,KM
3,Female,01/05/1970,Very important,4.0,4.0,4.0,01/01/2023 0:05,Somewhat important,4.0,4.0,...,3.0,2.0,4.0,Lapras,Deluxe,0,Direct,Company Website,280,KM
4,Male,07/01/1960,Somewhat important,4.0,2.0,,01/01/2023 0:06,Not at all important,2.0,,...,5.0,2.0,,Lapras,Standard,1,Direct,Company Website,1145,Miles


In [58]:
# Convert fields from Object to Datetime
dp.obj_to_datetime(df_cruise, DATETIME_FIELD_INFO)
df_cruise.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130875 entries, 0 to 133745
Data columns (total 23 columns):
 #   Column                                      Non-Null Count   Dtype         
---  ------                                      --------------   -----         
 0   Gender                                      117691 non-null  object        
 1   Date of Birth                               111830 non-null  datetime64[ns]
 2   Onboard Wifi Service                        111800 non-null  object        
 3   Embarkation/Disembarkation time convenient  115568 non-null  object        
 4   Ease of Online booking                      112945 non-null  object        
 5   Gate location                               114087 non-null  object        
 6   Logging                                     130875 non-null  datetime64[ns]
 7   Onboard Dining Service                      114404 non-null  object        
 8   Online Check-in                             115529 non-null  object       

In [59]:
# Convert fields from Object to Numeric 
dp.numeric_conversion(df_cruise, NUMERIC_FIELD_INFO)

Unnamed: 0_level_0,Gender,Date of Birth,Onboard Wifi Service,Embarkation/Disembarkation time convenient,Ease of Online booking,Gate location,Logging,Onboard Dining Service,Online Check-in,Cabin Comfort,...,Port Check-in Service,Onboard Service,Cleanliness,Cruise Name,Ticket Type,Dining,Source,Traffic,Distance,Dist_Metrics
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,1973-10-05,A little important,3.0,5.0,3.0,2023-01-01 00:00:00,Very important,2.0,2.0,...,4.0,2.0,3.0,Blastoise,,1,Direct,Company Website,3567.0,KM
1,Female,NaT,Not at all important,4.0,1.0,,2023-01-01 00:01:00,Very important,,4.0,...,4.0,4.0,4.0,Blastoise,Deluxe,0,Indirect,Social Media,672.0,KM
2,Female,1998-07-22,,3.0,0.0,5.0,2023-01-01 00:02:00,,,5.0,...,2.0,3.0,,IAPRAS,Deluxe,0,Indirect,Search Engine,1167.0,KM
3,Female,1970-05-01,Very important,4.0,4.0,4.0,2023-01-01 00:05:00,Somewhat important,4.0,4.0,...,3.0,2.0,4.0,Lapras,Deluxe,0,Direct,Company Website,280.0,KM
4,Male,1960-01-07,Somewhat important,4.0,2.0,,2023-01-01 00:06:00,Not at all important,2.0,,...,5.0,2.0,,Lapras,Standard,1,Direct,Company Website,1145.0,Miles
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133740,Male,1987-07-10,Somewhat important,3.0,3.0,3.0,2023-08-31 23:36:00,Somewhat important,,3.0,...,2.0,4.0,3.0,,Standard,1,Direct,Company Website,,
133741,Female,NaT,A little important,1.0,1.0,4.0,2023-08-31 23:38:00,,2.0,,...,3.0,5.0,,Blastoise,Standard,1,Indirect,Search Engine,1506.0,KM
133742,Female,1988-09-14,A little important,2.0,,2.0,2023-08-31 23:38:00,Somewhat important,3.0,2.0,...,3.0,2.0,,Blastoise,,0,Direct,Email Marketing,240.0,KM
133743,Male,2012-10-23,Extremely important,5.0,5.0,5.0,2023-08-31 23:41:00,Extremely important,5.0,5.0,...,4.0,4.0,5.0,Blastoise,Luxury,0,Direct,Email Marketing,-1947.0,KM


In [60]:
# Remove of rows from Target Variable 
print("There are " + str(df_cruise['Ticket Type'].isnull().sum()) + " missing value in Target Variables")
dp.rm_rows_target_var(df_cruise, TARGET_VARIABLE)
print("There are " + str(df_cruise['Ticket Type'].isnull().sum()) + " missing value in Target Variables")

There are 19976 missing value in Target Variables
There are 0 missing value in Target Variables


In [61]:
# Removal of rows from Continuous variable which has missing value 
print("Before removal - " + str(df_cruise.shape[0]))
dp.remove_missing(df_cruise,CONTINUOUS_VARIABLE)
print("After removal - " + str(df_cruise.shape[0]))

Before removal - 110899
After removal - 94747


In [62]:
# Dirty Data Cleansing
df_cruise['Cruise Name'].unique()
dp.dirty_data_processing(df_cruise, DIRTY_DATA_INFO)
df_cruise['Cruise Name'].unique()

array(['IAPRAS', 'Lapras', 'Blastoise', None], dtype=object)

In [63]:
# Restriction of Column vales
df_cruise['Cruise Name'].unique()
df_cruise = dp.valid_data_processing(df_cruise, VALID_DATA_INFO)
df_cruise['Cruise Name'].unique()

array(['Lapras', 'Blastoise', None], dtype=object)

In [64]:
df_cruise.info()
df_cruise = dp.impute_missing_value_info(df_cruise, IMPUTE_MISSING_VALUE_INFO)
df_cruise.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92716 entries, 3 to 133745
Data columns (total 23 columns):
 #   Column                                      Non-Null Count  Dtype         
---  ------                                      --------------  -----         
 0   Gender                                      83373 non-null  object        
 1   Date of Birth                               92716 non-null  datetime64[ns]
 2   Onboard Wifi Service                        79190 non-null  object        
 3   Embarkation/Disembarkation time convenient  77737 non-null  object        
 4   Ease of Online booking                      76483 non-null  object        
 5   Gate location                               80810 non-null  object        
 6   Logging                                     92716 non-null  datetime64[ns]
 7   Onboard Dining Service                      81079 non-null  object        
 8   Online Check-in                             79939 non-null  object        
 9   Cabin

In [65]:
# Label encode non-numeric categorical columns to get numeric values
df_cruise = dp.label_encoder(df_cruise, NON_NUMERIC_COL)
df_cruise.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92716 entries, 3 to 133745
Data columns (total 23 columns):
 #   Column                                      Non-Null Count  Dtype         
---  ------                                      --------------  -----         
 0   Gender                                      92716 non-null  int32         
 1   Date of Birth                               92716 non-null  datetime64[ns]
 2   Onboard Wifi Service                        92716 non-null  int32         
 3   Embarkation/Disembarkation time convenient  92716 non-null  float64       
 4   Ease of Online booking                      92716 non-null  float64       
 5   Gate location                               92716 non-null  float64       
 6   Logging                                     92716 non-null  datetime64[ns]
 7   Onboard Dining Service                      92716 non-null  int32         
 8   Online Check-in                             92716 non-null  float64       
 9   Cabin

# Feature Engineering

In [None]:
fe = feature_engineering(dp.get_dataframe())

In [None]:
# Derive year from date column to enhance model efficiency, mitigate noise
dp.yyyy_from_date(DATE_YYYY_INFO)

In [None]:
temp1 = fe.get_dataframe()

In [None]:
temp1.head(5)

In [None]:
fe.one_hot_key_encode(OHE_FIELDS)

In [None]:
# Standardise distance by converting Mile to KM
fe.convert_miles_to_KM("Distance")

In [None]:
# Derive Age from Year of Birth and Year of Logging
fe.calc_year_diff('Year of Logging', 'Year of Birth', 'Age')

# Model Building

## Logistic Regression

### Model Train

In [None]:
hyperparameter_dict = LR_HYPERPARAM
lr = Logistic_Regression(fe.get_dataframe(), TARGET_VARIABLE, hyperparameter_dict, TEST_SIZE, RANDOM_STATE, IS_NOTEBOOK)
lr.model_processing()

## Decision Tree Classifier

### RandomSearchCV

In [None]:
# hyperparameter_dict = {
# }
# dtc = Decision_Tree_Classifier(fe.get_dataframe(), TARGET_VARIABLE, hyperparameter_dict, TEST_SIZE, 
#                                RANDOM_STATE, IS_NOTEBOOK)
# param_grid = {'max_depth':[9,10,12],
#               'criterion':['gini','entropy'],
#               'max_features': ['sqrt','log2'],
#               'min_samples_split':[2,4,6]
#              }
# dtc.RandomizedSearchCV(param_grid, VERBOSE)


### Model Train

In [None]:
hyperparameter_dict = DTC_HYPERPARAM
DecisionTreeClassifier(criterion='entropy', max_depth=15, max_features='sqrt',
                       min_samples_split=6)
dtc = Decision_Tree_Classifier(fe.get_dataframe(), TARGET_VARIABLE, hyperparameter_dict, TEST_SIZE, RANDOM_STATE, IS_NOTEBOOK)
dtc_train = dtc.model_processing()

## Random Forest Classifier

### RandomSearchCV 

In [None]:
# hyperparameter_dict = {
# }
# rfc = Random_Forest_Classifier(fe.get_dataframe(), TARGET_VARIABLE, hyperparameter_dict, TEST_SIZE, RANDOM_STATE, 
#                                IS_NOTEBOOK)
# # https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
# param_grid = {
#       "n_estimators": [50, 100, 150, 200],
#         "criterion": ["gini", "entropy"],
#         "max_depth": [3, 5, 8],
#         "min_samples_split": [20, 24, 28],
#         "min_samples_leaf": [20, 24, 28],
#         "max_features": ["sqrt"],
#         "class_weight": ["balanced"]
#             }
# rfc.RandomizedSearchCV(param_grid, VERBOSE)

### Model Train

In [None]:
hyperparameter_dict = RFC_HYPERPARAM
rfc = Random_Forest_Classifier(fe.get_dataframe(), TARGET_VARIABLE, hyperparameter_dict, TEST_SIZE, RANDOM_STATE, 
                               IS_NOTEBOOK)
rfc.model_processing()

## Gradient Boosting Classifier

### RandomSearchCV

In [None]:
# hyperparameter_dict = {
# }
# gbc = Gradient_Boosting_Classifier(fe.get_dataframe(), TARGET_VARIABLE, hyperparameter_dict, TEST_SIZE, RANDOM_STATE, 
#                                    IS_NOTEBOOK)
# param_grid = {
#     "n_estimators": [50, 100, 150, 200],
#     "criterion": ["friedman_mse", "squared_error"],
#     "min_samples_split": [20, 24, 28],
#     "min_samples_leaf": [20, 24, 28],
#     "max_depth": [3, 5, 8]
#     }
# gbc.RandomizedSearchCV(param_grid, VERBOSE)

### Model Train

In [None]:
# hyperparameter_dict = GBC_HYPERPARAM
# gbc = Gradient_Boosting_Classifier(fe.get_dataframe(), TARGET_VARIABLE, hyperparameter_dict, TEST_SIZE, RANDOM_STATE, IS_NOTEBOOK)
# gbc.model_processing()

# Model Evaluation