In [1]:
from sklearn.preprocessing import FunctionTransformer
import numpy as np
import pandas as pd
import yaml

In [2]:
# Execute the project's helper scripts; IPython's %run copies every name they
# define into this notebook's namespace, which the cells below rely on.
# NOTE(review): `Database` is presumably defined in database.py and the helper
# functions (read_yaml, merge_dataframe, split_column, ...) in util.py — confirm.
%run ./src/database.py
%run ./src/util.py
# NOTE(review): data_preprocessing.py is disabled here; re-enable it or delete this line.
# %run ./src/data_preprocessing.py

In [3]:
# Path of the YAML configuration file read by the preprocessing section.
YAML_FILEPATHNAME = "./config.yaml"
# Directory handed to Database() when the raw tables are loaded.
DATA_PATH = "./data/"
# Indices used to select the pre- and post-cruise entries of yaml_data['databases'].
PRE_CRUISE_DB, POST_CRUISE_DB = 0, 1

# Preprocessing

In [4]:
# Read the YAML configuration file located at YAML_FILEPATHNAME.
# NOTE(review): later cells index the result as yaml_data['databases'][i], so it is
# presumably a dict holding a `databases` sequence — confirm against read_yaml.
yaml_data = read_yaml(YAML_FILEPATHNAME)

In [5]:
# Read the pre-cruise data: construct a Database with DATA_PATH and read the
# source described by the PRE_CRUISE_DB entry of yaml_data['databases'].
# NOTE(review): db_read presumably returns a pandas DataFrame — confirm in database.py.
ds_pre_cruise = Database(DATA_PATH)
df_pre_cruise = ds_pre_cruise.db_read(yaml_data['databases'][PRE_CRUISE_DB])

In [6]:
# Read the post-cruise data: construct a Database with DATA_PATH and read the
# source described by the POST_CRUISE_DB entry of yaml_data['databases'].
# NOTE(review): db_read presumably returns a pandas DataFrame — confirm in database.py.
ds_post_cruise = Database(DATA_PATH)
df_post_cruise = ds_post_cruise.db_read(yaml_data['databases'][POST_CRUISE_DB])

In [7]:
# Combine the pre-cruise and post-cruise frames into a single dataframe.
# NOTE(review): the merged output carries both Ext_Intcode_x and Ext_Intcode_y,
# so the join key is evidently not Ext_Intcode — verify the key in merge_dataframe.
df_cruise = merge_dataframe(df_pre_cruise, df_post_cruise)

In [8]:
# Display the (rows, columns) shape of the merged dataframe.
df_cruise.shape

(130875, 25)

In [9]:
# The merged dataframe has 130875 rows and 25 columns (see the shape output above).

In [10]:
# Preview the first five rows of the merged dataframe (head() defaults to n=5).
df_cruise.head()

Unnamed: 0_level_0,Gender,Date of Birth,Source of Traffic,Onboard Wifi Service,Embarkation/Disembarkation time convenient,Ease of Online booking,Gate location,Logging,Onboard Dining Service,Online Check-in,...,Onboard Service,Cleanliness,Ext_Intcode_x,Cruise Name,Ticket Type,Cruise Distance,Ext_Intcode_y,WiFi,Dining,Entertainment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,05/10/1973,Direct - Company Website,A little important,3.0,5.0,3.0,01/01/2023 0:00,Very important,2.0,...,2.0,3.0,LB446RWOOZI,Blastoise,,3567 KM,LB446RWOOZI,1.0,1,1.0
1,Female,,Indirect - Social Media,Not at all important,4.0,1.0,,01/01/2023 0:01,Very important,,...,4.0,4.0,LB138HKBECM,Blastoise,Deluxe,672 KM,LB138HKBECM,,0,1.0
2,Female,22/07/1998,Indirect - Search Engine,,3.0,0.0,5.0,01/01/2023 0:02,,,...,3.0,,BL713UHBAAN,IAPRAS,Deluxe,1167 KM,BL713UHBAAN,,0,0.0
3,Female,01/05/1970,Direct - Company Website,Very important,4.0,4.0,4.0,01/01/2023 0:05,Somewhat important,4.0,...,2.0,4.0,LB243DMKCFL,Lapras,Deluxe,280 KM,LB243DMKCFL,,0,1.0
4,Male,07/01/1960,Direct - Company Website,Somewhat important,4.0,2.0,,01/01/2023 0:06,Not at all important,2.0,...,2.0,,LB218CFLOBS,Lapras,Standard,1145 Miles,LB218CFLOBS,,1,


In [11]:
# Split 'Source of Traffic' on ' - ' into two columns, 'Source' and 'Traffic'.
# NOTE(review): split_column appears to modify df_cruise in place and drop the
# original column (later outputs list 'Source'/'Traffic' but no 'Source of Traffic') — confirm.
split_column(df_cruise, 'Source of Traffic', ['Source', 'Traffic'], ' - ')

In [12]:
# Split 'Cruise Distance' on a space into 'Distance' (the number) and 'Dist_Matrix' (the unit).
split_column(df_cruise, 'Cruise Distance', ['Distance', 'Dist_Matrix'], ' ')
# Parse Distance as a number; unparseable values are coerced to missing and the
# column is stored with the nullable Int32 dtype.
# NOTE(review): the sample rows mix 'KM' and 'Miles' and include a negative distance
# (-2139); Distance is not converted to a common unit or sign-checked here — confirm intent.
df_cruise['Distance'] = pd.to_numeric(df_cruise['Distance'], errors='coerce').astype('Int32')

In [13]:
# Handle missing values in df_cruise, printing the shape before and after.
# NOTE(review): the recorded output goes from (130875, 27) to (5846, 27), so
# impute_missing_value evidently removes rows (about 95% of them) rather than
# filling values — confirm this is intended and consider renaming the helper.
print(df_cruise.shape)
impute_missing_value(df_cruise)
print(df_cruise.shape)

(130875, 27)
(5846, 27)


In [14]:
# Summarise column dtypes and non-null counts after the missing-value step.
df_cruise.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5846 entries, 26 to 133743
Data columns (total 27 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Gender                                      5846 non-null   object 
 1   Date of Birth                               5846 non-null   object 
 2   Onboard Wifi Service                        5846 non-null   object 
 3   Embarkation/Disembarkation time convenient  5846 non-null   float64
 4   Ease of Online booking                      5846 non-null   float64
 5   Gate location                               5846 non-null   float64
 6   Logging                                     5846 non-null   object 
 7   Onboard Dining Service                      5846 non-null   object 
 8   Online Check-in                             5846 non-null   float64
 9   Cabin Comfort                               5846 non-null   float64
 10  Onboard Entert

In [15]:
# Preview the first rows after splitting columns and handling missing values.
# util.py is already executed once near the top of the notebook, so it is not
# re-run here; a mid-notebook %run hides which version of the helpers produced
# the results and obscures the top-to-bottom execution order.
df_cruise.head()

Unnamed: 0_level_0,Gender,Date of Birth,Onboard Wifi Service,Embarkation/Disembarkation time convenient,Ease of Online booking,Gate location,Logging,Onboard Dining Service,Online Check-in,Cabin Comfort,...,Cruise Name,Ticket Type,Ext_Intcode_y,WiFi,Dining,Entertainment,Source,Traffic,Distance,Dist_Matrix
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26,Male,12/12/1964,Not at all important,1.0,4.0,1.0,01/01/2023 0:49,Very important,3.0,3.0,...,lap,Luxury,LB994CFCVQZ,0.0,0,1.0,Direct,Email Marketing,331,Miles
27,Male,02/09/1978,Very important,4.0,4.0,4.0,01/01/2023 0:51,Extremely important,4.0,4.0,...,Blastoise,Luxury,BL477XQQDPF,1.0,1,0.0,Direct,Email Marketing,2689,Miles
60,Female,09/10/1972,Very important,5.0,4.0,4.0,01/01/2023 1:52,Somewhat important,4.0,5.0,...,Blastoise,Luxury,BL144HPQZSS,1.0,1,1.0,Direct,Email Marketing,3874,KM
129,Male,14/10/1951,Somewhat important,3.0,3.0,3.0,01/01/2023 5:11,Somewhat important,4.0,5.0,...,Blastoise,Luxury,BL567ACJRBR,1.0,0,1.0,Direct,Company Website,427,KM
130,Male,15/05/1970,Not at all important,1.0,1.0,1.0,01/01/2023 5:20,A little important,3.0,4.0,...,Blastoise,Luxury,LB445GXZYVF,1.0,0,0.0,Direct,Company Website,-2139,KM


In [16]:
# Label-encode the non-numeric categorical columns so they hold integer codes.
list_non_numeric_col = [
    'Gender',
    'Onboard Wifi Service',
    'Onboard Dining Service',
    'Onboard Entertainment',
    'Cruise Name',
    'Ticket Type',
    'Source',
    'Traffic',
    'Dist_Matrix',
]
label_encoder(df_cruise, list_non_numeric_col)

In [17]:
# Derive year columns from the two date columns:
#   'Date of Birth' (format %d/%m/%Y)        -> 'Year of Birth'
#   'Logging'       (format %d/%m/%Y %H:%M)  -> 'Year Logging'
# NOTE(review): the source date columns no longer appear in the later info()
# output, so the helper presumably drops them — confirm in util.py.
convert_datetime_to_year(
    df_cruise,
    ['Date of Birth', 'Logging'],
    ['Year of Birth', 'Year Logging'],
    ['%d/%m/%Y', '%d/%m/%Y %H:%M'],
)

In [18]:
# Remove the identifier columns Ext_Intcode_x and Ext_Intcode_y before modelling.
# NOTE(review): remove_col appears to modify df_cruise in place (its return value is unused) — confirm.
remove_col(df_cruise,['Ext_Intcode_x', 'Ext_Intcode_y'])

In [19]:
# Summarise the dataframe after encoding, date conversion and ID-column removal.
df_cruise.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5846 entries, 26 to 133743
Data columns (total 25 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Gender                                      5846 non-null   int32  
 1   Onboard Wifi Service                        5846 non-null   int32  
 2   Embarkation/Disembarkation time convenient  5846 non-null   float64
 3   Ease of Online booking                      5846 non-null   float64
 4   Gate location                               5846 non-null   float64
 5   Onboard Dining Service                      5846 non-null   int32  
 6   Online Check-in                             5846 non-null   float64
 7   Cabin Comfort                               5846 non-null   float64
 8   Onboard Entertainment                       5846 non-null   int32  
 9   Cabin service                               5846 non-null   float64
 10  Baggage handli

# Feature Engineering

# Model Building

# Model Evaluation