# Libraries 📖

In [8]:
import yaml

In [9]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler


In [10]:
%run ./src/data_preprocessing.py

# Load data 📁

In [11]:
# Parse the project configuration once; later cells read their settings from config_data.
with open('./config.yaml', mode='r') as config_stream:
    config_data = yaml.safe_load(config_stream)

In [12]:
# Build the merged cruise frame through the project's DataPreprocessing class
# (presumably brought into scope by the %run of ./src/data_preprocessing.py above; confirm).
dp = DataPreprocessing()
dp.load_data(config_data['yaml_path'], config_data['databases'])
cruise_data = dp.get_merged_data()
# NOTE(review): the outputs below report 401261 rows for index labels 0..133745, and head()
# shows the same label repeated (e.g. 0 appears four times). The merge appears to duplicate
# rows; confirm this is intended.

In [13]:
# Modelling settings shared by later cells.
TARGET_VARIABLE = config_data["target_variable"]  # read from the config.yaml loaded above
# Presumably the held-out fraction and the seed for the train/test split; neither is used
# by any visible cell, so confirm.
TEST_SIZE = 0.25
RANDOM_STATE = 42

# Exploratory Data Analysis 📊

### Exploratory on Overall Shape of Dataset

In [19]:
# (rows, columns) of the merged frame
cruise_data.shape

(401261, 25)

<div style="border-radius: 10px; border: #0ea5e9 solid; padding: 15px; background-color: #ffffff00; font-size: 100%; text-align: left;"> Observation 01:
    <li>This is a relatively big dataset of 401261 rows and 25 columns.</li>
</div>


In [70]:
# Column dtypes and non-null counts, used to spot columns with missing values
cruise_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 401261 entries, 0 to 133745
Data columns (total 25 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   Gender                                      360890 non-null  object 
 1   Date of Birth                               357208 non-null  object 
 2   Source of Traffic                           401261 non-null  object 
 3   Onboard Wifi Service                        342780 non-null  object 
 4   Embarkation/Disembarkation time convenient  354330 non-null  float64
 5   Ease of Online booking                      346192 non-null  float64
 6   Gate location                               349835 non-null  float64
 7   Logging                                     401261 non-null  object 
 8   Onboard Dining Service                      350830 non-null  object 
 9   Online Check-in                             354193 non-null  float64
 10  C

<div style="border-radius: 10px; border: #0ea5e9 solid; padding: 15px; background-color: #ffffff00; font-size: 100%; text-align: left;"> Observation 02:
    <li>With reference to the above and the 401261 rows noted in Observation 01, there are missing values in the following variables. These missing values need to be imputed.
        <ul>
            <li>Gender</li>
            <li>Date of Birth</li>
            <li>Onboard Wifi Service</li>
            <li>Embarkation/Disembarkation time convenient</li>
            <li>Ease of Online booking</li>
            <li>Gate location</li>
            <li>Onboard Dining Service</li>
            <li>Online Check-in</li>
            <li>Cabin Comfort</li>
            <li>Onboard Entertainment</li>
            <li>Cabin service</li>
            <li>Baggage handling</li>
            <li>Port Check-in Service</li>
            <li>Onboard Service</li>
            <li>Cleanliness</li>
            <li>Cruise Name</li>
            <li>Ticket Type</li>
            <li>Cruise Distance</li>
            <li>WiFi</li>
            <li>Entertainment</li>
        </ul>
    </li>
</div>

In [144]:
def print_missing_value(dataframe):
    """Print a table of the columns that contain missing values.

    A header line is printed first, then one line per column that has at
    least one null value: the column label padded to 45 characters, followed
    by the null count and its percentage of all rows (two decimals).

    Args:
        dataframe: pandas DataFrame to inspect.

    Returns:
        None. The table is written to standard output.
    """
    label_width = 45
    total_rows = dataframe.shape[0]
    print("Column".ljust(label_width) + "Number (Percentage)")
    # Count nulls for all columns in one pass. Iterating the resulting Series
    # (instead of indexing the frame by label) also works with duplicate labels.
    for column, missing_count in dataframe.isnull().sum().items():
        if missing_count == 0:
            continue
        # Labels may be non-strings (ints, tuples); str() keeps len() safe.
        label = str(column)
        # Always leave at least one space so long labels do not run into the count.
        padding = " " * max(label_width - len(label), 1)
        # A non-zero count implies total_rows > 0, so this cannot divide by zero.
        percent = missing_count / total_rows * 100
        print("{}{}{} ({:.2f}%)".format(label, padding, missing_count, percent))

In [145]:
# Report, per column, how many values are missing and what share of all rows that is
print_missing_value(cruise_data)

Column                                       Number (Percentage)
Gender                                       40371 (10.06%)
Date of Birth                                44053 (10.98%)
Onboard Wifi Service                         58481 (14.57%)
Embarkation/Disembarkation time convenient   46931 (11.70%)
Ease of Online booking                       55069 (13.72%)
Gate location                                51426 (12.82%)
Onboard Dining Service                       50431 (12.57%)
Online Check-in                              47068 (11.73%)
Cabin Comfort                                60758 (15.14%)
Onboard Entertainment                        47744 (11.90%)
Cabin service                                41499 (10.34%)
Baggage handling                             60883 (15.17%)
Port Check-in Service                        43044 (10.73%)
Onboard Service                              54983 (13.70%)
Cleanliness                                  55211 (13.76%)
Cruise Name                        

<div style="border-radius: 10px; border: #0ea5e9 solid; padding: 15px; background-color: #ffffff00; font-size: 100%; text-align: left;"> Observation 03:
    <li>With reference to above
        <ul>
            <li>Most columns with missing values are missing a little over 10% of their rows.</li>
            <li>Two columns (WiFi and Entertainment) have more than 40% missing values, so I will remove them.</li>
        </ul>
    </li>
</div>

In [76]:
# WiFi and Entertainment are too sparse to keep (see Observation 03), so drop both columns
cruise_data = cruise_data.drop(columns=['WiFi', 'Entertainment'])

In [16]:
# import util as utl
# output_csv = utl.output_csv("./data/", cruise_data,"data.csv")

In [81]:
# Lift the column cap (a session-wide pandas option) so wide frames render every column
pd.options.display.max_columns = None
cruise_data.head()

Unnamed: 0_level_0,Gender,Date of Birth,Onboard Wifi Service,Embarkation/Disembarkation time convenient,Ease of Online booking,Gate location,Logging,Onboard Dining Service,Online Check-in,Cabin Comfort,Onboard Entertainment,Cabin service,Baggage handling,Port Check-in Service,Onboard Service,Cleanliness,Ext_Intcode_x,Cruise Name,Ticket Type,Ext_Intcode_y,Dining,Category,Source,Distance,Unit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
0,,05/10/1973,A little important,3.0,5.0,3.0,01/01/2023 0:00,Very important,2.0,2.0,A little important,2.0,2.0,4.0,2.0,3.0,LB446RWOOZI,Blastoise,,LB446RWOOZI,1,Direct,Company Website,3567,KM
0,,05/10/1973,A little important,3.0,5.0,3.0,01/01/2023 0:00,Very important,2.0,2.0,A little important,2.0,2.0,4.0,2.0,3.0,LB446RWOOZI,Blastoise,,LB446RWOOZI,1,Direct,Company Website,3567,KM
0,,05/10/1973,A little important,3.0,5.0,3.0,01/01/2023 0:00,Very important,2.0,2.0,A little important,2.0,2.0,4.0,2.0,3.0,LB446RWOOZI,Blastoise,,LB446RWOOZI,1,Direct,Company Website,3567,KM
0,,05/10/1973,A little important,3.0,5.0,3.0,01/01/2023 0:00,Very important,2.0,2.0,A little important,2.0,2.0,4.0,2.0,3.0,LB446RWOOZI,Blastoise,,LB446RWOOZI,1,Direct,Company Website,3567,KM
1,Female,,Not at all important,4.0,1.0,,01/01/2023 0:01,Very important,,4.0,,2.0,3.0,4.0,4.0,4.0,LB138HKBECM,Blastoise,Deluxe,LB138HKBECM,0,Indirect,Social Media,672,KM


In [83]:
# Inspect the last 1000 rows; the rendered output elides the middle rows
cruise_data.tail(1000)

Unnamed: 0_level_0,Gender,Date of Birth,Onboard Wifi Service,Embarkation/Disembarkation time convenient,Ease of Online booking,Gate location,Logging,Onboard Dining Service,Online Check-in,Cabin Comfort,Onboard Entertainment,Cabin service,Baggage handling,Port Check-in Service,Onboard Service,Cleanliness,Ext_Intcode_x,Cruise Name,Ticket Type,Ext_Intcode_y,Dining,Category,Source,Distance,Unit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
133412,Male,13/06/1986,A little important,2.0,2.0,,31/08/2023 7:41,,,3.0,Very important,5.0,5.0,4.0,5.0,4.0,LB677RIXOVH,Blastoise,Luxury,LB677RIXOVH,1,Indirect,Social Media,1379,KM
133413,,28/02/2015,A little important,1.0,2.0,2.0,31/08/2023 7:41,A little important,5.0,,Very important,3.0,2.0,5.0,4.0,5.0,BL851ZTVXUR,Lapras,Standard,BL851ZTVXUR,0,Direct,Email Marketing,2565,KM
133413,,28/02/2015,A little important,1.0,2.0,2.0,31/08/2023 7:41,A little important,5.0,,Very important,3.0,2.0,5.0,4.0,5.0,BL851ZTVXUR,Lapras,Standard,BL851ZTVXUR,0,Direct,Email Marketing,2565,KM
133413,,28/02/2015,A little important,1.0,2.0,2.0,31/08/2023 7:41,A little important,5.0,,Very important,3.0,2.0,5.0,4.0,5.0,BL851ZTVXUR,Lapras,Standard,BL851ZTVXUR,0,Direct,Email Marketing,2565,KM
133414,Female,08/12/1968,Not at all important,1.0,1.0,1.0,31/08/2023 7:41,Very important,5.0,4.0,A little important,2.0,2.0,4.0,2.0,5.0,LB389CUUEZN,Blastoise,,LB389CUUEZN,0,Direct,Company Website,3111,KM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133744,Female,,A little important,1.0,1.0,4.0,31/08/2023 23:43,,2.0,4.0,Very important,5.0,4.0,3.0,5.0,4.0,LB957GHIRBD,Blastoise,Standard,LB957GHIRBD,1,Indirect,Search Engine,1506,KM
133744,Female,,A little important,1.0,1.0,4.0,31/08/2023 23:43,,2.0,4.0,Very important,5.0,4.0,3.0,5.0,4.0,LB957GHIRBD,Blastoise,Standard,LB957GHIRBD,1,Indirect,Search Engine,1506,KM
133745,Male,07/09/1996,,,0.0,1.0,31/08/2023 23:44,,0.0,2.0,Extremely important,2.0,1.0,,1.0,5.0,LB539JAJHXJ,Lapras,Standard,LB539JAJHXJ,0,Direct,Company Website,80,KM
133745,Male,07/09/1996,,,0.0,1.0,31/08/2023 23:44,,0.0,2.0,Extremely important,2.0,1.0,,1.0,5.0,LB539JAJHXJ,Lapras,Standard,LB539JAJHXJ,0,Direct,Company Website,80,KM


<div style="border-radius: 10px; border: #0ea5e9 solid; padding: 15px; background-color: #ffffff00; font-size: 100%; text-align: left;"> Observation 04: Different category of Data Variables
    <li><b>Composite Value - Need to split up the Variable further</b><ul>
            <li>Source of Traffic</li>
            <li>Cruise Distance</li>        
        </ul>
    </li>
    <li><b>Continuous Value</b><ul>
            <li>Date of Birth</li>
            <li>Logging</li>
        </ul>
    </li>
    <li><b>Non-numeric Nominal Value</b><ul>
            <li>Gender</li>
            <li>Cruise Name</li>
        </ul>
    </li>
    <li><b>Non-numeric Ordinal Value</b><ul>
            <li>Onboard Wifi Service</li>
            <li>Onboard Dining Service</li>
            <li>Onboard Entertainment</li>
            <li>Ticket Type</li>
            <li>Ext_Intcode_x</li>        
        </ul>
    </li>
    <li><b>Numeric Nominal Value</b><ul>
            <li>Dining</li>
        </ul>
    </li>
    <li><b>Numeric Ordinal Value</b><ul>
            <li>Embarkation/Disembarkation time convenient</li>
            <li>Gate location</li>
            <li>Cabin Comfort</li>
            <li>Cabin service</li>
            <li>Baggage handling</li>
            <li>Port Check-in Service</li>
            <li>Onboard Service</li>
            <li>Cleanliness</li>
        </ul>
    </li>
</div>

In [77]:
# 'Source of Traffic' holds two values joined by ' - '. Break it into
# 'Category' and 'Source', then remove the composite column from the dataset.
traffic_parts = cruise_data['Source of Traffic'].str.split(' - ', expand=True)
cruise_data[['Category', 'Source']] = traffic_parts
del cruise_data['Source of Traffic']

In [78]:
# 'Cruise Distance' holds a number and a unit separated by a space. Break it into
# 'Distance' and 'Unit', then remove the composite column from the dataset.
distance_parts = cruise_data['Cruise Distance'].str.split(' ', expand=True)
cruise_data[['Distance', 'Unit']] = distance_parts
del cruise_data['Cruise Distance']

In [79]:
# Inspect dtypes and non-null counts of the four columns created by the two splits above
cruise_data[['Category', 'Source','Distance', 'Unit']].info()

<class 'pandas.core.frame.DataFrame'>
Index: 401261 entries, 0 to 133745
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   Category  401261 non-null  object
 1   Source    401261 non-null  object
 2   Distance  357800 non-null  object
 3   Unit      357800 non-null  object
dtypes: object(4)
memory usage: 15.3+ MB


In [85]:
# Sample rows of the four split columns to check the values landed in the right columns
cruise_data[['Category', 'Source','Distance', 'Unit']].head()

Unnamed: 0_level_0,Category,Source,Distance,Unit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Direct,Company Website,3567,KM
0,Direct,Company Website,3567,KM
0,Direct,Company Website,3567,KM
0,Direct,Company Website,3567,KM
1,Indirect,Social Media,672,KM


<div style="border-radius: 10px; border: #0ea5e9 solid; padding: 15px; background-color: #ffffff00; font-size: 100%; text-align: left;"> Observation 05: Update of category of Data Variables
    <li><b>Continuous Value</b><ul>
            <li>Date of Birth</li>
            <li>Logging</li>
            <li>Distance</li>
        </ul>
    </li>
    <li><b>Non-numeric Nominal Value</b><ul>
            <li>Gender</li>
            <li>Cruise Name</li>
            <li>Category</li>
            <li>Source</li>
            <li>Unit</li>
        </ul>
    </li>
    <li><b>Non-numeric Ordinal Value</b><ul>
            <li>Onboard Wifi Service</li>
            <li>Onboard Dining Service</li>
            <li>Onboard Entertainment</li>
            <li>Ticket Type</li>
            <li>Ext_Intcode_x</li>        
        </ul>
    </li>
    <li><b>Numeric Nominal Value</b><ul>
            <li>Dining</li>
        </ul>
    </li>
    <li><b>Numeric Ordinal Value</b><ul>
            <li>Embarkation/Disembarkation time convenient</li>
            <li>Gate location</li>
            <li>Cabin Comfort</li>
            <li>Cabin service</li>
            <li>Baggage handling</li>
            <li>Port Check-in Service</li>
            <li>Onboard Service</li>
            <li>Cleanliness</li>
        </ul>
    </li>
</div>

In [91]:
# Look at raw Date of Birth strings to see which date format they use
cruise_data["Date of Birth"].head(20)

index
0    05/10/1973
0    05/10/1973
0    05/10/1973
0    05/10/1973
1          None
1          None
1          None
1          None
2    22/07/1998
2    22/07/1998
2    22/07/1998
2    22/07/1998
3    01/05/1970
3    01/05/1970
3    01/05/1970
3    01/05/1970
4    07/01/1960
4    07/01/1960
4    07/01/1960
4    07/01/1960
Name: Date of Birth, dtype: object

In [131]:
# NOTE(review): 'Formatted Date of Birth' is only created by the cell that follows, so this
# cell would raise KeyError on a fresh top-to-bottom run. It should be moved below that cell.
cruise_data['Formatted Date of Birth'].info()

<class 'pandas.core.series.Series'>
Index: 401261 entries, 0 to 133745
Series name: Formatted Date of Birth
Non-Null Count   Dtype         
--------------   -----         
342898 non-null  datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 6.1 MB


In [130]:
# Parse 'Date of Birth' as day/month/year into a new 'Formatted Date of Birth' column;
# strings that do not match the format become NaT instead of raising.
parsed_dob = pd.to_datetime(cruise_data['Date of Birth'], format='%d/%m/%Y', errors="coerce")
cruise_data['Formatted Date of Birth'] = parsed_dob
# Show the rows that had a Date of Birth string but failed to parse
unparsed_rows = cruise_data['Date of Birth'].notnull() & cruise_data['Formatted Date of Birth'].isnull()
cruise_data.loc[unparsed_rows, ['Formatted Date of Birth', 'Date of Birth']]

Unnamed: 0_level_0,Formatted Date of Birth,Date of Birth
index,Unnamed: 1_level_1,Unnamed: 2_level_1
90,NaT,1897-05-13
90,NaT,1897-05-13
90,NaT,1897-05-13
95,NaT,1864-10-10
95,NaT,1864-10-10
...,...,...
133608,NaT,1823-04-07
133608,NaT,1823-04-07
133737,NaT,1889-06-10
133737,NaT,1889-06-10


In [146]:
# Wrap the parsed column in a one-column frame so the missing-value report can be reused on it
df_formatted_dob = cruise_data["Formatted Date of Birth"].to_frame()
print_missing_value(df_formatted_dob)

Column                                       Number (Percentage)
Formatted Date of Birth                      58363 (14.54%)


In [147]:
# Compare non-null counts of the raw and the parsed Date of Birth columns
cruise_data[["Date of Birth","Formatted Date of Birth"]].info()

<class 'pandas.core.frame.DataFrame'>
Index: 401261 entries, 0 to 133745
Data columns (total 2 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   Date of Birth            357208 non-null  object        
 1   Formatted Date of Birth  342898 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 9.2+ MB


<div style="border-radius: 10px; border: #0ea5e9 solid; padding: 15px; background-color: #ffffff00; font-size: 100%; text-align: left;"> Observation 06: Date of Birth
    <li>From the above, we can see that all the invalid Date of Birth values are in YYYY-MM-DD format, and the years of these dates appear to be invalid (e.g. 1823, 1864).</li>
    <li>Therefore I propose removing the rows with an invalid or empty DOB because: <ul>
        <li>Date of Birth plays an important role in a person's purchasing habits. Without it, the records will not be helpful for prediction, and any imputation would make the prediction inaccurate.</li>
        <li>The missing and invalid data make up about 15% of the data.</li>
        </ul></li>
</div>

In [151]:
# Replace the raw 'Date of Birth' strings with the parsed datetimes,
# then discard the temporary 'Formatted Date of Birth' column
cruise_data["Date of Birth"] = cruise_data["Formatted Date of Birth"]
cruise_data = cruise_data.drop(columns='Formatted Date of Birth')

In [152]:
# Re-check dtypes to confirm 'Date of Birth' is now a datetime column
cruise_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 401261 entries, 0 to 133745
Data columns (total 26 columns):
 #   Column                                      Non-Null Count   Dtype         
---  ------                                      --------------   -----         
 0   Gender                                      360890 non-null  object        
 1   Date of Birth                               342898 non-null  datetime64[ns]
 2   Onboard Wifi Service                        342780 non-null  object        
 3   Embarkation/Disembarkation time convenient  354330 non-null  float64       
 4   Ease of Online booking                      346192 non-null  float64       
 5   Gate location                               349835 non-null  float64       
 6   Logging                                     401261 non-null  object        
 7   Onboard Dining Service                      350830 non-null  object        
 8   Online Check-in                             354193 non-null  float64       
 9 

<div style="border-radius: 10px; border: #0ea5e9 solid; padding: 15px; background-color: #ffffff00; font-size: 100%; text-align: left;"> Observation 07: Date of Birth
    <li>Ensure that the Date of Birth is replaced successfully</li>
    <li>Its datatype is of DateTime format</li>
</div>

In [122]:
# Write the current frame out through the project-local util helper
# (presumably saves data.csv under ./data/; confirm against util.output_csv).
# NOTE(review): this import sits mid-notebook; the import cell at the top is the conventional place.
import util as utl
output_csv = utl.output_csv("./data/", cruise_data,"data.csv")

## Data Profiling 

#### At this stage I would like to get an overview of the combined dataset

In [None]:
# Get a rough idea of the number of rows and columns in the dataframe
# NOTE(review): `dataframe` is not assigned in any visible cell (earlier cells use cruise_data); confirm.
dataframe.shape

In [None]:
# Show the first 5 rows as a sample of the data
dataframe.head(5)

In [None]:
# Column dtypes and non-null counts
dataframe.info()

In [None]:
# Summary statistics, transposed so that each column is shown as a row
dataframe.describe().T

In [None]:
# Number of missing values in each column
dataframe.isna().sum()

#### From the above, I quickly inspected the structure and content of the DataFrame.
#### The large amount of missing data above is my concern: of 133,746 records, 20,293 (15.1%) are missing in Baggage handling.
#### WiFi & Entertainment have around 52% missing data, so I suggest not using these columns during modelling.
#### I notice there are 2 Ext_Intcode columns, one from each dataset. If they are the same, one can be dropped.
#### DOB is an object datatype rather than a datetime.
#### Cruise Distance contains both KM and Miles, which must be standardised before any visualisation; otherwise the plots will be inaccurate.

## Data Understanding

In [None]:
def plot_hists_from_dataframe(dataframe: pd.core.frame.DataFrame, dependent_features: list):
    """
        Draw one bar chart per requested column showing how often each distinct
        value occurs, with the count written above every bar. Scanning the bars
        is a quick way to spot dirty category values that need cleansing.

        Args:
            dataframe (Dataframe): The dataframe holding the columns to examine.
            dependent_features (list): Names of the columns to plot, one figure each.

        Returns:
            NIL. Each figure is shown as it is drawn.
    """
    for feature in dependent_features:
        counts = dataframe[feature].value_counts()
        _, ax = plt.subplots(figsize=(10, 5))
        rects = ax.bar(counts.index, counts.values)
        ax.set_title(f'Distribution of {feature}')
        ax.set_xlabel(feature)
        ax.set_ylabel('Count')
        plt.xticks(rotation=90)
        # Put each count just above its bar, centred horizontally on the bar
        for rect, n_rows in zip(rects, counts.values):
            ax.text(rect.get_x() + rect.get_width() / 2, n_rows, str(n_rows), ha='center', va='bottom')

        plt.tight_layout()
        plt.show()

#### At this stage, I would like to see the dirty data that exists in each categorical variable, together with its count

In [None]:
# # Visualise a histogram on the every value in each feature in the list  
# column_names = dataframe.columns.tolist()
# # Omit IDs, working and continuous variables from the list 
# elements_to_remove = ["Date of Birth","Ext_Intcode","Logging","Distance in KM","Ticket Type","Age"] 
# column_names = list(filter(lambda x: x not in elements_to_remove, column_names))
# plot_hists_from_dataframe(dataframe, column_names)

<div style="background-color: #f0f9ff; border-left: 6px solid #0ea5e9; font-size: 100%; padding: 10px;">
    <h3 style="color: #27374D; font-size: 18px; margin-top: 0; margin-bottom: 10px;">📉  Observation: </h3>
    <ul>
        <li>Gender
            <ul>
                <li>non-numeric Binary variable. Need transform for Male to 1 and Female to 0.</li>
                <li>Male is slightly more frequent than Female, so I prefer random imputation so as not to create distortion.</li>
                <li>If I impute to Male, because it is more than female, it distorts the distribution</li>
            </ul>
        </li>
        <li>Date Of Birth
            <ul>
                <li>This field transforms to Age. Age is a numeric Continuous variable</li>
                <li>For missing age, I use median.</li>
                <li>I am considering binning Age into categories, as the older the passenger is, the better the ticket type he or she purchases.</li>
            </ul>
        </li>
        <li>Source of Traffic
            <ul>
                <li>non-numeric nominal variable - Need to use a One-Hot Encoder</li>
                <li>No missing value.</li>
            </ul>
        </li>
        <li>Onboard Wifi Service
            <ul>
                <li>This is an non-numeric ordinal variable. So need to use ordinal Encoder.</li>
                <li>Since majority thought that Wifi is between "A little impt" and "Somewhat impt", if, I impute to mode, which is "A little impt", it should not distort the distribution much.</li>
            </ul>
        </li>
        <li>Embarkation/Disembarkation time convenient
            <ul>
                <li>This is an numeric ordinal variable. No further action required.</li>                    
                <li>Since slight higher trend towards is very important and extreme impt , I impute missing value with mode.</li>
            </ul>
        </li>       
        <li>Ease of Online booking
            <ul>
                <li>This is an numeric ordinal variable. No further action required.</li>
                <li>Since slight higher trend towards is a little impt and somewhat impt, I impute missing value with mode.</li>
            </ul>
        </li>
        <li>Gate location
            <ul>
                <li>This is an numeric ordinal variable. No further action required.</li>
                <li>Since slight higher trend towards is a somewhat impt and very impt, I impute missing value with mode.</li>
            </ul>
        </li>
        <li>Onboard Dining Service
            <ul>
                <li>This is an non-numeric ordinal variable. So need to use ordinal Encoder.</li>
                <li>Since slight higher trend towards is a Very impt and extremely impt, I impute missing value with mode.</li>
            </ul>
        </li> 
        <li>Online Check-in
            <ul>
                <li>This is an numeric ordinal variable. No further action required.</li>
                <li>Since slight higher trend towards is a Very impt, I impute missing value with mode.</li>
            </ul>
        </li>
        <li>Cabin Comfort
            <ul>
                <li>This is an numeric ordinal variable. No further action required.</li>
                <li>Since slight higher trend towards is a Very impt, I impute missing value with mode.</li>
            </ul>
        </li>
        <li>Onboard Entertainment
            <ul>
                <li>This is an non-numeric ordinal variable. So need to use ordinal Encoder.</li>
                <li>Since slight higher trend towards is a Very impt and extremely impt, I impute missing value with mode.</li>
            </ul>
        </li>
        <li>Cabin service
            <ul>
                <li>This is an numeric ordinal variable. No further action required.</li>
                <li>Since slight higher trend towards is a Very impt and extremely impt, I impute missing value with mode.</li>
            </ul>
        </li>
        <li>Baggage handling
            <ul>
                <li>This is an numeric ordinal variable. No further action required.</li>
                <li>Since slight higher trend towards is a Very impt, I impute missing value with mode.</li>
            </ul>
        </li>
        <li>Port Check-in Service
            <ul>
                <li>This is an numeric ordinal variable. No further action required.</li>
                <li>Since slight higher trend towards is a somewhat impt and Very impt, I impute missing value with mode.</li>
            </ul>
        </li>
        <li>Onboard Service
            <ul>
                <li>This is an numeric ordinal variable. No further action required.</li>
                <li>Since slight higher trend towards is a Very impt and Extremely impt, I impute missing value with mode.</li>
            </ul>
        </li>
        <li>Cleanliness
            <ul>
                <li>This is an numeric ordinal variable. No further action required.</li>
                <li>Since slight higher trend towards is a Very impt and somewhat impt, I impute missing value with mode.</li>
            </ul>
        </li>
        <li>Cruise Name
            <ul>
                <li>Group blast, blast0ise, blastoise under Blastoise</li>
                <li>Group IAPRAS, lap, lapras under Lapras</li>
                <li>All missing value group under Blastoise since most taking Blastoise</li>
                <li>non-numeric Binary variable. Need transform for Blastoise to 1 and Lapras to 0.</li>
            </ul>
        </li>
        <li>Ticket Type
            <ul>
                <li>Since this is the dependent variable, any missing value should be removed.</li>
                <li>Need to use Label encoder since it is the Dependent variable</li>                    
            </ul>
        </li>
        <li>WiFi, Entertainment
            <ul>
                <li>I like to exclude these features from modelling as there are too many missing values. </li>            
            </ul>
        </li>
        <li>Cruise Distance
            <ul>
                <li>Standardise to Distance in KM</li>
                <li>Impute missing to Mean.
                </li>
            </ul>
        </li>        
</div>


In [None]:
def plt_axis_name(x_column,y_column, dataframe):
    """Title and label the current matplotlib axes for an x-versus-y plot.

    Args:
        x_column: Name shown on the x-axis and in the title.
        y_column: Name shown on the y-axis and in the title.
        dataframe: Not read by this function; kept so existing calls still work.
    """
    axes = plt.gca()
    axes.set_title(f'Relationship between {x_column} and {y_column}')
    axes.set_xlabel(x_column)
    axes.set_ylabel(y_column)

In [None]:
def volin_plot(x_column, y_column, dataframe):
    """Show a seaborn violin plot of y_column against x_column.

    Args:
        x_column: Column of dataframe placed on the x-axis.
        y_column: Column of dataframe placed on the y-axis.
        dataframe: Frame that holds both columns.
    """
    plt.figure(figsize=(8, 6))
    sns.violinplot(data=dataframe, x=x_column, y=y_column)
    # Title and axis labels are set by the shared helper
    plt_axis_name(x_column, y_column, dataframe)
    plt.show()

In [None]:
def box_plot(x_column, y_column, dataframe):
    """Show a seaborn box plot of y_column against x_column.

    Args:
        x_column: Column of dataframe placed on the x-axis.
        y_column: Column of dataframe placed on the y-axis.
        dataframe: Frame that holds both columns.
    """
    plt.figure(figsize=(8, 6))
    sns.boxplot(data=dataframe, x=x_column, y=y_column)
    # Title and axis labels are set by the shared helper
    plt_axis_name(x_column, y_column, dataframe)
    plt.show()

In [None]:
# Violin plot with 'Distance in KM' on the x-axis and 'Ticket Type' on the y-axis
# NOTE(review): `dataframe` and a 'Distance in KM' column are not created in any visible cell; confirm.
volin_plot('Distance in KM','Ticket Type', dataframe)

#### From the above violin plot, I can gather that
1. Less people travel using Luxury as compared to Standard and Deluxe.
2. For longer distance, people choose Standard and Deluxe over Luxury. 

In [None]:
# Box plot with 'Age' on the x-axis and 'Ticket Type' on the y-axis
# NOTE(review): `dataframe` and an 'Age' column are not created in any visible cell; confirm.
box_plot('Age','Ticket Type', dataframe)

#### From the above boxplot, I can gather that
1. The older the passenger, the higher the class they can afford. This can be seen from the median age of Luxury being higher than that of Deluxe, which is in turn higher than that of Standard.
2. There are some outliers in the Luxury class.

In [None]:
# Pairwise correlations of the numeric columns, rounded to 3 decimals
corr = dataframe.corr(numeric_only=True).round(3)
# Mask the upper triangle (diagonal included) so only the lower triangle is drawn
mask = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(figsize=(20, 10))
sns.heatmap(data=corr, mask=mask, annot=True, cmap="coolwarm", square=True)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Run the project's feature-engineering script (presumably the source of FeatureEngineer; confirm),
# then apply its steps in order. Each step's result is rebound to `dataframe`.
# NOTE(review): `dataframe` is not assigned in any visible cell before this point; confirm.
%run ./src/feature_engineering.py
feature_engineering = FeatureEngineer(dataframe)
dataframe = feature_engineering.fix_typo_error()
dataframe = feature_engineering.drop_ID_cols()
dataframe = feature_engineering.convert_features_to_numeric()
dataframe = feature_engineering.process_impute_missing_data()

In [None]:
# NOTE(review): df_cruise is not assigned in any visible cell; the preceding cell
# rebinds `dataframe`. Confirm the intended name.
df_cruise.info()

In [None]:
# Separate the target column ('class') from the feature columns.
# NOTE(review): df_data is not assigned in any visible cell, and TARGET_VARIABLE / TEST_SIZE /
# RANDOM_STATE defined near the top are not used here. Confirm which frame and settings are intended.
X = df_data.drop("class", axis=1)     # everything except 'class' column
y = df_data['class']

# Hold out 30% of the rows for testing; random_state is fixed at 1
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state = 1)

# Confirm that the train split holds about 70% of the rows and the test split about 30%.
# The labels name the frames actually printed (previously 'X' and 'y').
print(f"'X_train' shape: {X_train.shape}")
print(f"'X_test' shape: {X_test.shape}")