# Libraries 📖

In [1]:
import yaml

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler


In [None]:
# Execute the preprocessing script inside the notebook namespace.
# NOTE(review): presumably this is what defines DataPreprocessing used further down — confirm.
%run ./src/data_preprocessing.py

# Load data 📁

In [None]:
# Read the notebook configuration. The encoding is stated explicitly so the file
# is decoded the same way on every platform instead of using the locale default.
with open('./config.yaml', 'r', encoding='utf-8') as yaml_file:
    config_data = yaml.safe_load(yaml_file)

In [None]:
# Load the sources listed in the config and fetch the merged dataset.
# NOTE(review): DataPreprocessing is not defined in any visible cell; presumably it
# comes from the %run of ./src/data_preprocessing.py — confirm.
dp = DataPreprocessing()
dp.load_data(config_data['yaml_path'], config_data['databases'])
cruise_data = dp.get_merged_data()

In [None]:
# Modelling constants: the target column name comes from the config file,
# while the test split ratio and the random seed are fixed here.
TARGET_VARIABLE = config_data['target_variable']
TEST_SIZE = 0.25
RANDOM_STATE = 42

# Exploratory Data Analysis 📊

### Exploratory on Overall Shape of Dataset

In [None]:
# (rows, columns) of the merged dataset.
cruise_data.shape

<div style="border-radius: 10px; border: #0ea5e9 solid; padding: 15px; background-color: #ffffff00; font-size: 100%; text-align: left;"> Observation 01:
    <li>This is a relatively big dataset of 401261 rows with 25 columns.</li>
</div>


In [None]:
# Column dtypes and non-null counts, used to spot columns with missing values.
cruise_data.info()

<div style="border-radius: 10px; border: #0ea5e9 solid; padding: 15px; background-color: #ffffff00; font-size: 100%; text-align: left;"> Observation 02:
    <li>With reference to the above and to the 401261 rows noted in Observation 01, the following variables have missing values. These missing values need to be imputed.
        <ul>
            <li>Gender</li>
            <li>Date of Birth</li>
            <li>Onboard Wifi Service</li>
            <li>Embarkation/Disembarkation time convenient</li>
            <li>Ease of Online booking</li>
            <li>Gate location</li>
            <li>Onboard Dining Service</li>
            <li>Online Check-in</li>
            <li>Cabin Comfort</li>
            <li>Onboard Entertainment</li>
            <li>Cabin service</li>
            <li>Baggage handling</li>
            <li>Port Check-in Service</li>
            <li>Onboard Service</li>
            <li>Cleanliness</li>
            <li>Cruise Name</li>
            <li>Ticket Type</li>
            <li>Cruise Distance</li>
            <li>WiFi</li>
            <li>Entertainment</li>
        </ul>
    </li>
</div>

In [None]:
def print_missing_value(dataframe):
    """Print the count and percentage of missing values for each affected column.

    Only columns containing at least one missing value are listed. Every line
    shows the column label left-aligned in a 45-character field, followed by
    ``<count> (<percent>%)``, where the percentage is relative to the number
    of rows in ``dataframe``.

    Args:
        dataframe (pandas.DataFrame): Frame to scan for missing values.

    Returns:
        None. The report is written to standard output.
    """
    label_width = 45
    total_rows = dataframe.shape[0]
    print("Column".ljust(label_width) + "Number (Percentage)")

    # Count the missing values of all columns in one pass; iterating the result
    # (rather than indexing dataframe[column]) also copes with duplicate labels.
    missing_per_column = dataframe.isnull().sum()
    for column, missing_values_count in missing_per_column.items():
        if missing_values_count == 0:
            continue
        missing_value_percent = missing_values_count / total_rows * 100
        # str() so that non-string column labels (e.g. integers) can be padded too.
        print(str(column).ljust(label_width)
              + "{} ({:.2f}%)".format(missing_values_count, missing_value_percent))

In [None]:
# Show, for every column with missing data, the missing count and its percentage of the total number of rows
print_missing_value(cruise_data)

<div style="border-radius: 10px; border: #0ea5e9 solid; padding: 15px; background-color: #ffffff00; font-size: 100%; text-align: left;"> Observation 03:
    <li>With reference to above
        <ul>
            <li>Most of the columns with missing values are missing in the range of 10% and above.</li>
            <li>Two columns (WiFi and Entertainment) have more than 40% missing values, so I will be removing them.</li>
        </ul>
    </li>
</div>

In [None]:
# Remove the WiFi and Entertainment columns.
# errors='ignore' keeps this cell re-runnable once the columns are already gone.
cruise_data = cruise_data.drop(columns=['WiFi', 'Entertainment'], errors='ignore')

In [None]:
# import util as utl
# output_csv = utl.output_csv("./data/", cruise_data,"data.csv")

In [None]:
# Lift the column display limit so that every column shows up in the preview.
pd.set_option("display.max_columns", None)
cruise_data.head()

In [None]:
# Preview the last 1000 rows of the dataset.
# NOTE(review): a much smaller sample (e.g. tail(5)) is probably enough here — confirm.
cruise_data.tail(1000)

<div style="border-radius: 10px; border: #0ea5e9 solid; padding: 15px; background-color: #ffffff00; font-size: 100%; text-align: left;"> Observation 04: Different category of Data Variables
    <li><b>Composite Value - Need to split up the Variable further</b><ul>
            <li>Source of Traffic</li>
            <li>Cruise Distance</li>        
        </ul>
    </li>
    <li><b>Continuous Value</b><ul>
            <li>Date of Birth</li>
            <li>Logging</li>
        </ul>
    </li>
    <li><b>Non-numeric Nominal Value</b><ul>
            <li>Gender</li>
            <li>Cruise Name</li>
        </ul>
    </li>
    <li><b>Non-numeric Ordinal Value</b><ul>
            <li>Company Website</li>
            <li>Onboard Dining Service</li>
            <li>Onboard Entertainment</li>
            <li>Ticket Type</li>
            <li>Ext_Intcode_x</li>        
        </ul>
    </li>
    <li><b>Numeric Nominal Value</b><ul>
            <li>Dining</li>
        </ul>
    </li>
    <li><b>Numeric Ordinal Value</b><ul>
            <li>Embarkation/Disembarkation time convenient</li>
            <li>Gate location</li>
            <li>Cabin Comfort</li>
            <li>Cabin service</li>
            <li>Baggage handling</li>
            <li>Port Check-in Service</li>
            <li>Onboard Service</li>
            <li>Cleanliness</li>
        </ul>
    </li>
</div>

In [None]:
# Split the composite field Source of Traffic into Category and Source,
# then remove Source of Traffic from the dataset.
# The guard keeps the cell re-runnable after the original column has been dropped.
if 'Source of Traffic' in cruise_data.columns:
    # n=1 caps the split at two parts so a value holding an extra ' - '
    # cannot produce a third column and break the two-column assignment.
    cruise_data[['Category', 'Source']] = (
        cruise_data['Source of Traffic'].str.split(' - ', n=1, expand=True)
    )
    cruise_data = cruise_data.drop(columns='Source of Traffic')

In [None]:
# Split the composite field Cruise Distance into Distance and Unit,
# then remove Cruise Distance from the dataset.
# The guard keeps the cell re-runnable after the original column has been dropped.
if 'Cruise Distance' in cruise_data.columns:
    # n=1 caps the split at two parts so the result always fits the two target columns.
    cruise_data[['Distance', 'Unit']] = (
        cruise_data['Cruise Distance'].str.split(' ', n=1, expand=True)
    )
    cruise_data = cruise_data.drop(columns='Cruise Distance')

In [None]:
# Inspect the newly created columns so they can be classified by data type
cruise_data[['Category', 'Source','Distance', 'Unit']].info()

In [None]:
# Sample rows of the newly created columns.
cruise_data[['Category', 'Source','Distance', 'Unit']].head()

<div style="border-radius: 10px; border: #0ea5e9 solid; padding: 15px; background-color: #ffffff00; font-size: 100%; text-align: left;"> Observation 05: Update of category of Data Variables
    <li><b>Continuous Value</b><ul>
            <li>Date of Birth</li>
            <li>Logging</li>
            <li>Distance</li>
        </ul>
    </li>
    <li><b>Non-numeric Nominal Value</b><ul>
            <li>Gender</li>
            <li>Cruise Name</li>
            <li>Category</li>
            <li>Source</li>
            <li>Unit</li>
        </ul>
    </li>
    <li><b>Non-numeric Ordinal Value</b><ul>
            <li>Company Website</li>
            <li>Onboard Dining Service</li>
            <li>Onboard Entertainment</li>
            <li>Ticket Type</li>
            <li>Ext_Intcode_x</li>        
        </ul>
    </li>
    <li><b>Numeric Nominal Value</b><ul>
            <li>Dining</li>
        </ul>
    </li>
    <li><b>Numeric Ordinal Value</b><ul>
            <li>Embarkation/Disembarkation time convenient</li>
            <li>Gate location</li>
            <li>Cabin Comfort</li>
            <li>Cabin service</li>
            <li>Baggage handling</li>
            <li>Port Check-in Service</li>
            <li>Onboard Service</li>
            <li>Cleanliness</li>
        </ul>
    </li>
</div>

In [None]:
# Look at the first raw Date of Birth values to see which formats are present.
cruise_data["Date of Birth"].head(20)

In [None]:
# 'Formatted Date of Birth' is only created by the conversion cell that follows, so
# guard the lookup: on a fresh kernel (Restart & Run All) the column does not exist
# yet and an unguarded access raises KeyError.
if 'Formatted Date of Birth' in cruise_data.columns:
    cruise_data['Formatted Date of Birth'].info()

In [None]:
# Parse Date of Birth (day/month/year) into a new datetime column; values that
# do not match the format become NaT.
cruise_data['Formatted Date of Birth'] = pd.to_datetime(
    cruise_data['Date of Birth'],
    format='%d/%m/%Y',
    errors="coerce",
)
# Show the rows whose Date of Birth was present but could not be parsed.
unparsed_dob_mask = cruise_data['Date of Birth'].notna() & cruise_data['Formatted Date of Birth'].isna()
cruise_data.loc[unparsed_dob_mask, ['Formatted Date of Birth', 'Date of Birth']]

In [None]:
# Report how many values are missing in the parsed Date of Birth column.
df_formatted_dob = cruise_data["Formatted Date of Birth"].to_frame()
print_missing_value(df_formatted_dob)

In [None]:
# Compare the non-null counts and dtypes of the raw and the parsed Date of Birth columns.
cruise_data[["Date of Birth","Formatted Date of Birth"]].info()

<div style="border-radius: 10px; border: #0ea5e9 solid; padding: 15px; background-color: #ffffff00; font-size: 100%; text-align: left;"> Observation 06: Date of Birth
    <li>From the above, we can see that all the invalid Date of Birth values are those in YYYY-MM-DD format, and that the years of these dates seem to be invalid.</li>
    <li>Therefore I propose removing the rows with an invalid or empty DOB because: <ul>
        <li>Date of Birth is an important feature that plays an important role in a person's purchasing habits. Without this data, the records will not be helpful for prediction, and any imputation would make the prediction inaccurate.</li>
        <li>The missing and invalid data account for 15% of the data.</li>
    </ul></li>
</div>

In [None]:
# Replace Date of Birth with its parsed datetime version, then drop the helper column.
# The guard keeps the cell re-runnable after the helper column has been removed.
if "Formatted Date of Birth" in cruise_data.columns:
    cruise_data["Date of Birth"] = cruise_data["Formatted Date of Birth"]
    cruise_data = cruise_data.drop(columns=["Formatted Date of Birth"])

In [None]:
# Re-check the dtypes and non-null counts after replacing Date of Birth.
cruise_data.info()

<div style="border-radius: 10px; border: #0ea5e9 solid; padding: 15px; background-color: #ffffff00; font-size: 100%; text-align: left;"> Observation 07: Date of Birth
    <li>Ensure that the Date of Birth is replaced successfully</li>
    <li>Its datatype is of DateTime format</li>
</div>

In [None]:
# Hand the current dataset to the project's util helper.
# NOTE(review): presumably this writes cruise_data to ./data/data.csv — confirm against util.output_csv.
import util as utl
output_csv = utl.output_csv("./data/", cruise_data,"data.csv")

## Data Profiling 

#### At this stage, I would like to get an overview of the combined dataset

In [None]:
# Get a rough idea of the number of rows and columns in the dataframe
# NOTE(review): `dataframe` is not assigned in any visible cell above this point; on a
# fresh kernel this presumably raises NameError unless an earlier %run defines it — confirm.
dataframe.shape

In [None]:
# Show the first 5 rows (originally described as the Pre_cruise data) as a sanity check
dataframe.head(5)

In [None]:
# Column dtypes and non-null counts of the combined dataset.
dataframe.info()

In [None]:
# Summary statistics, transposed so that each row of the output is one column of the data.
dataframe.describe().T

In [None]:
# Number of missing values per column.
dataframe.isna().sum()

#### From the above, I quickly inspected the structure and content of the DataFrame.
#### The large amount of missing data is my main concern: of the 133,746 records, 20,293 (15.1%) are missing in Baggage handling.
#### WiFi & Entertainment have around 52% missing data; I suggest not using these columns during modelling.
#### I noticed there are 2 Ext_Intcode columns, one from each dataset. If they are the same, one should be dropped.
#### DOB is an object datatype rather than a datetime.
#### Cruise Distance contains both KM and Miles, which need to be standardised before any visualisation; otherwise the results will be inaccurate.

## Data Understanding

In [None]:
def plot_hists_from_dataframe(dataframe: pd.DataFrame, dependent_features: list):
    """
        Draw one bar chart of value counts for each requested column.

        For every column name in dependent_features, the occurrences of each distinct
        value of that column are counted with value_counts() and drawn as a bar chart,
        with the count written above each bar. The charts make dirty values easy to
        spot before data cleansing.

        Args:
            dataframe (Dataframe): The dataframe which contains the columns to be examined.
            dependent_features (list): The column names to plot, one figure per name.

        Returns:
            NIL.
    """
    for col_name in dependent_features:
        value_counts = dataframe[col_name].value_counts()

        # Explicit figure/axes objects instead of the implicit pyplot state, so each
        # chart is configured on (and later released through) its own handles.
        fig, ax = plt.subplots(figsize=(10, 5))
        bars = ax.bar(value_counts.index, value_counts.values)
        ax.set_title(f'Distribution of {col_name}')
        ax.set_xlabel(col_name)
        ax.set_ylabel('Count')
        ax.tick_params(axis='x', labelrotation=90)

        # Annotate every bar with its count, centred just above the bar.
        for bar, count in zip(bars, value_counts.values):
            ax.text(bar.get_x() + bar.get_width() / 2, count, str(count), ha='center', va='bottom')

        fig.tight_layout()
        plt.show()
        # Release the figure so that plotting many columns does not accumulate open figures.
        plt.close(fig)

#### At this stage, I would like to see the dirty data that exists in each categorical feature, and the count of each value

In [None]:
# # Visualise a histogram on the every value in each feature in the list  
# column_names = dataframe.columns.tolist()
# # Omit IDs, working and continuous variables from the list 
# elements_to_remove = ["Date of Birth","Ext_Intcode","Logging","Distance in KM","Ticket Type","Age"] 
# column_names = list(filter(lambda x: x not in elements_to_remove, column_names))
# plot_hists_from_dataframe(dataframe, column_names)

<div style="background-color: #f0f9ff; border-left: 6px solid #0ea5e9; font-size: 100%; padding: 10px;">
    <h3 style="color: #27374D; font-size: 18px; margin-top: 0; margin-bottom: 10px;">📉  Observation: </h3>
    <ul>
        <li>Gender
            <ul>
                <li>non-numeric Binary variable. Need transform for Male to 1 and Female to 0.</li>
                <li>Male is slightly more frequent than Female; I prefer to impute randomly so as not to create distortion.</li>
                <li>If I impute to Male, because it is more than female, it distorts the distribution</li>
            </ul>
        </li>
        <li>Date Of Birth
            <ul>
                <li>This field transforms to Age. Age is a numeric Continuous variable</li>
                <li>For missing age, I use median.</li>
                <li>I am considering binning Age into categories, as the older the passenger is, the better the ticket type he purchases</li>
            </ul>
        </li>
        <li>Source of Traffic
            <ul>
                <li>non-numeric nominal variable - Need to use a One-Hot Encoder</li>
                <li>No missing value.</li>
            </ul>
        </li>
        <li>Onboard Wifi Service
            <ul>
                <li>This is an non-numeric ordinal variable. So need to use ordinal Encoder.</li>
                <li>Since majority thought that Wifi is between "A little impt" and "Somewhat impt", if, I impute to mode, which is "A little impt", it should not distort the distribution much.</li>
            </ul>
        </li>
        <li>Embarkation/Disembarkation time convenient
            <ul>
                <li>This is an numeric ordinal variable. No further action required.</li>                    
                <li>Since slight higher trend towards is very important and extreme impt , I impute missing value with mode.</li>
            </ul>
        </li>       
        <li>Ease of Online booking
            <ul>
                <li>This is an numeric ordinal variable. No further action required.</li>
                <li>Since slight higher trend towards is a little impt and somewhat impt, I impute missing value with mode.</li>
            </ul>
        </li>
        <li>Gate location
            <ul>
                <li>This is an numeric ordinal variable. No further action required.</li>
                <li>Since slight higher trend towards is a somewhat impt and very impt, I impute missing value with mode.</li>
            </ul>
        </li>
        <li>Onboard Dining Service
            <ul>
                <li>This is an non-numeric ordinal variable. So need to use ordinal Encoder.</li>
                <li>Since slight higher trend towards is a Very impt and extremely impt, I impute missing value with mode.</li>
            </ul>
        </li> 
        <li>Online Check-in
            <ul>
                <li>This is an numeric ordinal variable. No further action required.</li>
                <li>Since slight higher trend towards is a Very impt, I impute missing value with mode.</li>
            </ul>
        </li>
        <li>Cabin Comfort
            <ul>
                <li>This is an numeric ordinal variable. No further action required.</li>
                <li>Since slight higher trend towards is a Very impt, I impute missing value with mode.</li>
            </ul>
        </li>
        <li>Onboard Entertainment
            <ul>
                <li>This is an non-numeric ordinal variable. So need to use ordinal Encoder.</li>
                <li>Since slight higher trend towards is a Very impt and extremely impt, I impute missing value with mode.</li>
            </ul>
        </li>
        <li>Cabin service
            <ul>
                <li>This is an numeric ordinal variable. No further action required.</li>
                <li>Since slight higher trend towards is a Very impt and extremely impt, I impute missing value with mode.</li>
            </ul>
        </li>
        <li>Baggage handling
            <ul>
                <li>This is an numeric ordinal variable. No further action required.</li>
                <li>Since slight higher trend towards is a Very impt, I impute missing value with mode.</li>
            </ul>
        </li>
        <li>Port Check-in Service
            <ul>
                <li>This is an numeric ordinal variable. No further action required.</li>
                <li>Since slight higher trend towards is a somewhat impt and Very impt, I impute missing value with mode.</li>
            </ul>
        </li>
        <li>Onboard Service
            <ul>
                <li>This is an numeric ordinal variable. No further action required.</li>
                <li>Since slight higher trend towards is a Very impt and Extremely impt, I impute missing value with mode.</li>
            </ul>
        </li>
        <li>Cleanliness
            <ul>
                <li>This is an numeric ordinal variable. No further action required.</li>
                <li>Since slight higher trend towards is a Very impt and somewhat impt, I impute missing value with mode.</li>
            </ul>
        </li>
        <li>Cruise Name
            <ul>
                <li>Group blast, blast0ise, blastoise under Blastoise</li>
                <li>Group IAPRAS, lap, lapras under Lapras</li>
                <li>All missing value group under Blastoise since most taking Blastoise</li>
                <li>non-numeric Binary variable. Need transform for Blastoise to 1 and Lapras to 0.</li>
            </ul>
        </li>
        <li>Ticket Type
            <ul>
                <li>Since this is the dependent variable, any missing value should be removed.</li>
                <li>Need to use Label encoder since it is the Dependent variable</li>                    
            </ul>
        </li>
        <li>WiFi, Entertainment
            <ul>
                <li>I like to exclude these features from modelling as there are too many missing values. </li>            
            </ul>
        </li>
        <li>Cruise Distance
            <ul>
                <li>Standardise to Distance in KM</li>
                <li>Impute missing values with the mean.
                </li>
            </ul>
        </li>        
    </ul>
</div>


In [None]:
def plt_axis_name(x_column, y_column, dataframe):
    """Label the current matplotlib plot with a title and axis names.

    Args:
        x_column: Name shown on the x axis and in the title.
        y_column: Name shown on the y axis and in the title.
        dataframe: Not used by this function.

    Returns:
        None.
    """
    heading = f'Relationship between {x_column} and {y_column}'
    plt.xlabel(x_column)
    plt.ylabel(y_column)
    plt.title(heading)

In [None]:
def volin_plot(x_column, y_column, dataframe):
    """Show a violin plot of y_column against x_column on an 8x6 figure.

    Args:
        x_column: Column of dataframe placed on the x axis.
        y_column: Column of dataframe placed on the y axis.
        dataframe: Data source handed to seaborn.

    Returns:
        None. The figure is displayed with plt.show().
    """
    _, axes = plt.subplots(figsize=(8, 6))
    sns.violinplot(data=dataframe, x=x_column, y=y_column, ax=axes)
    # Title and axis labels are applied to the axes created above (the current axes).
    plt_axis_name(x_column, y_column, dataframe)
    plt.show()

In [None]:
def box_plot(x_column, y_column, dataframe):
    """Show a box plot of y_column against x_column on an 8x6 figure.

    Args:
        x_column: Column of dataframe placed on the x axis.
        y_column: Column of dataframe placed on the y axis.
        dataframe: Data source handed to seaborn.

    Returns:
        None. The figure is displayed with plt.show().
    """
    _, axes = plt.subplots(figsize=(8, 6))
    sns.boxplot(data=dataframe, x=x_column, y=y_column, ax=axes)
    # Title and axis labels are applied to the axes created above (the current axes).
    plt_axis_name(x_column, y_column, dataframe)
    plt.show()

In [None]:
# Violin plot of 'Distance in KM' (x axis) against 'Ticket Type' (y axis).
# NOTE(review): 'Distance in KM' is not created in any visible cell above — confirm where it is derived.
volin_plot('Distance in KM','Ticket Type', dataframe)

#### From the above violin plot, I can gather that
1. Fewer people travel on Luxury tickets compared to Standard and Deluxe.
2. For longer distances, people choose Standard and Deluxe over Luxury. 

In [None]:
# Box plot of 'Age' (x axis) against 'Ticket Type' (y axis).
# NOTE(review): 'Age' is not created in any visible cell above — confirm where it is derived.
box_plot('Age','Ticket Type', dataframe)

#### From the above boxplot, I can gather that
1. The older you are, the more you can afford a higher class. This can be seen from the median age (the centre line of each box) for Luxury being higher than for Deluxe, which is in turn higher than for Standard. 
2. There are some outliers in the Luxury class

In [None]:
# Correlation heatmap of the numeric columns; the upper triangle (diagonal
# included) is masked so each pair of columns is shown only once.
corr = dataframe.corr(numeric_only=True).round(3)
mask = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(figsize=(20, 10))
sns.heatmap(corr, mask=mask, annot=True, square=True, cmap="coolwarm")
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Run the feature-engineering script in the notebook namespace, then apply its steps in order.
# NOTE(review): FeatureEngineer is presumably defined by that script, and `dataframe` is not
# assigned in any visible cell above — confirm both.
%run ./src/feature_engineering.py
feature_engineering = FeatureEngineer(dataframe)
# The return value of each step is stored back into `dataframe`.
dataframe = feature_engineering.fix_typo_error()
dataframe = feature_engineering.drop_ID_cols()
dataframe = feature_engineering.convert_features_to_numeric()
dataframe = feature_engineering.process_impute_missing_data()

In [None]:
# Inspect the dataset produced by the feature-engineering steps.
# NOTE(review): the original cell referenced `df_cruise`, which is not assigned in any
# visible cell; the preceding cell stores its result in `dataframe` — confirm.
dataframe.info()

In [None]:
# Separate the target column from the feature columns.
# NOTE(review): the original cell read from `df_data` and a column named "class", neither
# of which is defined in the visible notebook; the engineered `dataframe` and the
# configured TARGET_VARIABLE are used instead — confirm that the target column keeps
# this name after feature engineering.
X = dataframe.drop(columns=TARGET_VARIABLE)
y = dataframe[TARGET_VARIABLE]

# Hold out TEST_SIZE of the rows for testing; RANDOM_STATE makes the split reproducible.
# Both come from the constants cell instead of being repeated here as literals.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Confirm that the train/test row counts match the requested proportions.
print(f"'X_train' shape: {X_train.shape}")
print(f"'X_test' shape: {X_test.shape}")