In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for input_dir, _, names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(input_dir, name) for name in names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


â€‹
<div style="color:#D81F26;
           display:fill;
           border-style: solid;
           border-color:#C1C1C1;
           font-size:14px;
           font-family:Calibri;
           background-color:#373737;">
<h2 style="text-align: center;
           padding: 10px;
           color:#FFFFFF;">
======= Spaceship Titanic =======
</h2>
</div>

<img src="https://i.redd.it/starship-titanic-v0-onkq3aj6w1ja1.jpg?s=6bb053590c16f2d518b7958a5d5ce7c3861d95df" height=400 width=400>

# 1. About this notebook

This notebook shows whether AutoML can reach a well-performing model with less time and effort spent on model selection and hyperparameter tuning. The best-performing algorithm and its most effective hyperparameters are selected automatically by the AutoML algorithm.

## 1.1. Summary

* Assess the balance of the target label distribution to determine if oversampling is required during the modeling process.
* Identify features that exhibit high cardinality.
* Employ feature engineering techniques for Cabin, Age, and Total Spending.
* Address missing values based on feature types by Imputation.
* Utilize Correlation Analysis to examine interdependent features.
* Manage outliers using the IQR (interquartile range) method and frequency distribution analysis.
* Apply encoding to categorical features and scaling to numerical features.
* Employ the AutoML algorithm FLAML for modeling, which automatically selects the optimal model and hyperparameters.

## 1.2 Data Attributes
* <b>PassengerId</b> - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always. <i>[Text]</i>
* <b>HomePlanet</b> - The planet the passenger departed from, typically their planet of permanent residence. <i>[Categorical]</i>
* <b>CryoSleep</b> - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins. <i>[Boolean]</i>
* <b>Cabin</b> - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard. <i>[Categorical]</i>
* <b>Destination</b> - The planet the passenger will be debarking to. <i>[Categorical]</i>
* <b>Age</b> - The age of the passenger. <i>[Numerical]</i>
* <b>VIP</b> - Whether the passenger has paid for special VIP service during the voyage. <i>[Boolean]</i>
* <b>RoomService, FoodCourt, ShoppingMall, Spa, VRDeck</b> - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities. <i>[Numerical]</i>
* <b>Name</b> - The first and last names of the passenger. <i>[Text]</i>
* <b>Transported</b> - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict. <i>[Boolean]</i>


# 2. Setup

## 2.1. Package and Installation 

In [None]:
# Install  the FLAML package
!pip install -q flaml
from flaml import AutoML
from xgboost import XGBClassifier

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
%matplotlib inline
import seaborn as sns
sns.set(style='darkgrid', font_scale=2)

import random
from scipy.stats import uniform, randint

# Suppress any warnings
import warnings
warnings.filterwarnings('ignore')

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, GridSearchCV
from sklearn import metrics

# Category Encoders
import category_encoders as encoders


## 2.2. Load the Data

In [None]:
# Read the competition CSV files into DataFrames (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/spaceship-titanic/train.csv',
                     '../input/spaceship-titanic/test.csv')
)

# Keep an independent copy of the test passenger ids for the submission file.
df_test_index = df_test['PassengerId'].copy()

size_report = 'Size of data set - tran dataset : {}, test dataset : {}'
print(size_report.format(df_train.shape, df_test.shape))

In [None]:
# Show the first 5 rows of the train set, a separator line, then the first 5 rows of the test set.
print(df_train.head(5), '=' * 50, df_test.head(5), sep='\n')

# 3. Exploratory Data Analysis

In [None]:
# Summarise the training frame: column names, non-null counts and dtypes.
df_train.info()

# 3.1. Distribution of Target Label

<div style=" background-color:#4b371c;text-align:left; padding: 13px 13px; border-radius: 8px; color: white; font-size: 16px">
To check if the distribution of target feature (i.e. Transported) is balanced.  If it is not the case, we will do oversampling in the modelling process.  
</div>

In [None]:
# Check whether the target label is balanced by drawing its class distribution.

def auto_fmt(pct_value):
    """Format a pie-wedge label as '<absolute count>' on one line and '(<percent>%)' below.

    pct_value is the wedge share in percent; the absolute count is derived from
    the number of non-null 'Transported' values in df_train.
    """
    total_rows = df_train['Transported'].value_counts().sum()
    return '{:.0f}\n({:.2f}%)'.format(total_rows*pct_value/100, pct_value)

# One row per class with its frequency, in columns 'Transported' and 'Counts'.
df_transported_count = (
    df_train['Transported']
    .value_counts()
    .rename_axis('Transported')
    .reset_index(name='Counts')
)

fig = plt.gcf()
fig.set_size_inches(7, 7)
plt.pie(
    x=df_transported_count['Counts'],
    labels=df_transported_count['Transported'],
    autopct=auto_fmt,
    textprops={'fontsize': 18},
)
plt.title('Distribution of Target Label (i.e. Transported)', fontsize=20)

<div style=" background-color:#b22222;text-align:left; padding: 13px 13px; border-radius: 8px; color: white; font-size: 16px">
Observation: the target label is split almost equally between the Transported and Non-transported classes, so no oversampling is needed for this data set.
</div>


# 3.2. Remove the Data Columns with Unique Identifiers or with Many Unique Values
<div style=" background-color:#4b371c;text-align:left; padding: 13px 13px; border-radius: 8px; color: white; font-size: 16px">
Features in a predictive model are used to estimate the likelihood of the target (i.e. Transported). In other words, features should be relevant to the target label. If features have high cardinality (i.e. many unique values), using them in a predictive model can result in overfitting and poor predictions on new instances that the trained model has not seen before.
</div>

In [None]:
# Share of distinct values per categorical column: nunique / non-null count.

def to_percent(y, position):
    """Tick formatter: render the fraction y as a whole-number percentage."""
    return '{:.0%}'.format(y)

categorical_cols = ['HomePlanet','Destination','Cabin'] # Excluding Boolean features

cat_frame = df_train[categorical_cols]
df_distinct_counts = (
    (cat_frame.nunique()/cat_frame.count())
    .rename_axis('Feature')
    .reset_index(name='Count')
)

# Bar chart with one bar per categorical feature.
plt.figure(figsize=(8, 6))
sns.barplot(x='Feature', y='Count', data=df_distinct_counts)

plt.xlabel('Categorical Feature',  fontsize = 16)
plt.ylabel('Counts of Distinct Value',  fontsize = 16)
plt.title('% of Distinct Value for each Categorical Feature',  fontsize = 20)

# Show the y-axis ticks as percentages.
formatter = FuncFormatter(to_percent)
plt.gca().yaxis.set_major_formatter(formatter)

# Tick label font sizes.
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

plt.show()


<div style=" background-color:#b22222;text-align:left; padding: 13px 13px; border-radius: 8px; color: white; font-size: 16px">
Observation: From the bar chart above, the feature "Cabin" has high cardinality. We will further check if insight can be generated by feature engineering. 
</div>


In [None]:
# Frequency of each distinct "Cabin" value, most frequent first (NaN is excluded by default).
df_train["Cabin"].value_counts()

<div style=" background-color:#b22222;text-align:left; padding: 13px 13px; border-radius: 8px; color: white; font-size: 16px">
Observation: The first and last characters of the Cabin value may have predictive power for the target label. Let's keep this feature for further feature engineering.
</div>


# 3.3. Feature Engineering for Cabin
<div style=" background-color:#4b371c;text-align:left; padding: 13px 13px; border-radius: 8px; color: white; font-size: 16px">
Feature engineering is the process of creating new features or transforming existing features in a dataset to improve the performance of machine learning models. It involves selecting, extracting, or creating relevant features that can enhance the representation and predictive power of the data.
</div>

In [None]:
# Cabin has the form deck/num/side (side is P for Port or S for Starboard);
# split it on '/' into three new columns in both data sets.
for frame in (df_train, df_test):
    frame[['Deck', 'Num', 'Side']] = frame['Cabin'].str.split('/', expand=True)

In [None]:
# Number of distinct values in each column derived from Cabin.
for cabin_part in ('Deck', 'Num', 'Side'):
    print('Distinct Count of {} : {}'.format(cabin_part, df_train[cabin_part].nunique()))

<div style=" background-color:#b22222;text-align:left; padding: 13px 13px; border-radius: 8px; color: white; font-size: 16px">
Observation: The new features Deck and Side may be useful for the prediction. We will check the distribution of these new features over the target label Transported.
</div>


In [None]:
# Passenger count per Deck, split by the target label.
# The columns are passed by keyword together with data=: recent seaborn versions
# treat the first positional argument as `data`, so a positional Series combined
# with hue= fails there.
fig, ax = plt.subplots(figsize=(6, 5))
sns.countplot(data=df_train, x='Deck', hue='Transported', ax=ax)

# Decorate after plotting: seaborn writes its own axis labels when it draws,
# so labels set beforehand would be replaced.
ax.set_title('Deck by Transported', fontsize=20)
ax.set_xlabel('Deck', fontsize=16)
ax.set_ylabel('Passenger Count', fontsize=16)
ax.tick_params(axis='both', which='major', labelsize=14)

# Resize the legend seaborn created for the hue variable (if any).
legend = ax.get_legend()
if legend is not None:
    plt.setp(legend.get_texts(), fontsize=13)
    plt.setp(legend.get_title(), fontsize=13)

plt.show()

In [None]:
# Passenger count per Side, split by the target label.
# The columns are passed by keyword together with data=: recent seaborn versions
# treat the first positional argument as `data`, so a positional Series combined
# with hue= fails there.
fig, ax = plt.subplots(figsize=(6, 5))
sns.countplot(data=df_train, x='Side', hue='Transported', ax=ax)

# Decorate after plotting: seaborn writes its own axis labels when it draws,
# so labels set beforehand would be replaced.
ax.set_title('Side by Transported', fontsize=20)
ax.set_xlabel('Side', fontsize=16)
ax.set_ylabel('Passenger Count', fontsize=16)
ax.tick_params(axis='both', which='major', labelsize=14)

# Resize the legend seaborn created for the hue variable (if any).
legend = ax.get_legend()
if legend is not None:
    plt.setp(legend.get_texts(), fontsize=13)
    plt.setp(legend.get_title(), fontsize=13)

plt.show()

<div style=" background-color:#b22222;text-align:left; padding: 13px 13px; border-radius: 8px; color: white; font-size: 16px">
Observation: When closely examining the distribution of feature values by the target label, certain feature values exhibit discriminative capabilities towards the target label.
</div>

# 3.4. Missing Value Replacement
<div style=" background-color:#4b371c;text-align:left; padding: 13px 13px; border-radius: 8px; color: white; font-size: 16px">
Missing value handling in machine learning refers to the process of addressing and managing data instances where one or more feature values are missing or unknown. Handling missing values is crucial because most machine learning algorithms cannot directly handle missing data
</div>

In [None]:
# Count the missing values per feature and chart them for both data sets.

# Columns left out of the missing-value charts.
drop_cols = ['Num', 'Transported', 'Cabin', 'PassengerId', 'Name']


def plot_missing_counts(frame, title, excluded=tuple(drop_cols)):
    """Draw a bar chart of NaN counts per column and return those counts.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data set to inspect.
    title : str
        Chart title.
    excluded : sequence of str
        Columns to leave out. Names that do not exist in `frame` are skipped,
        so the same list works for frames with different columns.

    Returns
    -------
    pandas.Series
        Number of missing values, indexed by column name.
    """
    missing = frame.drop(columns=list(excluded), errors='ignore').isna().sum()

    fig, ax = plt.subplots(figsize=(10, 6))
    missing.plot.bar(ax=ax)
    ax.set_title(title, fontsize=20)
    ax.set_xlabel('Columns', fontsize=16)
    ax.set_ylabel('Count', fontsize=16)
    ax.tick_params(axis='both', which='major', labelsize=14)
    plt.show()
    return missing


df_train_missing = plot_missing_counts(df_train, 'Missing Values Count in Train Data Set')

print('='*50)

# The second chart must be computed from the test frame, not the train frame.
df_test_missing = plot_missing_counts(df_test, 'Missing Values Count in Test Data Set')

<div style=" background-color:#b22222;text-align:left; padding: 13px 13px; border-radius: 8px; color: white; font-size: 16px">
Observation: There are missing values present for each feature. To handle these missing values, we will employ different methods based on the nature of the missing value, the amount and pattern of missing values.
</div>

In [None]:
# Use PassengerId as the row index of both frames (the column is moved into the index).
df_train = df_train.set_index('PassengerId')
df_test = df_test.set_index('PassengerId')

<div style=" background-color:#006400;text-align:left; padding: 13px 13px; border-radius: 8px; color: white; font-size: 16px">
Missing Value Replacement - We will employ different methods for replacing missing values based on the type of feature. Specifically, for numerical features, we will utilize the mean and median as replacements, while for categorical features, we will use the mode.
</div>

In [None]:
# Impute missing values. Every fill value is computed once from the TRAINING set
# and then applied to both frames, so train and test receive the same treatment.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
mode_cols = ['HomePlanet', 'Destination', 'Deck', 'Side']
flag_cols = ['VIP', 'CryoSleep']

spend_medians = df_train[spend_cols].median()                      # numerical spending: median
age_mean = df_train['Age'].mean()                                  # Age: mean
mode_values = {col: df_train[col].mode()[0] for col in mode_cols}  # categorical: most frequent value

for frame in (df_train, df_test):
    frame[spend_cols] = frame[spend_cols].fillna(spend_medians)
    frame['Age'] = frame['Age'].fillna(age_mean)

    # Missing flags are treated as False; the explicit cast gives a bool column
    # without relying on fillna to change the dtype implicitly.
    for col in flag_cols:
        frame[col] = frame[col].fillna(False).astype(bool)

    for col, most_frequent in mode_values.items():
        frame[col] = frame[col].fillna(most_frequent)

In [None]:
# Re-count the missing values per column in both data sets after imputation.
for split_label, frame in (('Training', df_train), ('Testing', df_test)):
    print('No. of Rows with Missing Values by Columns ({}):'.format(split_label))
    print(frame.isna().sum())

<div style=" background-color:#b22222;text-align:left; padding: 13px 13px; border-radius: 8px; color: white; font-size: 16px">
Observation: Missing values for key features required for modeling have been imputed.
</div>

# 3.5. Feature Engineering for Total Spending and Age Groups
<div style=" background-color:#4b371c;text-align:left; padding: 13px 13px; border-radius: 8px; color: white; font-size: 16px">
Feature engineering is the process of creating new features or transforming existing features in a dataset to improve the performance of machine learning models. It involves selecting, extracting, or creating relevant features that can enhance the representation and predictive power of the data.
</div>

<div style=" background-color:#006400;text-align:left; padding: 13px 13px; border-radius: 8px; color: white; font-size: 16px">
Total Spending is the sum of Room Service, Food Court, Shopping Mall, Spa and VR Deck
</div>

In [None]:
# total_spent = RoomService + FoodCourt + ShoppingMall + Spa + VRDeck, added in that order.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for frame in (df_train, df_test):
    running_total = frame[spend_cols[0]]
    for col in spend_cols[1:]:
        running_total = running_total + frame[col]
    frame['total_spent'] = running_total

# Histogram of the training totals using 25 bins.
plt.figure(figsize=(10, 6))
ax = sns.histplot(df_train['total_spent'], bins=25)

ax.set_title('Histogram of Total Spending', fontsize = 20)
ax.set_xlabel('Total Spending', fontsize=16)
ax.set_ylabel('Frequency Count', fontsize=16)
ax.tick_params(axis='both', which='major', labelsize=14)

plt.show()


<div style=" background-color:#006400;text-align:left; padding: 13px 13px; border-radius: 8px; color: white; font-size: 16px">
To convert the Age feature into discrete categories
</div>

In [None]:
def age_to_group(age):
    """Map ages to a zero-based decade index (0-9 -> 0, 10-19 -> 1, 60-69 -> 6, ...).

    Every decade gets its own group, so ages of 60 and above are no longer
    left in group 0 together with ages 0-9. Missing or negative ages fall
    back to group 0.

    Parameters
    ----------
    age : pandas.Series
        Ages in years.

    Returns
    -------
    pandas.Series
        Integer group index aligned with `age`.
    """
    return (age // 10).fillna(0).clip(lower=0).astype(int)


df_train['AgeGroup'] = age_to_group(df_train['Age'])
# Same for test data
df_test['AgeGroup'] = age_to_group(df_test['Age'])

In [None]:
# Passenger count per AgeGroup, split by the target label.
# The columns are passed by keyword together with data=: recent seaborn versions
# treat the first positional argument as `data`, so a positional Series combined
# with hue= fails there.
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(data=df_train, x='AgeGroup', hue='Transported', ax=ax)

# Decorate after plotting: seaborn writes its own axis labels when it draws,
# so labels set beforehand would be replaced.
ax.set_title('Age Group by Transported', fontsize=20)
ax.set_xlabel('Age Group', fontsize=16)
ax.set_ylabel('Passenger Count', fontsize=16)
ax.tick_params(axis='both', which='major', labelsize=14)

# Resize the legend seaborn created for the hue variable (if any).
legend = ax.get_legend()
if legend is not None:
    plt.setp(legend.get_texts(), fontsize=13)
    plt.setp(legend.get_title(), fontsize=13)

plt.show()

# 3.6. Correlation Analysis
<div style=" background-color:#4b371c;text-align:left; padding: 13px 13px; border-radius: 8px; color: white; font-size: 16px">
Correlation Analysis helps to determine the degree and direction of association between variables, providing insights into their linear dependence.
</div>

In [None]:
# Correlation is only defined for numeric data, so restrict the frame to numeric
# and boolean columns first; text columns such as HomePlanet or Cabin cannot be
# converted to numbers and would make corr() fail.
numeric_view = df_train.select_dtypes(include=['number', 'bool'])

plt.figure(figsize=(15,15))
sns.heatmap(numeric_view.corr(), annot=True, annot_kws={'fontsize': 16})
plt.title('Correlation Analysis for Numerical Features')

<div style=" background-color:#b22222;text-align:left; padding: 13px 13px; border-radius: 8px; color: white; font-size: 16px">
Observation: Based on the correlation analysis conducted, the majority of variable pairs exhibit correlations within the range of -0.2 to +0.2. However, an exception is observed in the correlation between the variables "Transported" and "CryoSleep," which demonstrates a correlation beyond this range. Nevertheless, it should be noted that this correlation indicates only a moderate dependence between these two variables.
</div>

# 3.7. Frequency Distribution for Categorical Features
<div style=" background-color:#4b371c;text-align:left; padding: 13px 13px; border-radius: 8px; color: white; font-size: 16px">
The count chart of categorical features involves analyzing and visualizing the frequency distribution of each category within a categorical feature. It provides insights into the distribution and relative frequencies of different categories, allowing us to understand the composition of the data.
</div>

In [None]:
categorical_cols = ['HomePlanet', 'Destination', 'VIP', 'Deck', 'Side', 'AgeGroup']

# CryoSleep is charted as well, although it is not part of categorical_cols.
plot_cols = categorical_cols + ['CryoSleep']

# Count chart for each categorical feature, split by the target label.
fig, axs = plt.subplots(4, 2, figsize=(16,24))
flat_axes = axs.flatten()

for ax, col in zip(flat_axes, plot_cols):
    # Keyword arguments with data=: a positional Series combined with hue=
    # is not accepted by recent seaborn versions.
    sns.countplot(data=df_train, x=col, hue='Transported', ax=ax)
    # Labels are applied after plotting because seaborn sets its own when it draws.
    ax.set_xlabel(col, fontsize=16)
    ax.set_ylabel('Passenger Count', fontsize=16)
    ax.tick_params(axis='both', which='both', labelsize=12)

# Hide grid cells that received no chart (the 4x2 grid has 8 slots).
for ax in flat_axes[len(plot_cols):]:
    ax.set_visible(False)

plt.show()


<div style=" background-color:#b22222;text-align:left; padding: 13px 13px; border-radius: 8px; color: white; font-size: 16px">
Observation: Based on the distribution analysis, it appears that the features VIP and Side demonstrate lower discriminatory power towards the target label.
</div>

# 3.8. Outlier Analysis and Histogram for Numerical Features
<div style=" background-color:#4b371c;text-align:left; padding: 13px 13px; border-radius: 8px; color: white; font-size: 16px">
Histogram distribution provides a visual representation of the distribution of a numerical feature. They help us understand the spread, central tendency, and shape of the data. By examining the histogram, we can identify patterns, outliers, and potential data issues, gaining insights into the data's characteristics.
<br><br>
    
The IQR (Interquartile Range) method will be used for outlier handling in data analysis. It involves identifying and treating outliers based on the spread of the data as measured by the interquartile range. The interquartile range is a statistical measure that represents the range between the first quartile (25th percentile) and the third quartile (75th percentile) of a dataset. It captures the middle 50% of the data, excluding the most extreme values.
</div>

In [None]:
# Cap spending outliers at Q3 + 1.5 * IQR, with quartiles taken from the strictly
# positive training values only. The training limits are applied to the test set too.
numerical_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'total_spent']
IRQ_dict = {}  # column name -> upper limit used for capping

for col in numerical_cols:
    positive_values = df_train.loc[df_train[col] > 0, col]
    p75 = positive_values.quantile(0.75)
    p25 = positive_values.quantile(0.25)
    iqr = p75 - p25
    upper_limit = p75 + (1.5 * iqr)
    IRQ_dict[col] = upper_limit

    # The outlier count is taken before the column is capped.
    outlier_count = df_train.loc[df_train[col] > upper_limit, 'Transported'].count()
    print('===={} with Upper Limit {:6.1f}, P75 {:6.1f}, P25 {:6.1f}, {} Outlier Records ========'.format(col, upper_limit, p75, p25, outlier_count))

    # Values above the limit are replaced by the limit; everything else is unchanged.
    df_train[col] = df_train[col].clip(upper=upper_limit)
    df_test[col] = df_test[col].clip(upper=upper_limit)


In [None]:
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'total_spent']

# Histograms (25 bins, split by the target label) for the numerical features,
# laid out row by row on a 4x2 grid; the last grid cell gets no histogram.
fig, axs = plt.subplots(4, 2, figsize=(16,24))

for position, ax in enumerate(axs.flatten()):
    # Tick and y-label styling is applied to every grid cell.
    ax.tick_params(axis='both', which='both', labelsize=12)
    ax.set_ylabel('Count', fontsize=16)

    if position < len(numerical_cols):
        col = numerical_cols[position]
        subplot = sns.histplot(data=df_train, x=col, bins=25, hue='Transported', ax=ax)
        subplot.set_xlabel(col, fontsize=16)

plt.show()


# 3.9. CatBoostEncoding for Categorical Features
<div style=" background-color:#4b371c;text-align:left; padding: 13px 13px; border-radius: 8px; color: white; font-size: 16px">
CATBoostEncode, or CatBoostEncoder, is a categorical encoding technique specifically designed for the CatBoost algorithm. CatBoost is a gradient boosting algorithm that is known for its strong performance in handling categorical features. The CatBoostEncoder is a specialized encoding method that is compatible with the CatBoost algorithm and aims to effectively encode categorical variables for improved model performance.
</div>

In [None]:
from sklearn.preprocessing import LabelEncoder

import category_encoders as encoders

categorical_cols = ['HomePlanet', 'Destination', 'VIP', 'Deck', 'Side', 'AgeGroup']

# The columns to encode are named explicitly. When `cols` is left unset,
# category_encoders picks the columns by dtype (string/categorical only), so the
# non-string columns in this list (e.g. AgeGroup, VIP) could pass through unencoded.
CATBoostENCODE = encoders.CatBoostEncoder(cols=categorical_cols)

# Fit the encoder on the training features and target, then encode the training set.
encoder_train = CATBoostENCODE.fit_transform(df_train[categorical_cols], df_train['Transported'])
encoded_df_train = pd.DataFrame(encoder_train)
print(encoded_df_train.head(5))

# Encode the test set with the encoder fitted on the training data.
encoder_test = CATBoostENCODE.transform(df_test[categorical_cols])
encoded_df_test = pd.DataFrame(encoder_test)
print(encoded_df_test.head(5))



# 3.10. Robust Scaler
<div style=" background-color:#4b371c;text-align:left; padding: 13px 13px; border-radius: 8px; color: white; font-size: 16px">
The RobustScaler is a data preprocessing technique available in the scikit-learn library in Python. It is used to scale numerical data in a robust manner, meaning it is less sensitive to the presence of outliers compared to other scaling methods like standardization or min-max scaling.
</div>

In [None]:
from sklearn.preprocessing import RobustScaler

numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'total_spent']

scaler = RobustScaler()

# Fit on the training data and scale it; the result is wrapped back into a
# DataFrame that carries the original column names and row index.
scaler_train = scaler.fit_transform(df_train[numerical_cols])
scaler_df_train = pd.DataFrame(scaler_train, columns=numerical_cols, index=df_train.index)
print(scaler_df_train.head(5))

# The test data is only transformed, using the scaler fitted on the training data.
scaler_test = scaler.transform(df_test[numerical_cols])
scaler_df_test = pd.DataFrame(scaler_test, columns=numerical_cols, index=df_test.index)
print(scaler_df_test.head(5))

In [None]:
# Swap the raw categorical/numerical columns for their encoded and scaled versions.
drop_cols = categorical_cols + numerical_cols


def _assemble(raw_frame, encoded_frame, scaled_frame):
    """Drop the raw feature columns and append the encoded and scaled columns side by side."""
    remaining = raw_frame.drop(columns=drop_cols).copy()
    return pd.concat([remaining, encoded_frame, scaled_frame], axis=1)


df_train_2 = _assemble(df_train, encoded_df_train, scaler_df_train)
df_test_2 = _assemble(df_test, encoded_df_test, scaler_df_test)

# 4. Modeling

<div style=" background-color:#006400;text-align:left; padding: 13px 13px; border-radius: 8px; color: white; font-size: 16px">
To prepare the training data set
</div>

In [None]:
# Drop the columns that are not used as model inputs: the raw Cabin text, its
# Num part, the passenger Name, and the derived AgeGroup column.
unused_cols = ['Num', 'Cabin', 'Name', 'AgeGroup']

df_train_2 = df_train_2.drop(columns=unused_cols)
df_test_2 = df_test_2.drop(columns=unused_cols)

print(df_train_2.columns)
print(df_test_2.columns)

In [None]:
# Convert the target from True/False to 1/0. An explicit cast yields an integer
# column directly, instead of depending on replace() to change the dtype as a
# side effect of substituting the values.
df_train_2['Transported'] = df_train_2['Transported'].astype(int)

In [None]:
# Separate the model inputs (X) from the target column (y).
y = df_train_2['Transported']
X = df_train_2.drop(columns='Transported')

# Display the feature names that go into the model.
X.columns

<div style=" background-color:#006400;text-align:left; padding: 13px 13px; border-radius: 8px; color: white; font-size: 16px">
By splitting the dataset into separate training and test sets, we can train the model on the training set and evaluate its performance on the test set.
</div>

In [None]:
# Hold out 25% of the labelled rows for validation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# 4.1. XGBoost


In [None]:
# Parameter Setup
p_folds = 3
p_iter = 100
p_estimators = 1000
p_learning_rate = 0.001

# Search space for the randomized XGBoost search.
# scipy's uniform(loc, scale) samples from [loc, loc + scale], and
# randint(low, high) samples integers from low (inclusive) to high (exclusive).
params = {
        'max_depth': randint(7, 12),
        'gamma': uniform(0.0, 0.5),
        # subsample and colsample_bytree are fractions and must not exceed 1,
        # so the scale is 0.4 to give the range [0.6, 1.0].
        'subsample': uniform(0.6, 0.4),
        'colsample_bytree': uniform(0.6, 0.4),
        'reg_alpha': uniform(0.0, 1.0),
        'reg_lambda': uniform(0.0, 1.0),
        'min_child_weight': randint(3, 7),
        'scale_pos_weight': randint(1, 10)
        }

In [None]:
# Base classifier for the search; learning rate and tree count come from the parameter setup cell.
xgb = XGBClassifier(
    objective='binary:logistic',
    n_estimators=p_estimators,
    learning_rate=p_learning_rate,
)

In [None]:
# Randomized hyperparameter search scored by ROC AUC.
# No `cv` argument is passed, so scikit-learn's default cross-validation is used
# (p_folds is not used here).
# `verbose` must be a non-negative integer for scikit-learn; 0 keeps the search silent.
xg_model = RandomizedSearchCV(
    xgb,
    param_distributions=params,
    n_iter=p_iter,
    scoring='roc_auc',
    n_jobs=1,
    verbose=0,
    random_state=1001,
)

xg_model.fit(X_train, y_train)

In [None]:
# Show the best estimator found by the search, followed by its sampled hyperparameters.
print(xg_model.best_estimator_, xg_model.best_params_, sep='\n')

In [None]:
# Build a fresh XGBoost model from the best estimator's parameters, fit it on the
# training split, and report its accuracy on the held-out split.
best_settings = xg_model.best_estimator_.get_params()
best_xgb = XGBClassifier(**best_settings)
best_xgb.fit(X_train, y_train)

accuracy = best_xgb.score(X_test, y_test)
print('Accuracy of XGBoost : {}'.format(accuracy))

# 4.2. Model Performance


In [None]:
# Hard class predictions for the training and validation splits.
p_train, p_test = (best_xgb.predict(features) for features in (X_train, X_test))

# Predicted probability of the positive class (second column of predict_proba).
p_train_proba, p_test_proba = (
    best_xgb.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

In [None]:
def report_split_metrics(split_name, y_true, y_pred, y_proba):
    """Print the scoring metrics for one data split and plot its confusion matrix.

    Parameters
    ----------
    split_name : str
        Label shown in the banner, e.g. 'Validation' or 'Training'.
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Hard class predictions.
    y_proba : array-like
        Predicted probability of the positive class.

    Returns
    -------
    float
        ROC AUC computed from `y_proba`, so that it matches a ROC curve drawn
        from the same probabilities.
    """
    roc_auc = metrics.roc_auc_score(y_true, y_proba)
    conf_matrix = metrics.confusion_matrix(y_true, y_pred)

    print('=============================================')
    print('Scoring Metrics for XGBoost ({})'.format(split_name))
    print('=============================================')
    print('Balanced Accuracy Score = {}'.format(metrics.balanced_accuracy_score(y_true, y_pred)))
    print('Accuracy Score = {}'.format(metrics.accuracy_score(y_true, y_pred)))
    print('Precision Score = {}'.format(metrics.precision_score(y_true, y_pred)))
    print('F1 Score = {}'.format(metrics.f1_score(y_true, y_pred)))
    print('Recall Score = {}'.format(metrics.recall_score(y_true, y_pred)))
    print('ROC AUC Score = {}'.format(roc_auc))
    print('Confusion Matrix')
    print('==================')
    print(conf_matrix)
    print('==================')
    print(metrics.classification_report(y_true, y_pred, target_names=['0','1']))
    metrics.ConfusionMatrixDisplay(conf_matrix).plot()
    return roc_auc


# DataFrame views of the hard predictions.
predicted_test = pd.DataFrame(p_test)
predicted_train = pd.DataFrame(p_train)

roc_auc_test = report_split_metrics('Validation', y_test, p_test, p_test_proba)
roc_auc_train = report_split_metrics('Training', y_train, p_train, p_train_proba)


print('======= ROC Curve =======')

# Both the curves and the areas in the legend come from the predicted probabilities.
fpr_train, tpr_train, _ = metrics.roc_curve(y_train, p_train_proba)
fpr_test, tpr_test, _ = metrics.roc_curve(y_test, p_test_proba)

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
ax.plot(fpr_test, tpr_test, color='darkorange', label='ROC curve - Validation (area = %0.3f)' % roc_auc_test)
ax.plot(fpr_train, tpr_train, color='darkblue', label='ROC curve - Training (area = %0.3f)' % roc_auc_train)
# Diagonal reference line.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc="lower right")

plt.show()

# 4.3. Submission

In [None]:
# Predict the test set and pair each saved PassengerId with its boolean prediction.
y_pred = best_xgb.predict(df_test_2)

sub = df_test_index.to_frame().assign(Transported=y_pred.astype(bool))
sub.head()

In [None]:
# Write the submission file without the row index.
submission_path = '/kaggle/working/submission.csv'
sub.to_csv(submission_path, index=False)
print('Submission Done!')