In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import libraries

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
sns.set(style="darkgrid",font_scale=1.5)
pd.set_option("display.max.rows",None)
pd.set_option("display.max.columns",None)


from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier


from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler

from imblearn.over_sampling import SMOTE

## Loading Dataset

In [None]:
train_df = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
test_df = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")

## == Data Description ==

* PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
* HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
* CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
* Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
* Destination - The planet the passenger will be debarking to.
* Age - The age of the passenger.
* VIP - Whether the passenger has paid for special VIP service during the voyage.
* RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
* Name - The first and last names of the passenger.
* Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

## Basic Data Understanding 

In [None]:
# First thing to check --> Dimension of Data

# Training dataset shape:
print("Training dataset shape is: ", train_df.shape)

# Testing dataset shape:
print("Testing dataset shape is: ", test_df.shape)

In [None]:
# Second thing to check --> Heads of training and testing data

In [None]:
train_df.head()

In [None]:
test_df.head()

#### **Observation**
* The testing dataset does not have the 'Transported' feature, but the training dataset does

#### **Insights**
* We have to build a model using training data to make predictions for our testing data

In [None]:
# Third thing to check --> Duplicated data

In [None]:
# Duplicates that exist in training data (row count and share of all rows).
# The count is computed once and reused for the percentage.
train_duplicate_count = train_df.duplicated().sum()
print(f"Duplicates in Train dataset is: {train_duplicate_count} ({100*train_duplicate_count/len(train_df)}%)")

In [None]:
# Duplicates that exist in testing data (row count and share of all rows).
# The count is computed once and reused for the percentage.
test_duplicate_count = test_df.duplicated().sum()
print(f"Duplicates in Test dataset is: {test_duplicate_count} ({100*test_duplicate_count/len(test_df)}%)")

#### **Observation**
* We observe that there are no duplicate rows in either the training or the testing dataset

#### **Insights**
* We do not need to worry about data leakage in our dataset

In [None]:
# Fourth thing to check --> Data-types of Training & Testing data

In [None]:
# Data type in training data
print("Data types of features of Training Data is:")
print(train_df.dtypes)

In [None]:
# Data type in testing data
print("Data types of features of Testing Data is:")
print(test_df.dtypes)

#### **Observation**
* We observe that CryoSleep & VIP features contains boolean values but their data type is object

#### **Insights**
* We need to convert their data types to bool (which will happen in the data-preprocessing section)

In [None]:
# Fifth thing to check --> Total number & Percentage of missing values in TRAINING dataset

# Total number of missing values on first column
df1 = (train_df.isnull().sum()[train_df.isnull().sum()>0]).to_frame().rename(columns={0:"Number of Missing values"})

# Percentage of missing values on second column
df1["% of Missing Values"] = round((100*train_df.isnull().sum()[train_df.isnull().sum()>0]/len(train_df)),2)
df1

In [None]:
# Sixth thing to check --> Total number & percentage of missing values in TESTING dataset

# Total number of missing values on first column
df2 = (test_df.isnull().sum()[test_df.isnull().sum()>0]).to_frame().rename(columns={0:"Number of Missing values"})

# Percentage of missing values on second column
df2["% of Missing Values"] = round((100*test_df.isnull().sum()[test_df.isnull().sum()>0]/len(test_df)),2).values
df2

#### **Observation**
* We observe that there exist very little % of missing values in both training and testing data 

#### **Insights**
* We should NOT DROP but FILL/REPLACE the missing values with best suitable values from our dataset

In [None]:
# Seventh thing to check --> Cardinality (number of unique values) of categorical features

print("Cardinality of categorical features in training dataset is:")
print(train_df.select_dtypes(include="object").nunique())

print("\n","-"*70)

print("\nCardinality of categorical features in testing dataset is:")
print(test_df.select_dtypes(include="object").nunique())

#### **Observation**
* We observe that PassengerId, Cabin, Name features of both datasets have very high cardinality

#### **Insights**
* We USUALLY drop features with high cardinality, but in this project we will instead use feature engineering to derive new, lower-cardinality features from them ==> more information for the model can lead to better predictions!

## Exploratory Data Analysis (EDA)

In [None]:
# Step 1 --> Visualize the target feature(the one we want to predict) => "Transported"

# Set up the figure size
plt.figure(figsize=(10,6))

# Creating pie-chart
plt.pie(train_df["Transported"].value_counts(),
        labels=train_df["Transported"].value_counts().keys(),
        autopct="%1.1f%%",
       textprops={"fontsize":20,"fontweight":"black"},
        colors=sns.color_palette("Set2"))
plt.title("Transported Feature Distribution");

#### **Observation**
* We observe that Transported Feature is highly balanced

#### **Insights**
* This means we don't have to use techniques such as 'under_sampling' or 'over_sampling'

In [None]:
# Step 2 --> Visualize the AGE feature

plt.figure(figsize=(16,6))
sns.histplot(x=train_df["Age"],
             hue="Transported",
             data=train_df,
             kde=True,
             palette="Set2")
plt.title("Age Feature Distribution");

#### **Observation**
* We observe that:
    * most passengers are between 18-32 years old
    * 0-18 passengers have a HIGH chance of being transported
    * 18-32 passengers have a lower chance of being transported
    * age 32 and above seem to be equally likely to be transported

#### **Insights**
* We should categorize the Age feature into THREE MAJOR AGE GROUPS (separated just like I stated above)

In [None]:
# Step 3 --> Visualize Expenditure Features (RoomService, FoodCourt, ShoopingMall, Spa, VRDeck)

exp_cols = ["RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"]

plt.figure(figsize=(14,10))
for idx,column in enumerate(exp_cols):
    plt.subplot(3,2,idx+1)
    sns.histplot(x=column, hue="Transported", data=train_df,bins=30,kde=True,palette="Set2")
    plt.title(f"{column} Distribution")
    plt.ylim(0,100)
    plt.tight_layout()

#### **Observation**

* We can observe that most of the passengers don't spend money
* Since most of the expenses are 0 so the values with higher expenses are kind of outliers in our data.
* We can observe that RoomService,Spa & VRDeck seems to have similar distributions.
* We can also observe that FoodCourt & ShoppingMall are having kind of similar distributions.
* All the expenditure features distribution is Right-Skewed.
* Passengers having less expenses are more likely to be transported than passengers having high expenses.


#### **Insights**

* Since, all expenditure features are having right-skewed distribution. So before Model Building we will transform these features to normal distribution using log-transformation
* We can create a new feature Total Expenditure indicating the total expenses of all different expenditures done by the passengers.
* Since, most people expense is 0 so we can create a new boolean feature No Spending indicating whether the passenger total expense is 0 or not.
* We can split Total Expenditure into different categories of expenditure like Low , Medium & High Expenses and create one more new feature Expenditure Category

In [None]:
# Step 4 --> Visualizing Categorical Features ("HomePlanet", "CryoSleep", "Destination", "VIP")

cat_cols = ["HomePlanet","CryoSleep","Destination","VIP"]

plt.figure(figsize=(12,20))
for idx,column in enumerate(cat_cols):
    plt.subplot(4,1,idx+1)
    sns.countplot(x=column, hue="Transported", data=train_df, palette="Set2")
    plt.title(f"{column} Distribution")
    plt.tight_layout()

#### **Observation**
* In HomePlanet feature we can observe that most of passenger are from Earth
    * passenger from Earth are Comparatively Less Transported
    * passenger from Mars are Equally Transported
    * passengers from Europa are Highly Transported
* In Destination feature we can observe that most of the passengers have a destination to Trappist-1e.
* In the VIP feature we can observe that one category dominates the other by a wide margin. So it doesn't seem to be a useful feature, because it can lead to overfitting in our model.

#### **Insights**
* So it's better to drop VIP feature before Model building.

#### **Something to note**

💡 We have visualized all the features except the PassengerId, Name and Cabin features. We can't visualize these features because of their high cardinality.

💡 We will visualize them after creating new features from these old features.

## Feature Engineering

In [None]:
# Step 1 --> Create a new feature from "PassengerId" Feature

train_df["PassengerId"].head().to_frame()

### **How will we feature engineer on PassengerId**

We know => Id takes the form gggg_pp where:
* gggg indicate the group the passenger is travelling with
* pp represent the number of each people within such group

Hence, we will create a new feature called "Group_Size" -> this will indicate total number of members present in each group

We will also create a "Travelling Solo" feature to indicate whether the passenger is travelling solo or in a group

In [None]:
def passengerid_new_features(df):
    """Derive group-based features from ``PassengerId`` (format ``gggg_pp``).

    The frame is modified in place; nothing is returned. Four columns are
    added, in this order:

    * ``Group``           - the part of the id before the underscore.
    * ``Member``          - the part of the id after the underscore.
    * ``Travelling_Solo`` - True when the passenger's group has at most one member.
    * ``Group_Size``      - number of rows in ``df`` that share the passenger's group.
    """
    # Split "gggg_pp" into its two components in one vectorised pass.
    id_parts = df["PassengerId"].str.split("_", expand=True)
    df["Group"] = id_parts[0]
    df["Member"] = id_parts[1]

    # Size of each passenger's group, broadcast back onto every row. This
    # replaces a per-group df.loc assignment loop that rescanned the whole
    # frame once per group.
    group_sizes = df.groupby("Group")["Member"].transform("count")

    df["Travelling_Solo"] = group_sizes.le(1)
    df["Group_Size"] = group_sizes

In [None]:
# Now, apply this feature engineering to train and test dataframes
passengerid_new_features(train_df)
passengerid_new_features(test_df)

**Since we don't need Group and Member features, we shall drop them now**

In [None]:
train_df.drop(columns=["Group","Member"],inplace=True)
test_df.drop(columns=["Group","Member"],inplace=True)

In [None]:
# Visualize our newly created "Group_Size" and "Travelling_Solo" features

plt.figure(figsize=(15,6))

plt.subplot(1,2,1)
sns.countplot(x="Group_Size", hue="Transported", data=train_df,palette="Set2")
plt.title("Group_Size vs Transported")

plt.subplot(1,2,2)
sns.countplot(x="Travelling_Solo", hue="Transported", data=train_df,palette="Set2")
plt.title("Travelling Solo vs Transported")
plt.tight_layout()
plt.show()

#### **Observation**
* From Group_Size feature we can observe that most the passengers are travelling alone.
* From Travelling_Solo feature we can observe that passengers travelling solo are LESS likely to be transported

In [None]:
# Step 2 --> Create new features from "Cabin" Feature

train_df["Cabin"].head().to_frame()

### **How will we perform feature engineering on Cabin**

* We know that the Cabin feature has the form deck/num/side, where deck is the deck location, num is the cabin number and side is P for Port or S for Starboard.
* We can separate all these 3 values from cabin & create three new features Cabin_Deck, Cabin_Number & Cabin_Side.
* We also know that Cabin feature have NaN values so to avoid error while splitting we have to replace it in a way such that we can split those NaN Values in all three new features respectively.

In [None]:
def cabin_new_feature(df):
    """Split ``Cabin`` (format ``deck/num/side``) into three imputed features.

    The frame is modified in place; nothing is returned.

    * Missing ``Cabin`` values are replaced by the placeholder string
      ``"np.nan/np.nan/np.nan"`` so every row can be split.
    * ``Cabin_Deck`` and ``Cabin_Side`` are filled with their most frequent value.
    * ``Cabin_Number`` is converted to a numeric column and filled with its median.
    """
    # Placeholder keeps the split uniform; its pieces are turned back into NaN below.
    cabin = df["Cabin"].fillna("np.nan/np.nan/np.nan")
    df["Cabin"] = cabin

    parts = cabin.str.split("/", expand=True)
    # Placeholder pieces -> real missing values.
    parts = parts.mask(parts == "np.nan")

    df["Cabin_Deck"] = parts[0]
    # The numbers arrive as strings; convert before taking the median, because a
    # median over a string column raises TypeError on current pandas.
    df["Cabin_Number"] = pd.to_numeric(parts[1])
    df["Cabin_Side"] = parts[2]

    # Fill by plain assignment rather than chained fillna(inplace=True), which
    # may operate on a temporary copy and leave the frame unchanged.
    df["Cabin_Deck"] = df["Cabin_Deck"].fillna(df["Cabin_Deck"].mode()[0])
    df["Cabin_Side"] = df["Cabin_Side"].fillna(df["Cabin_Side"].mode()[0])
    df["Cabin_Number"] = df["Cabin_Number"].fillna(df["Cabin_Number"].median())

In [None]:
cabin_new_feature(train_df)
cabin_new_feature(test_df)

In [None]:
# Visualizing the "Cabin_Deck" & "Cabin_Side" Feature

plt.figure(figsize=(15,6))
plt.subplot(1,2,1)
sns.countplot(x="Cabin_Deck",hue="Transported", data=train_df, palette="Set2",order=["A","B","C","D","E","F","G","T"])
plt.title("Cabin_Deck Distribution")

plt.subplot(1,2,2)
sns.countplot(x="Cabin_Side", hue="Transported", data=train_df, palette="Set2")
plt.title("Cabin_Side Distribution")
plt.tight_layout()
plt.show()

#### **Observation**
* From Cabin_Deck we can observe that most of the people are from F & G Deck.
* There seems to be no passengers in Cabin_Deck ,T.
* Passengers from Cabin Deck B & C have HIGH chance of being transported

* From Cabin_Side we can observe that almost half passengers were from cabin side S and half from cabin side P.
* Passengers from Cabin_Side S are more likely to be transported, whereas passengers from Cabin_Side P are about equally likely to be transported or not.

In [None]:
# Visualizing the "Cabin_Number" Feature

# First, we need to convert this feature into numberical values
# This is so that we can perform some Statistical Analysis on Cabin_Number feature

train_df["Cabin_Number"]=train_df["Cabin_Number"].astype(int)
test_df["Cabin_Number"]=test_df["Cabin_Number"].astype(int)

**Before visualizing let's do some Statistical analysis on Cabin_Number Feature**

In [None]:
print("Total Unique values present in Cabin_Number feature is:",train_df["Cabin_Number"].nunique())
print("The Mean of Cabin_Number Feature is: ",train_df["Cabin_Number"].mean())
print("The Median of Cabin_Number Feature is:",train_df["Cabin_Number"].median())
print("The Minimum value of Cabin_Number feature is:",train_df["Cabin_Number"].min())
print("The Maximum value of Cabin_number Feature is:",train_df["Cabin_Number"].max())

In [None]:
# Now, we visualize this feature 

plt.figure(figsize=(15,5))
sns.histplot(x="Cabin_Number",data=train_df,hue="Transported",palette="Set2")
plt.title("Cabin_Number Distribution")
plt.xticks(list(range(0,1900,300)))
plt.vlines(300,ymin=0,ymax=550,color="black")
plt.vlines(600,ymin=0,ymax=550,color="black")
plt.vlines(900,ymin=0,ymax=550,color="black")
plt.vlines(1200,ymin=0,ymax=550,color="black")
plt.vlines(1500,ymin=0,ymax=550,color="black")
plt.show()

#### **Observation**
* We can observe that Cabin_Number can be divided into different regions, each spanning 300 cabin numbers.
* Hence, we can create a new features Cabin_Regions which will indicate passenger cabin number region.

In [None]:
# Step 3 --> Create new feature "Cabin_Regions" from "Cabin_Number"

def cabin_regions(df):
    """Add six boolean columns ``Cabin_Region1`` .. ``Cabin_Region6`` to ``df``.

    Each flag marks whether ``Cabin_Number`` falls into one 300-wide band:
    region 1 is below 300, regions 2-5 are the half-open bands
    [300, 600), [600, 900), [900, 1200), [1200, 1500), and region 6 is
    1500 and above. The frame is modified in place.
    """
    number = df["Cabin_Number"]
    edges = (300, 600, 900, 1200, 1500)

    # First band has no lower bound.
    df["Cabin_Region1"] = number.lt(edges[0])

    # Interior bands: lower edge inclusive, upper edge exclusive.
    for region, (lower, upper) in enumerate(zip(edges[:-1], edges[1:]), start=2):
        df[f"Cabin_Region{region}"] = number.ge(lower) & number.lt(upper)

    # Last band has no upper bound.
    df["Cabin_Region6"] = number.ge(edges[-1])

In [None]:
cabin_regions(train_df)
cabin_regions(test_df)

**We don't need Cabin_Number Feature anymore so we will drop this feature**

In [None]:
train_df.drop(columns=["Cabin_Number"],inplace=True)
test_df.drop(columns=["Cabin_Number"],inplace=True)

In [None]:
# Visualize the "Cabin_Region" Feature

cols = ["Cabin_Region1","Cabin_Region2","Cabin_Region3","Cabin_Region4","Cabin_Region5","Cabin_Region6"]

plt.figure(figsize=(20,25))
for idx,value in enumerate(cols):
    plt.subplot(4,2,idx+1)
    sns.countplot(x=value, hue="Transported", data=train_df, palette="Set2")
    plt.title(f"{value} Distribution")
    plt.tight_layout()

#### **Observation**
* We can observe that passengers from Cabin_Region1 are Highly Transported when compared with other cabin regions.
* We can also observe that as the cabin region number is increasing passengers transport is decreasing.

In [None]:
# Step 4 --> Creating new feature from "Age"
train_df["Age"].head().to_frame()

### **How will we perform feature engineering on Age**
* When we did EDA on the Age feature, we gathered the insight that ages can be split into different groups based on Transported.
* So we will create a new feature named Age Group and split Age into different groups on the basis of the insights we gained from EDA.

In [None]:
def age_group(df):
    """Add an ``"Age Group"`` column that buckets ``Age`` into labelled bands.

    Bands are checked in ascending order with inclusive upper bounds; ages
    above 50 get ``"age_50+"`` and anything that matches no band (e.g. a
    missing age) gets NaN. The frame is modified in place.
    """
    # (inclusive upper bound, label). Note that the label "Age_0-18" is the
    # band 12 < age <= 18; the label strings are kept exactly as they were.
    upper_bounds = (
        (12, "Age_0-12"),
        (18, "Age_0-18"),
        (25, "Age_19-25"),
        (32, "Age_26-32"),
        (50, "Age_33_50"),
    )

    def label_for(age):
        for bound, label in upper_bounds:
            if age <= bound:
                return label
        # Comparisons with NaN are all False, so NaN falls through to np.nan.
        return "age_50+" if age > 50 else np.nan

    df["Age Group"] = [label_for(age) for age in df["Age"]]

In [None]:
age_group(train_df)
age_group(test_df)

In [None]:
# Visualize "Age Group" feature
order = sorted(train_df["Age Group"].value_counts().keys().to_list())

plt.figure(figsize=(14,6))
sns.countplot(x="Age Group",hue="Transported", data=train_df, palette="Set2",order=order)
plt.title("Age Group Distribution");

#### **Observation**
* This new feature looks more relevant to our target data.
* Age_0-12 & Age_0-18 (the latter covers ages 13-18) are more likely to be transported than not transported.
* Age_19-25, Age_26-32 & Age_33_50 are less likely to be transported than not transported.
* age_50+ are almost equally likely to be transported or not transported.

In [None]:
# Step 5 --> Creating new feautres using all expenditude features
train_df[["RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"]].head()

### **How will we perform feature engineering on ALL EXPENDITURE features**

When we have done EDA on this expenditure features we gained some insights as: -->
1. We can create a Total Expenditure Feature by combining all the expenditures.
2. We can create a No Spending boolean feature from Total Expenditure feature indicating True for those passengers who have spent 0 expense.
3. We can split Total Expenditure into different categories indicating whether the person is having no_expense, low_expense, medium_expense or high_expense and can create a new feature Expenditure Category.

In [None]:
# The five amenity-billing columns that make up a passenger's spending.
exp_cols = [
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
]


def new_exp_features(df):
    """Add ``"Total Expenditure"`` and ``"No Spending"`` columns to ``df`` in place.

    ``"Total Expenditure"`` is the row-wise sum of the columns in ``exp_cols``;
    ``"No Spending"`` is True where that total equals 0.
    """
    total = df[exp_cols].sum(axis=1)
    df["Total Expenditure"] = total
    df["No Spending"] = df["Total Expenditure"].eq(0)

In [None]:
new_exp_features(train_df)
new_exp_features(test_df)

In [None]:
# Visualize "Total Expenditure" Feature

plt.figure(figsize=(15,6))
sns.histplot(x="Total Expenditure", hue="Transported", data=train_df, kde=True, palette="Set2",bins=200)
plt.ylim(0,200)
plt.xlim(0,10000)
plt.title("Total Expenditure Distribution");

**Generating some statistical information from the Total Expenditure feature**

In [None]:
mean = round(train_df["Total Expenditure"].mean())
median = train_df["Total Expenditure"].median()

print("Mean value of Total Expenditure feature is = ",mean)
print("Median value of Total Expenditure feature is = ",median)

#### **Observation**

* Using above measure of central tendency values we can split Total Expenditure Features into different expense categories.
* If Total Expenditure is equal to 0 then No Expense category.
* If Total Expenditure is between 1-716 then Low Expense category.
* If Total Expenditure is between 717-1441 then Medium Expense category.
* If Total Expenditure is greater than 1441 then High Expense category.

In [None]:
def expenditure_category(df, low_max=716, medium_max=1441):
    """Add an ``"Expenditure Category"`` column derived from ``"Total Expenditure"``.

    Parameters
    ----------
    df : DataFrame with a ``"Total Expenditure"`` column; modified in place.
    low_max : inclusive upper bound of the "Low Expense" band (default 716).
    medium_max : inclusive upper bound of the "Medium Expense" band (default 1441).

    Categories
    ----------
    * total == 0                     -> "No Expense"
    * 0 < total <= low_max           -> "Low Expense"
    * low_max < total <= medium_max  -> "Medium Expense"
    * total > medium_max             -> "High Expense"
    * anything else (NaN, negative)  -> NaN
    """
    def _bucket(total):
        if total == 0:
            return "No Expense"
        if 0 < total <= low_max:
            return "Low Expense"
        if low_max < total <= medium_max:
            return "Medium Expense"
        if total > medium_max:
            return "High Expense"
        # Without this fallback an unmatched value produced no label at all, so
        # the label list came out shorter than the frame and the column
        # assignment failed with a length mismatch.
        return np.nan

    df["Expenditure Category"] = [_bucket(total) for total in df["Total Expenditure"]]

In [None]:
expenditure_category(train_df)
expenditure_category(test_df)

In [None]:
# Visualize "No Spending" & "Expenditure Category" Features

cols = ["No Spending", "Expenditure Category"]

plt.figure(figsize=(18,6))
for idx,column in enumerate(cols):
    plt.subplot(1,2,idx+1)
    sns.countplot(x=column, hue="Transported", data=train_df, palette="Set2")
    plt.title(f"{column} Distribution")
    plt.tight_layout()

#### **Observation**

* In Total Expenditure feature we can observe that passengers having low total expenses are likely to be transported more.
* In No Spending feature we can observe that passenger having No Spending are highly transported.
* In the Expenditure Category feature we can confirm that passengers having No Expense are highly transported.

## Data Pre-Processing

In [None]:
# Step 1 --> Check for missing values

In [None]:
z = train_df.isnull().sum()[train_df.isnull().sum()>0].to_frame().rename(columns={0:"No. of Missing values"})
z["% of Missing values"] = round(train_df.isnull().sum()[train_df.isnull().sum()>0]*100/len(train_df),2)


z

In [None]:
# Step 2 --> Visualize missing values

In [None]:
import missingno as msno

In [None]:
msno.bar(train_df,color="C1",fontsize=22)
plt.show()

**Another way to visualize missing values**

In [None]:
plt.figure(figsize=(14,8))
sns.heatmap(train_df.isnull(),cmap="summer")
plt.show()

In [None]:
# Step 3 --> Handle missing values

# First, I separate my columns into categorical vs numerical columns (DROPPING tranposrted at the same time)
cat_cols = train_df.select_dtypes(include=["object","bool"]).columns.tolist()
cat_cols.remove("Transported")
num_cols = train_df.select_dtypes(include=["int","float"]).columns.tolist()

In [None]:
print("Categorical Columns:",cat_cols)
print("\n","-"*70)
print("\nNumerical Columns:",num_cols)

**I will use Simple Imputer Library to Fill Missing Values**

In [None]:
imputer1 = SimpleImputer(strategy="most_frequent")     ##To fill Categorical Features.
imputer2 = SimpleImputer(strategy="median")            ##To fill numeircal features.

In [None]:
def fill_missingno(df):
    """Impute missing values in ``df`` in place.

    The columns listed in ``cat_cols`` are filled by ``imputer1`` and the
    columns listed in ``num_cols`` by ``imputer2``. Each imputer is re-fitted
    on the frame it is given (``fit_transform``), so the fill values come from
    ``df`` itself.
    """
    # Same order as before: categorical columns first, then numerical ones.
    for columns, imputer in ((cat_cols, imputer1), (num_cols, imputer2)):
        df[columns] = imputer.fit_transform(df[columns])

In [None]:
fill_missingno(train_df)
fill_missingno(test_df)

In [None]:
print("Missing numbers left in train_df is:",train_df.isnull().sum().sum())
print("Missing numbers left in test_df is:",test_df.isnull().sum().sum())

In [None]:
# Step 4 --> Check Duplicacy in Data
print("Duplicate values in training data is: ",train_df.duplicated().sum())
print("Duplicate values in testing data is: ",test_df.duplicated().sum())

In [None]:
# Step 5 --> Check cardinality of categorical features (I don't want a high cardinality)
# The first report is for the training frame, the second for the testing frame.

print("Cardinality of categorical features in training data is: ")
print(train_df.select_dtypes(include=["object"]).nunique())
print("\n","-"*50)
print("\nCardinality of categorical features in testing data is: ")
print(test_df.select_dtypes(include=["object"]).nunique())

#### **Observation**

We have done all feature engineering now we can drop features which have high cardinality.

So we can drop the PassengerId, Cabin and Name features (Group and Member were already dropped right after they were used for feature engineering).

**Dropping Categorical Features with High Cardinality**

In [None]:
##Extracting passengerId from test data because we need this for submitting our predictions on kaggle.
pass_df = test_df[["PassengerId"]]

In [None]:
cols = ["PassengerId","Cabin","Name"]
train_df.drop(columns =cols, inplace=True)
test_df.drop(columns=cols, inplace=True)

In [None]:
# Step 6 --> Gather statistical information of numerical features

train_df.describe().T

#### **Observation**

* We can observe that in RoomService, FoodCourt, ShoppingMall, Spa & VRDeck more than 50 percent of the values are equal to 0.
* And when we did EDA on these features, all of them had a right-skewed distribution.
* So we can simply say there is a large number of outliers in these features.
* So we can transform these features towards a normal distribution using Log Transformation.
* Since, we are applying log transformation on these expenditure features so we have to apply transformation on Total Expenditure also.
* So that the model can have better understanding while finding patterns.

In [None]:
# Step 7 --> Apply Log Transformation on Expenditure Features

cols = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Total Expenditure']

# log1p(x) computes log(1 + x), so a spend of 0 stays at 0. It is applied to all
# six columns of each frame in one vectorised call.
# NOTE: this cell is not idempotent - running it twice transforms the data twice.
train_df[cols] = np.log1p(train_df[cols])
test_df[cols] = np.log1p(test_df[cols])

In [None]:
# Visualize log-transformed expenditure features

plt.figure(figsize=(20,35))
for x, i in enumerate(cols, start=1):
    plt.subplot(6,2,x)
    # sns.distplot is deprecated; histplot with a density-normalised histogram
    # and a KDE overlay is its documented replacement.
    sns.histplot(train_df[i], kde=True, stat="density", color="green")
    plt.ylim(0,0.2)
    plt.title(f"{i} Distribution")

# Lay out the grid once, after every panel has been drawn.
plt.tight_layout()

In [None]:
# Step 8 --> Double check data types

train_df.dtypes

#### **OBSERVATION**

CryoSleep , VIP, Travelling_Solo, No Spending, Cabin_Region1, Cabin_Region2, Cabin_Region3, Cabin_Region4, Cabin_Region5, Cabin_Region6 


**These features above** contain boolean values --> changing them to numerical data through encoding will be beneficial

## Feature Encoding

* We will do **One Hot Encoding** for nominal categorical features.
* We will do **LabelEncoding** for ordinal categorical features.

In [None]:
nominal_cat_cols = ["HomePlanet","Destination"]
ordinal_cat_cols = ["CryoSleep","VIP","Travelling_Solo","Cabin_Deck","Cabin_Side","Cabin_Region1","Cabin_Region2",
                    "Cabin_Region3","Cabin_Region4","Cabin_Region5","Cabin_Region6","Age Group","No Spending",
                    "Expenditure Category"]

**Label Encoding**

In [None]:
enc = LabelEncoder()

In [None]:
# Fit the encoder for each column on the combined train + test values so that a
# given category always receives the same integer code in both frames. Fitting
# the two frames separately only agrees when both happen to contain exactly the
# same set of categories in every column.
for col in ordinal_cat_cols:
    enc.fit(pd.concat([train_df[col], test_df[col]], ignore_index=True))
    train_df[col] = enc.transform(train_df[col])
    test_df[col] = enc.transform(test_df[col])

**One Hot Encoding**

In [None]:
train_df = pd.get_dummies(train_df,columns=nominal_cat_cols)
test_df = pd.get_dummies(test_df,columns=nominal_cat_cols)

# The two frames are encoded independently, so make the test frame expose exactly
# the training feature columns, in the same order. A dummy column that is absent
# from the test data is added as all zeros.
feature_cols = [col for col in train_df.columns if col != "Transported"]
test_df = test_df.reindex(columns=feature_cols, fill_value=0)

#### Remember: We still have one feature **Transported** left for encoding in training dataset.

In [None]:
# Map False/True to 0/1 by assigning the result back. An in-place replace on a
# selected column may act on a temporary copy and leave the frame unchanged.
train_df["Transported"] = train_df["Transported"].astype(int)

In [None]:
# Now, we check if all features are encoded
train_df.head()

In [None]:
test_df.head()

## Preparing for Model Training

In [None]:
# Select features for input output training

X = train_df.drop(columns=["Transported"])
y = train_df[["Transported"]]

In [None]:
# Scale input features to be more normalized --> this version of the data would be required by some model
scaler = StandardScaler()

# Learn the mean and standard deviation from the training features only...
X_scaled = scaler.fit_transform(X)
# ...and reuse those statistics for the test features. Re-fitting on the test set
# would scale it differently from the data the models are trained on.
test_df_scaled = scaler.transform(test_df)

**First, we split data for model(we will train several) that do not need scaled data**

In [None]:
# Split training data for model 

x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [None]:
print(x_train.shape, y_train.shape)

In [None]:
print(x_test.shape,y_test.shape)

**Now, we split data for model that DO NEED scaled data**

In [None]:
x_train1, x_test1, y_train1, y_test1 = train_test_split(X_scaled,y,test_size=0.2,random_state=0)

In [None]:
print(x_train1.shape, y_train1.shape)

In [None]:
print(x_test1.shape, y_test1.shape)

## Model Building for Scaled Data

In [None]:
training_score = []
testing_score = []

In [None]:
def model_prediction(model):
    """Fit ``model`` on the scaled split and report its performance.

    Uses the module-level ``x_train1`` / ``y_train1`` / ``x_test1`` / ``y_test1``
    split. Appends the training and testing accuracy (in percent) to the
    module-level ``training_score`` and ``testing_score`` lists, prints the
    accuracy, precision, recall and F1 scores, and draws the confusion matrix
    of the test predictions as a heatmap. Returns nothing.
    """
    # The target is held as a single-column frame; flatten it to the 1-D shape
    # estimators expect for ``y`` instead of relying on an implicit conversion.
    model.fit(x_train1, np.ravel(y_train1))

    x_train_pred1 = model.predict(x_train1)
    x_test_pred1 = model.predict(x_test1)

    # Accuracies are stored as percentages for the comparison table built later.
    a = accuracy_score(y_train1,x_train_pred1)*100
    b = accuracy_score(y_test1,x_test_pred1)*100
    training_score.append(a)
    testing_score.append(b)

    print(f"Accuracy_Score of {model} model on Training Data is:",a)
    print(f"Accuracy_Score of {model} model on Testing Data is:",b)
    print("\n------------------------------------------------------------------------")
    print(f"Precision Score of {model} model is:",precision_score(y_test1,x_test_pred1))
    print(f"Recall Score of {model} model is:",recall_score(y_test1,x_test_pred1))
    print(f"F1 Score of {model} model is:",f1_score(y_test1,x_test_pred1))
    print("\n------------------------------------------------------------------------")
    print(f"Confusion Matrix of {model} model is:")
    cm = confusion_matrix(y_test1,x_test_pred1)
    plt.figure(figsize=(8,4))
    sns.heatmap(cm,annot=True,fmt="g",cmap="summer")
    plt.show()

In [None]:
# Model 1 -> Logistic Regression Model
model_prediction(LogisticRegression())

In [None]:
# Model 2 --> KNeighboursClassifier Model
model_prediction(KNeighborsClassifier())

In [None]:
# Model 3 --> Support-Vector-Classifier Model
model_prediction(SVC())

In [None]:
# Model 4 --> Naive Bayes Model
model_prediction(GaussianNB())

## Model Building for UN-SCALED data

In [None]:
def model_prediction(model):
    model.fit(x_train,y_train)
    x_train_pred = model.predict(x_train)
    x_test_pred = model.predict(x_test)
    a = accuracy_score(y_train,x_train_pred)*100
    b = accuracy_score(y_test,x_test_pred)*100
    training_score.append(a)
    testing_score.append(b)
    
    print(f"Accuracy_Score of {model} model on Training Data is:",a)
    print(f"Accuracy_Score of {model} model on Testing Data is:",b)
    print("\n------------------------------------------------------------------------")
    print(f"Precision Score of {model} model is:",precision_score(y_test,x_test_pred))
    print(f"Recall Score of {model} model is:",recall_score(y_test,x_test_pred))
    print(f"F1 Score of {model} model is:",f1_score(y_test,x_test_pred))
    print("\n------------------------------------------------------------------------")
    print(f"Confusion Matrix of {model} model is:")
    cm = confusion_matrix(y_test,x_test_pred)
    plt.figure(figsize=(8,4))
    sns.heatmap(cm,annot=True,fmt="g",cmap="summer")
    plt.show()

In [None]:
# Model 1 --> Decision Tree Classifier
model_prediction(DecisionTreeClassifier())

In [None]:
# Model 2 --> Random Forest Classifier
model_prediction(RandomForestClassifier())

In [None]:
# Model 3 --> Ada Boost Classifier Model
model_prediction(AdaBoostClassifier())

In [None]:
# Model 4 --> Gradient Boosting Classifier Model
model_prediction(GradientBoostingClassifier())

In [None]:
# Model 5 --> LGMB Classifier Model
model_prediction(LGBMClassifier())

In [None]:
# Model 6 --> XGBClassifier Model
model_prediction(XGBClassifier())

In [None]:
# Model 7 --> Cat Boost Classifier Model
model_prediction(CatBoostClassifier(verbose=False))

## ALL MODEL PERFORMANCE COMPARISON!

In [None]:
models = ["Logistic Regression","KNN","SVM","Naive Bayes","Decision Tree","Random Forest","Ada Boost",
          "Gradient Boost","LGBM","XGBoost","CatBoost"]

In [None]:
models

In [None]:
df = pd.DataFrame({"Algorithms":models,
                   "Training Score":training_score,
                   "Testing Score":testing_score})

In [None]:
df

In [None]:
# Plotting the results above using a column-bar chart

df.plot(x="Algorithms",y=["Training Score","Testing Score"], figsize=(16,6),kind="bar",
        title="Performance Visualization of Different Models",colormap="Set1")
plt.show()

#### **Observation**
* The highest performance was given by LGBM, at close to 82%.
* The Random Forest, XGBoost and CatBoost models also performed well.
* So we will do hyper-parameter tuning on these four models.

## Hyper-Parameter Tuning of LGBM Model

In [None]:
model1 = LGBMClassifier()


In [None]:
parameters1 = {"n_estimators":[100,300,500,600,650],
              "learning_rate":[0.01,0.02,0.03],
              "random_state":[0,42,48,50],
               "num_leaves":[16,17,18]}

In [None]:
grid_search1 = GridSearchCV(model1, parameters1, cv=5, n_jobs=-1)


In [None]:
grid_search1.fit(x_train,y_train.values.ravel())


In [None]:
grid_search1.best_score_


In [None]:
best_parameters1 = grid_search1.best_params_
best_parameters1

**Creating LGBM Model Using Best Parameters.**

In [None]:
model1 = LGBMClassifier(**best_parameters1)


In [None]:
model1.fit(x_train,y_train)


In [None]:
x_test_pred1 = model1.predict(x_test)


In [None]:
accuracy_score(y_test,x_test_pred1)


## Hyper-Parameter Tuning of CatBoost Model

In [None]:
model2 = CatBoostClassifier(verbose=False)

In [None]:
parameters2 = {"learning_rate":[0.1,0.3,0.5,0.6,0.7],
              "random_state":[0,42,48,50],
               "depth":[8,9,10],
               "iterations":[35,40,50]}

In [None]:
grid_search2 = GridSearchCV(model2, parameters2, cv=5, n_jobs=-1)


In [None]:
grid_search2.fit(x_train,y_train)


In [None]:
grid_search2.best_score_


In [None]:
best_parameters2 = grid_search2.best_params_
best_parameters2

**Creating Cat Boost Model Using Best Parameters**

In [None]:
model2 = CatBoostClassifier(**best_parameters2,verbose=False)


In [None]:
model2.fit(x_train,y_train)

In [None]:
x_test_pred2 = model2.predict(x_test)


In [None]:
accuracy_score(y_test,x_test_pred2)


## Hyper-Parameter Tuning of XGBoost Model

In [None]:
model3 = XGBClassifier()


In [None]:
parameters3 = {"n_estimators":[50,100,150],
             "random_state":[0,42,50],
             "learning_rate":[0.1,0.3,0.5,1.0]}

In [None]:
grid_search3 = GridSearchCV(model3, parameters3 , cv=5, n_jobs=-1)


In [None]:
grid_search3.fit(x_train,y_train)


In [None]:
grid_search3.best_score_


In [None]:
best_parameters3 = grid_search3.best_params_
best_parameters3

**Creating XGBoost Model Using Best Parameters**

In [None]:
model3 = XGBClassifier(**best_parameters3)


In [None]:
model3.fit(x_train,y_train)


In [None]:
x_test_pred3 = model3.predict(x_test)


In [None]:
accuracy_score(y_test,x_test_pred3)


## Hyper-Parameter Tuning of Random Forest Model

In [None]:
model4 = RandomForestClassifier()


In [None]:
parameters4 = {'n_estimators': [100,300,500,550],
               'min_samples_split':[7,8,9],
               'max_depth': [10,11,12], 
               'min_samples_leaf':[4,5,6]}
    

In [None]:
grid_search4 = GridSearchCV(model4, parameters4, cv=5, n_jobs=-1)


In [None]:
grid_search4.fit(x_train,y_train.values.ravel())


In [None]:
grid_search4.best_score_


In [None]:
best_parameters4 = grid_search4.best_params_
best_parameters4

**Creating Random Forest Model Using Best Parameters**

In [None]:
model4 = RandomForestClassifier(**best_parameters4)


In [None]:
model4.fit(x_train,y_train)


In [None]:
x_test_pred4 = model4.predict(x_test)


In [None]:
accuracy_score(y_test,x_test_pred4)


## STACKING CLASSIFIER MODELS!

In [None]:
stacking_model = StackingClassifier(estimators=[('LGBM', model1), 
                                                ('CAT Boost', model2),
                                                ("XGBoost", model3),
                                                ('RF', model4)])

In [None]:
stacking_model.fit(x_train, y_train)


In [None]:
x_train_pred5 = stacking_model.predict(x_train)


In [None]:
x_test_pred5 = stacking_model.predict(x_test)


In [None]:
print("Stacking Model accuracy on Training Data is:",accuracy_score(y_train,x_train_pred5)*100)


In [None]:
print("Stacking Model accuracy on Testing Data is:",accuracy_score(y_test,x_test_pred5)*100)


## PREDICTING TEST DATA!

In [None]:
pred = stacking_model.predict(test_df)


## SUBMISSION - DATA FORMAT!

In [None]:
pass_df.head()


In [None]:
pass_df["Transported"] = pred


In [None]:
pass_df.head()


In [None]:
pass_df["Transported"].replace({1:True,0:False},inplace=True)


In [None]:
pass_df.head()


In [None]:
pass_df.shape


**Submission File**

In [None]:
pass_df.to_csv("spaceship_prediction_project.csv",index=False)
