In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Binary Classification on the Spaceship Titanic Dataset

### Importing Libraries

In [None]:
import ydata_profiling as pp
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from scipy import stats
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
sns.set(style="darkgrid",font_scale=1.5)
pd.set_option("display.max.rows",None)
pd.set_option("display.max.columns",None)

#from sklearn.svm import SVC
#from sklearn.naive_bayes import GaussianNB
#from sklearn.linear_model import LogisticRegression
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
#from xgboost import XGBClassifier
#from catboost import CatBoostClassifier
#from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler

from imblearn.over_sampling import SMOTE

### Loading the Datasets

In [None]:
train_df = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
test_df = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")

## Data Understanding

In [None]:
# Taking a look at the Dimensions of the Datasets
print("train_df shape is: ",train_df.shape)
print("testing_df shape is: ",test_df.shape)

In [None]:
# Taking a look at the datatypes on each dataset
print("Training Dataset Feature Datatypes:")
print(train_df.dtypes)
print("\n"+"-"*35)
print("\nTesting Dataset Feature Datatypes:")
print(test_df.dtypes)
print("\n"+"-"*35)
print("\nTraining Dataset info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
train_df.info()

In [None]:
# Taking a sneak peek at the training dataset
print("Training Dataset")
train_df.head()

In [None]:
# Taking another sneak peek at the testing dataset
print("Testing Dataset")
test_df.head()

In [None]:
#checking both datasets for duplicates
print(f"{train_df.duplicated().sum()} duplicates in the Training Dataset")
print(f"{test_df.duplicated().sum()} duplicates in the Testing Dataset")

In [None]:
# Checking the Missing values on each feature in the Training Dataset
# Count the nulls per feature once and keep only the features that have any.
null_counts = train_df.isnull().sum()
null_counts = null_counts[null_counts > 0]

Missing_Values = null_counts.to_frame(name="Number of Missing values")
Missing_Values["Number of Values"] = train_df.count()
Missing_Values["% of Missing Values"] = (100 * null_counts / len(train_df)).round(2)
Missing_Values

In [None]:
# Checking the Missing values on each feature in the Testing Dataset
# Count the nulls per feature once and keep only the features that have any.
null_counts = test_df.isnull().sum()
null_counts = null_counts[null_counts > 0]

Missing_Values = null_counts.to_frame(name="Number of Missing values")
Missing_Values["Number of Values"] = test_df.count()
Missing_Values["% of Missing Values"] = (100 * null_counts / len(test_df)).round(2)
Missing_Values

## EDA using ydata-profiling (formerly Pandas Profiling)

In [None]:
pp.ProfileReport(train_df)

## Feature Engineering

In [None]:
# Creating a function that creates New Features using the "PassengerId" Feature

def passengerid_new_features(*dfs):
    """Derive group-related features from "PassengerId", in place.

    "PassengerId" values are split on "_" into a group part and a member part.
    Each frame receives four new columns:

    * "Group"           - text before the first "_"
    * "Member"          - text between the first and second "_"
    * "Travelling_Solo" - True when the passenger's group has exactly one row
    * "Group_Size"      - number of rows sharing the passenger's "Group"

    Parameters
    ----------
    *dfs : pandas.DataFrame
        Frames containing a string "PassengerId" column. Each frame is
        modified in place and processed independently of the others.
    """
    for df in dfs:
        # Split "gggg_pp" once and reuse the pieces for both columns.
        id_parts = df["PassengerId"].str.split("_")
        df["Group"] = id_parts.str[0]
        df["Member"] = id_parts.str[1]

        # Size of each passenger's group, broadcast back to every row. This
        # replaces a Python loop that rescanned the whole frame once per group.
        group_size = df.groupby("Group")["Member"].transform("count")

        # A passenger travels solo when nobody else shares the group id.
        df["Travelling_Solo"] = group_size.eq(1)
        df["Group_Size"] = group_size

In [None]:
# Using the function created on both the training and testing datasets

passengerid_new_features(train_df, test_df)

In [None]:
train_df.head()

In [None]:
# Dropping features "Group" and "Member" since they carry information we dont need at the moment
train_df.drop(columns=["Group","Member"],inplace=True)
test_df.drop(columns=["Group","Member"],inplace=True)

In [None]:
# Creating a function that creates New Features using the "Cabin" Feature

def cabin_new_feature(*dfs):
    """Split "Cabin" ("deck/number/side") into three imputed columns, in place.

    For every frame:

    * missing "Cabin" values are replaced by the placeholder
      "np.nan/np.nan/np.nan" (kept for compatibility with the rest of the
      notebook, which expects "Cabin" to contain no nulls afterwards);
    * "Cabin_Deck" and "Cabin_Side" are filled with their most frequent value;
    * "Cabin_Number" is converted to a numeric column and filled with its
      median.

    The fill values are computed separately for each frame passed in.

    Parameters
    ----------
    *dfs : pandas.DataFrame
        Frames containing a "Cabin" column. Each frame is modified in place.
    """
    for df in dfs:
        # Assign the result back instead of calling fillna(inplace=True) on a
        # column selection: the chained in-place form is deprecated in pandas
        # and does not update the frame under copy-on-write.
        df["Cabin"] = df["Cabin"].fillna("np.nan/np.nan/np.nan")

        pieces = df["Cabin"].str.split("/")
        df["Cabin_Deck"] = pieces.str[0]
        df["Cabin_Number"] = pieces.str[1]
        df["Cabin_Side"] = pieces.str[2]

        # Turn the placeholder pieces back into real missing values.
        cols = ["Cabin_Deck", "Cabin_Number", "Cabin_Side"]
        df[cols] = df[cols].replace("np.nan", np.nan)

        # The cabin number arrives as text; a median is only defined once the
        # column is numeric.
        df["Cabin_Number"] = pd.to_numeric(df["Cabin_Number"])

        df["Cabin_Deck"] = df["Cabin_Deck"].fillna(df["Cabin_Deck"].mode()[0])
        df["Cabin_Side"] = df["Cabin_Side"].fillna(df["Cabin_Side"].mode()[0])
        df["Cabin_Number"] = df["Cabin_Number"].fillna(df["Cabin_Number"].median())


In [None]:
# Now, we are using this function on both datasets
cabin_new_feature(train_df, test_df)

In [None]:
train_df.head()

In [None]:
# Creating a function that creates a new Feature "Cabin_Regions" From "Cabin_Number"

def cabin_regions(*dfs):
    """Add six boolean "Cabin_Region<k>" columns derived from "Cabin_Number".

    Region 1 covers numbers below 300, regions 2-5 cover consecutive
    half-open ranges of width 300 starting at 300, and region 6 covers 1500
    and above. Each frame is modified in place.
    """
    for frame in dfs:
        # Convert once and reuse for every comparison.
        number = frame["Cabin_Number"].astype(float)

        frame["Cabin_Region1"] = number < 300
        # Regions 2..5: [300, 600), [600, 900), [900, 1200), [1200, 1500)
        for region, lower in enumerate(range(300, 1500, 300), start=2):
            frame[f"Cabin_Region{region}"] = (number >= lower) & (number < lower + 300)
        frame["Cabin_Region6"] = number >= 1500

In [None]:
# Run the function on both datasets
cabin_regions(train_df, test_df)

In [None]:
train_df.head()

In [None]:
# The "Cabin_Number" Feature carries information we dont need anymore so we are going to drop it on both datasets
train_df.drop(columns=["Cabin_Number"],inplace=True)
test_df.drop(columns=["Cabin_Number"],inplace=True)

In [None]:
# # Creating a function that creates a new Feature "Age_Group" From "Age"
def age_group(*dfs):
    """Add a categorical "Age Group" column binned from "Age", in place.

    Rows whose "Age" is missing get a missing "Age Group".
    """
    # Left-closed bins: [0, 12), [12, 18), [18, 25), [25, 32), [32, 50), [50, inf)
    age_edges = [0, 12, 18, 25, 32, 50, float('inf')]
    # NOTE(review): because the bins are left-closed, the boundary ages 18, 25
    # and 32 receive the labels "19-25", "26-32" and "33-50" respectively.
    # Confirm whether the labels or the bin edges express the intended grouping.
    age_labels = ["0-12", "12-18", "19-25", "26-32", "33-50", "50+"]

    for frame in dfs:
        frame["Age Group"] = pd.cut(
            frame["Age"],
            bins=age_edges,
            labels=age_labels,
            right=False,
        )

In [None]:
age_group(train_df, test_df)

In [None]:
train_df.head()

In [None]:
# Creating New Features Using All Expenditure Features

# Amenity columns whose values are added up into the total expenditure.
exp_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]


def new_exp_features(*dfs):
    """Add "Total Expenditure" and "No Spending" columns to each frame, in place.

    "Total Expenditure" is the row-wise sum of the columns in ``exp_cols``
    (missing amounts are skipped by the sum); "No Spending" is True where that
    total equals 0.
    """
    for frame in dfs:
        total = frame[exp_cols].sum(axis=1)
        frame["Total Expenditure"] = total
        frame["No Spending"] = total.eq(0)

In [None]:
new_exp_features(train_df, test_df)

In [None]:
train_df.head()

In [None]:
# Generating some statistical information from the Total Expenditure feature

stat_cols = ["PassengerId", "Total Expenditure", "No Spending"]
stat_df = train_df[stat_cols]

pp.ProfileReport(stat_df)

In [None]:
# Using the measures of central tendency to categorise each expense

def expenditure_category(*dfs):
    """Add a categorical "Expenditure Category" column to each frame, in place.

    "Total Expenditure" is bucketed into right-closed intervals:

    * (-1, 0]       -> "No Expense"
    * (0, 716]      -> "Low Expense"
    * (716, 1441]   -> "Medium Expense"
    * (1441, inf)   -> "High Expense"

    The bins must be right-closed. With left-closed bins a total of exactly 0
    lands in [0, 716) and is labelled "Low Expense", which leaves the
    "No Expense" bucket permanently empty.

    Parameters
    ----------
    *dfs : pandas.DataFrame
        Frames containing a numeric "Total Expenditure" column.
    """
    # Thresholds were read off the profiling report of the training data
    # (described by the author as quartile-based) - TODO confirm if the data changes.
    bins = [-1, 0, 716, 1441, float('inf')]
    labels = ["No Expense", "Low Expense", "Medium Expense", "High Expense"]

    for df in dfs:
        df["Expenditure Category"] = pd.cut(
            df["Total Expenditure"], bins=bins, labels=labels, right=True
        )

In [None]:
expenditure_category(train_df, test_df)

In [None]:
train_df.head()

## Data PreProcessing

### Visualizing Missing Numbers

In [None]:
msno.bar(train_df,color="C3",fontsize=22)
plt.show()

### Handling Missing Values

In [None]:
# Selecting the categorical features and excluding the "Transported" Feature
categorical_cols = train_df.select_dtypes(include=["object","bool"]).columns.tolist()
categorical_cols.remove("Transported")
num_cols = train_df.select_dtypes(include=["int","float"]).columns.tolist()

print("Categorical Columns:",categorical_cols)
print("\n","-"*35)
print("\nNumerical Columns:",num_cols)

In [None]:
# Using Simple Imputer Library to Fill Missing Values
imputer1 = SimpleImputer(strategy="most_frequent")     ##To fill Categorical Features.
imputer2 = SimpleImputer(strategy="median")            ##To fill numeircal features.

In [None]:
def fill_missingno(*dfs):
    """Impute missing values in ``categorical_cols`` and ``num_cols``, in place.

    The imputers are fitted on the FIRST frame only (the training data in this
    notebook). Every further frame is transformed with those fitted
    statistics, so all frames are filled with the same most-frequent and
    median values rather than each frame being filled from its own
    distribution.

    Parameters
    ----------
    *dfs : pandas.DataFrame
        Frames to impute; the first one is the reference the imputers learn
        from. Calling the function with no frames does nothing.
    """
    if not dfs:
        return

    reference, *others = dfs

    # Learn the fill values from the reference frame and apply them to it.
    reference[categorical_cols] = imputer1.fit_transform(reference[categorical_cols])
    reference[num_cols] = imputer2.fit_transform(reference[num_cols])

    # Reuse the already-fitted imputers for every remaining frame.
    for df in others:
        df[categorical_cols] = imputer1.transform(df[categorical_cols])
        df[num_cols] = imputer2.transform(df[num_cols])

In [None]:
# Running the Missing Values handling function
fill_missingno(train_df, test_df)

### Visualizing, and checking if we still have missing values

In [None]:
msno.bar(train_df,color="C3",fontsize=22)
plt.show()

As you can see, we still have missing values in the "Age Group" feature we created: it was derived from "Age" before the missing ages were imputed, so we have to re-run the age-group function on both datasets.

In [None]:
# Drop the Age_Group feature
train_df.drop(columns=["Age Group"],inplace=True)
test_df.drop(columns=["Age Group"],inplace=True)

#Re-run the Age Group function
age_group(train_df, test_df)

In [None]:
#Checking again for missing values
msno.bar(train_df,color="C3",fontsize=22)
plt.show()

It looks like we have handled all the missing values.

### Checking for Duplicates in Data

In [None]:
print("In training data is: ",train_df.duplicated().sum())
print("In testing data is: ",test_df.duplicated().sum())

In [None]:
pp.ProfileReport(train_df)

Based on the summary we got from the profiling report:

**PassengerId** has a high cardinality: 8693 distinct values	***High cardinality***

**Cabin** has a high cardinality: 6561 distinct values	***High cardinality***

**Name** has a high cardinality: 8473 distinct values	***High cardinality***

In [None]:
# Dropping Categorical Features with High Cardinality

# Save the PassengerId feature from the test data; it is needed when submitting
# predictions on Kaggle. Take an explicit copy so that adding a column to it
# later writes to an independent frame instead of a selection of test_df.
Test_PassengerId = test_df[["PassengerId"]].copy()

cols = ["PassengerId","Cabin","Name"]
train_df.drop(columns =cols, inplace=True)
test_df.drop(columns=cols, inplace=True)

In [None]:
# Applying Log Transformation on Expenditure Features, which helps with handling the skewed dataset

cols = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Total Expenditure']

# log(1 + x) is applied to all expenditure columns of each frame in one step.
train_df[cols] = np.log(1 + train_df[cols])
test_df[cols] = np.log(1 + test_df[cols])

In [None]:
# Changing Data-Type to Boolean

cols = ["CryoSleep","VIP","Travelling_Solo","No Spending","Cabin_Region1","Cabin_Region2","Cabin_Region3","Cabin_Region4",
       "Cabin_Region5","Cabin_Region6"]

train_df[cols] = train_df[cols].astype(bool)
test_df[cols] = test_df[cols].astype(bool)

### Feature Encoding

In [None]:
nominal_cat_cols = ["HomePlanet","Destination"]
ordinal_cat_cols = ["CryoSleep","VIP","Travelling_Solo","Cabin_Deck","Cabin_Side","Cabin_Region1","Cabin_Region2",
                    "Cabin_Region3","Cabin_Region4","Cabin_Region5","Cabin_Region6","Age Group","No Spending",
                    "Expenditure Category"]

In [None]:
# Label Encoding
# Fit one encoder per column on the categories seen in either dataset, then
# apply that same mapping to both frames. Fitting separately on train and test
# can assign different integers to the same category whenever the two frames
# do not contain exactly the same set of values.
for col in ordinal_cat_cols:
    enc = LabelEncoder()
    enc.fit(pd.concat([train_df[col], test_df[col]], ignore_index=True))
    train_df[col] = enc.transform(train_df[col])
    test_df[col] = enc.transform(test_df[col])

In [None]:
# One Hot Encoding
train_df = pd.get_dummies(train_df,columns=nominal_cat_cols)
test_df = pd.get_dummies(test_df,columns=nominal_cat_cols)

# Make the test frame expose exactly the training feature columns, in the same
# order: dummy columns missing from the test data are added as all-zero, and
# dummy columns that never occurred in training are dropped.
feature_cols = [col for col in train_df.columns if col != "Transported"]
test_df = test_df.reindex(columns=feature_cols, fill_value=0)

In [None]:
train_df.columns

In [None]:
# Still have one feature Transported left for encoding in training dataset
# Assign the converted column back: calling replace(..., inplace=True) on a
# column selection is deprecated in pandas and leaves train_df unchanged under
# copy-on-write. False -> 0, True -> 1.
train_df["Transported"] = train_df["Transported"].astype(int)

### Selecting Features & Labels For Model Training.

In [None]:
X = train_df.drop(columns=["Transported"])
y = train_df[["Transported"]]

### Feature Scaling

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Scale the test features with the mean/variance learned from the training
# features. Refitting the scaler on the test data would put the two sets on
# different scales. The reindex guarantees the same columns in the same order.
test_df_scaled = scaler.transform(test_df.reindex(columns=X.columns, fill_value=0))

### Splitting Data For Model w/ Unscaled Data

In [None]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)
print(x_train.shape, y_train.shape)
print(x_test.shape,y_test.shape)

### Splitting Data for Model w/ Scaled Data

In [None]:
x_train1, x_test1, y_train1, y_test1 = train_test_split(X_scaled,y,test_size=0.2,random_state=0)
print(x_train1.shape, y_train1.shape)
print(x_test1.shape, y_test1.shape)

## Model Building For Scaled Data

In [None]:
training_score = []
testing_score = []

In [None]:
# Model Building function
def model_prediction_S(model):
    """Fit ``model`` on the scaled training split and report its performance.

    Uses the module-level scaled split (x_train1, x_test1, y_train1, y_test1).
    Appends the train/test accuracy (in percent) to ``training_score`` and
    ``testing_score``, prints accuracy, precision, recall and F1, and draws
    the confusion matrix of the test predictions as a heatmap.
    """
    model.fit(x_train1, y_train1)
    train_pred = model.predict(x_train1)
    test_pred = model.predict(x_test1)

    train_acc = accuracy_score(y_train1, train_pred) * 100
    test_acc = accuracy_score(y_test1, test_pred) * 100
    training_score.append(train_acc)
    testing_score.append(test_acc)

    divider = "\n------------------------------------------------------------------------"

    print(f"Accuracy_Score of {model} model on Training Data is:", train_acc)
    print(f"Accuracy_Score of {model} model on Testing Data is:", test_acc)
    print(divider)
    # The three test-set metrics share one message template.
    for metric_name, metric_fn in (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    ):
        print(f"{metric_name} Score of {model} model is:", metric_fn(y_test1, test_pred))
    print(divider)
    print(f"Confusion Matrix of {model} model is:")
    matrix = confusion_matrix(y_test1, test_pred)
    plt.figure(figsize=(8, 4))
    sns.heatmap(matrix, annot=True, fmt="g", cmap="summer")
    plt.show()

### 1. Multi-layer Perceptron Classifier Model

In [None]:
clf = MLPClassifier(
    solver='adam',
    learning_rate_init = 0.001,
    alpha=1e-5,
    hidden_layer_sizes=(100, 50),
    activation='relu',
    random_state=1,
    batch_size = 32,
    max_iter = 200,
    early_stopping=True,
    n_iter_no_change = 10,
    learning_rate = 'adaptive'
)

In [None]:
model_prediction_S(clf)

## Performing ***hyperparameter tuning*** using a technique called GridSearchCV

In [None]:
def model_prediction_with_tuning(model):
    """Grid-search ``model`` on the scaled training split and report accuracy.

    Runs a 5-fold ``GridSearchCV`` (scoring="accuracy") over the parameter
    grid below using x_train1 / y_train1, evaluates the refitted best
    estimator on the scaled train and test splits, appends both accuracies
    (in percent) to ``training_score`` / ``testing_score`` and prints them
    together with the best hyperparameters.

    Returns
    -------
    estimator
        The best estimator found by the search. It is returned so that the
        tuned model can actually be used afterwards; ``model`` itself is not
        modified by the search.
    """
    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'hidden_layer_sizes': [(100, 50), (200, 100), (50, 25)],
        'alpha': [1e-6,1e-5, 1e-4, 1e-3],
        'learning_rate_init': [0.001, 0.01, 0.1],
        'batch_size': [32,64],
        'early_stopping':[True],
        'n_iter_no_change':[10,20]
    }

    # Initialize GridSearchCV
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )

    # The target is flattened to a 1-D array before fitting.
    y_train1_np = np.array(y_train1)

    # Fit the grid search to the training data
    grid_search.fit(x_train1, y_train1_np.ravel())

    # Retrieve the best hyperparameters and best model
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    # Use the best model for prediction
    x_train_pred1 = best_model.predict(x_train1)
    x_test_pred1 = best_model.predict(x_test1)

    # Compute and store accuracy scores
    a = accuracy_score(y_train1, x_train_pred1) * 100
    b = accuracy_score(y_test1, x_test_pred1) * 100
    training_score.append(a)
    testing_score.append(b)

    # Print results, including best hyperparameters
    print(f"Best hyperparameters: {best_params}")
    print(f"Accuracy_Score of {best_model} model on Training Data is:", a)
    print(f"Accuracy_Score of {best_model} model on Testing Data is:", b)

    # Hand the tuned estimator back instead of discarding it.
    return best_model


In [None]:
model_prediction_with_tuning(clf)

### Model Building For Unscaled Data

In [None]:
# Model Building function x_train,x_test,y_train,y_test
def model_prediction_US(model):
    """Fit ``model`` on the unscaled training split and report its performance.

    Uses the module-level unscaled split (x_train, x_test, y_train, y_test).
    Appends the train/test accuracy (in percent) to ``training_score`` and
    ``testing_score``, prints accuracy, precision, recall and F1, and draws
    the confusion matrix of the test predictions as a heatmap.
    """
    model.fit(x_train, y_train)
    train_pred = model.predict(x_train)
    test_pred = model.predict(x_test)

    train_acc = accuracy_score(y_train, train_pred) * 100
    test_acc = accuracy_score(y_test, test_pred) * 100
    training_score.append(train_acc)
    testing_score.append(test_acc)

    divider = "\n------------------------------------------------------------------------"

    print(f"Accuracy_Score of {model} model on Training Data is:", train_acc)
    print(f"Accuracy_Score of {model} model on Testing Data is:", test_acc)
    print(divider)
    # The three test-set metrics share one message template.
    for metric_name, metric_fn in (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    ):
        print(f"{metric_name} Score of {model} model is:", metric_fn(y_test, test_pred))
    print(divider)
    print(f"Confusion Matrix of {model} model is:")
    matrix = confusion_matrix(y_test, test_pred)
    plt.figure(figsize=(8, 4))
    sns.heatmap(matrix, annot=True, fmt="g", cmap="summer")
    plt.show()

In [None]:
model_prediction_US(clf)

## Performing ***hyperparameter tuning*** using a technique called GridSearchCV

In [None]:
def model_prediction_with_tuning_US(model):
    """Grid-search ``model`` on the unscaled training split and report accuracy.

    Runs a 5-fold ``GridSearchCV`` (scoring="accuracy") over the parameter
    grid below using x_train / y_train, evaluates the refitted best estimator
    on the unscaled train and test splits, appends both accuracies (in
    percent) to ``training_score`` / ``testing_score`` and prints them
    together with the best hyperparameters.

    Returns
    -------
    estimator
        The best estimator found by the search. It is returned so that the
        tuned model can actually be used afterwards; ``model`` itself is not
        modified by the search.
    """
    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'hidden_layer_sizes': [(100, 50), (200, 100), (50, 25)],
        'alpha': [1e-6,1e-5, 1e-4, 1e-3],
        'learning_rate_init': [0.001, 0.01, 0.1],
        'batch_size': [32,64],
        'early_stopping':[True],
        'n_iter_no_change':[10,20]
    }

    # Initialize GridSearchCV
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )

    # The target is flattened to a 1-D array before fitting.
    y_train_np = np.array(y_train)

    # Fit the grid search to the training data
    grid_search.fit(x_train, y_train_np.ravel())

    # Retrieve the best hyperparameters and best model
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    # Use the best model for prediction
    x_train_pred = best_model.predict(x_train)
    x_test_pred = best_model.predict(x_test)

    # Compute and store accuracy scores
    a = accuracy_score(y_train, x_train_pred) * 100
    b = accuracy_score(y_test, x_test_pred) * 100
    training_score.append(a)
    testing_score.append(b)

    # Print results, including best hyperparameters
    print(f"Best hyperparameters: {best_params}")
    print(f"Accuracy_Score of {best_model} model on Training Data is:", a)
    print(f"Accuracy_Score of {best_model} model on Testing Data is:", b)

    # Hand the tuned estimator back instead of discarding it.
    return best_model


In [None]:
model_prediction_with_tuning_US(clf)

# Submission- Data Format

In [None]:
pred = clf.predict(test_df)

pred

In [None]:
Test_PassengerId.head()

In [None]:
Test_PassengerId["Transported"] = pred

In [None]:
Test_PassengerId.head()

In [None]:
Submission_Dataset = Test_PassengerId
Submission_Dataset.head()

In [None]:
# Convert the 0/1 predictions to the True/False values written to the
# submission file. The result is assigned back because
# replace(..., inplace=True) on a column selection is deprecated in pandas and
# leaves the frame unchanged under copy-on-write.
Submission_Dataset["Transported"] = Submission_Dataset["Transported"].astype(bool)

In [None]:
Submission_Dataset.head()

In [None]:
Submission_Dataset.shape

In [None]:
Submission_Dataset.to_csv("spaceship_prediction_DS_Project.csv",index=False)