# Import Necessary Libraries

In [None]:
# Data Wrangling libraries
import numpy as np
import pandas as pd
import scipy.stats as stats

# Visualization Libraries
from IPython.display import display,HTML
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing libraries
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# Machine Learning Estimators
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

# Metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Visualization Library
from visualizationfunctions import *

# Importing the dataset

In [None]:
# Importing the dataset
# Both CSV paths are relative to the notebook's working directory.
spaceship_train_data = pd.read_csv('../input/spaceship-titanic/train.csv')
spaceship_test_data = pd.read_csv('../input/spaceship-titanic/test.csv')
# Preview the first rows of the training set
spaceship_train_data.head()

# Data Preprocessing

## Filling in missing values

In [None]:
# Check for missing values: per-column count of nulls in the test set
spaceship_test_data.isna().sum()

In [None]:
# Helper that imputes missing values with scikit-learn's SimpleImputer
def fill_null_values(data, strategy='constant', fill_value=None):
    """Return ``data`` with its NaN entries imputed.

    Parameters
    ----------
    data : 2-D array-like or DataFrame
        Values to impute; passed straight to ``SimpleImputer.fit_transform``.
    strategy : str, default 'constant'
        Imputation strategy forwarded to ``SimpleImputer``.
    fill_value : object, default None
        Replacement value forwarded to ``SimpleImputer``.

    Returns
    -------
    The imputed data as produced by ``SimpleImputer.fit_transform``.
    """
    imputer_options = {
        'missing_values': np.nan,
        'strategy': strategy,
        'fill_value': fill_value,
    }
    # The imputer is fitted on the very data it transforms.
    return SimpleImputer(**imputer_options).fit_transform(data)

### CryoSleep and VIP Column (Boolean Columns)

In [None]:
COLUMN_NAME = 'CryoSleep'

# Summarise the training column before imputing: null count and value distribution
train_column = spaceship_train_data[COLUMN_NAME]
print(f"Missing Values: {train_column.isna().sum()}")
print(f"\nValue Counts:\n{train_column.value_counts()}")

# Replace nulls with False (presumably the most frequent value in the counts
# printed above) in both the train and the test set
for dataset in (spaceship_train_data, spaceship_test_data):
    dataset[COLUMN_NAME] = fill_null_values(data=dataset[[COLUMN_NAME]],
                                            strategy='constant',
                                            fill_value=False)

# Confirm that no nulls remain in the training column
remaining_nulls = spaceship_train_data[COLUMN_NAME].isna().sum()
print(f"\nNumber of missing values after imputing: {remaining_nulls}")

In [None]:
COLUMN_NAME = 'VIP'

# Summarise the training column before imputing: null count and value distribution
train_column = spaceship_train_data[COLUMN_NAME]
print(f"Missing Values: {train_column.isna().sum()}")
print(f"\nValue Counts:\n{train_column.value_counts()}")

# Replace nulls with False (presumably the most frequent value in the counts
# printed above) in both the train and the test set
for dataset in (spaceship_train_data, spaceship_test_data):
    dataset[COLUMN_NAME] = fill_null_values(data=dataset[[COLUMN_NAME]],
                                            strategy='constant',
                                            fill_value=False)

# Confirm that no nulls remain in the training column
remaining_nulls = spaceship_train_data[COLUMN_NAME].isna().sum()
print(f"\nNumber of missing values after imputing: {remaining_nulls}")

### Cabin Column


In [None]:
COLUMN_NAME = 'Cabin'

# Split every known (non-null) training cabin of the form 'deck/num/side' once
known_cabins = spaceship_train_data.loc[spaceship_train_data[COLUMN_NAME].notna(), COLUMN_NAME]
cabin_parts = known_cabins.apply(lambda cabin: str(cabin).split('/'))

# Most frequent deck letter (first component)
deck = cabin_parts.apply(lambda parts: parts[0])
print(f'Most frequent deck letter: {deck.value_counts().index[0]}')

# Median cabin number (second component, parsed as float)
cabin_nums = cabin_parts.apply(lambda parts: parts[1])
print(f"Median Cabin number: {cabin_nums.astype('float').median()}")

# Most frequent side letter (third component)
side = cabin_parts.apply(lambda parts: parts[2])
print(f'Most frequent side letter: {side.value_counts().index[0]}')

In [None]:
# Report how many training cabins are missing
# (COLUMN_NAME is still 'Cabin', set by the previous cell)
print(f"Missing Values: {spaceship_train_data[COLUMN_NAME].isna().sum()}")

# Replace nulls with the constant 'F/427/S' in both the train and the test set
# (presumably the deck / median number / side printed by the previous cell)
for dataset in (spaceship_train_data, spaceship_test_data):
    dataset[COLUMN_NAME] = fill_null_values(data=dataset[[COLUMN_NAME]],
                                            strategy='constant',
                                            fill_value='F/427/S')

# Confirm that no nulls remain in the training column
remaining_nulls = spaceship_train_data[COLUMN_NAME].isna().sum()
print(f"\nNumber of missing values after imputing: {remaining_nulls}")

### Destination and HomePlanet Column

In [None]:
COLUMN_NAME = 'Destination'

# Summarise the training column before imputing: null count and value distribution
train_column = spaceship_train_data[COLUMN_NAME]
print(f"Missing Values: {train_column.isna().sum()}")
print(f"\nValue Counts:\n{train_column.value_counts()}")

# Replace nulls with 'TRAPPIST-1e' (presumably the most frequent value in the
# counts printed above) in both the train and the test set
for dataset in (spaceship_train_data, spaceship_test_data):
    dataset[COLUMN_NAME] = fill_null_values(data=dataset[[COLUMN_NAME]],
                                            strategy='constant',
                                            fill_value='TRAPPIST-1e')

# Confirm that no nulls remain in the training column
remaining_nulls = spaceship_train_data[COLUMN_NAME].isna().sum()
print(f"\nNumber of missing values after imputing: {remaining_nulls}")

In [None]:
COLUMN_NAME = 'HomePlanet'

# Summarise the training column before imputing: null count and value distribution
train_column = spaceship_train_data[COLUMN_NAME]
print(f"Missing Values: {train_column.isna().sum()}")
print(f"\nValue Counts:\n{train_column.value_counts()}")

# Replace nulls with 'Earth' (presumably the most frequent value in the counts
# printed above) in both the train and the test set
for dataset in (spaceship_train_data, spaceship_test_data):
    dataset[COLUMN_NAME] = fill_null_values(data=dataset[[COLUMN_NAME]],
                                            strategy='constant',
                                            fill_value='Earth')

# Confirm that no nulls remain in the training column
remaining_nulls = spaceship_train_data[COLUMN_NAME].isna().sum()
print(f"\nNumber of missing values after imputing: {remaining_nulls}")

### Numerical Columns

In [None]:
# Numerical columns whose missing values are replaced by the column mean
NUMERICAL_COLUMNS = ['Age', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck', 'FoodCourt']

# Learn the means from the training set only, then reuse those same means for
# the test set. This mirrors the categorical columns (train-derived constants
# applied to both sets) and keeps the two sets on identical preprocessing
# instead of imputing the test set with its own statistics.
numeric_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
spaceship_train_data[NUMERICAL_COLUMNS] = numeric_imputer.fit_transform(
    spaceship_train_data[NUMERICAL_COLUMNS])
spaceship_test_data[NUMERICAL_COLUMNS] = numeric_imputer.transform(
    spaceship_test_data[NUMERICAL_COLUMNS])

In [None]:
# Re-check the per-column null counts of the test set after imputation
spaceship_test_data.isna().sum()

## Drop Unnecessary Columns


In [None]:
# Remove the Name and PassengerId columns from both datasets, in place
for dataset in (spaceship_train_data, spaceship_test_data):
    dataset.drop(columns=['Name', 'PassengerId'], inplace=True)

In [None]:
# Preview the training set after dropping the unused columns
spaceship_train_data.head()

## Converting all categories into numbers and normalizing the data

In [None]:
# Category -> number conversion

def preprocessing(data, training=False):
    """Encode the Spaceship Titanic features as numbers.

    The function works on a copy, so the caller's DataFrame is left untouched.

    Steps:
      * cast ``VIP`` and ``CryoSleep`` (and ``Transported`` when ``training``
        is true) to integers;
      * split ``Cabin`` ('deck/number/side') into ``Deck`` and ``Side`` and
        keep the integer cabin number in ``Cabin``;
      * one-hot encode ``HomePlanet``, ``Destination``, ``Deck`` and ``Side``,
        dropping the first level of each.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns listed above.
    training : bool, default False
        Set to True when ``data`` carries the ``Transported`` target column.

    Returns
    -------
    pandas.DataFrame
        A new, encoded frame.

    Raises
    ------
    ValueError
        If a flag or a cabin number cannot be converted to an integer
        (for example a missing or malformed ``Cabin`` value).
    """
    # Copy first so the transformations below never leak into the input frame.
    data = data.copy()

    binary_columns = ['VIP', 'CryoSleep']
    if training:
        binary_columns.insert(0, 'Transported')
    for column in binary_columns:
        data[column] = data[column].astype(int)

    # 'deck/number/side' -> three parts; Cabin is overwritten last because the
    # other two columns are derived from its original text.
    cabin_parts = data['Cabin'].astype(str).str.split('/')
    data['Deck'] = cabin_parts.str[0]
    data['Side'] = cabin_parts.str[2]
    data['Cabin'] = cabin_parts.str[1].astype(int)

    data = pd.get_dummies(data, columns=['HomePlanet', 'Destination', 'Deck', 'Side'], drop_first=True)

    return data

In [None]:
# Apply the preprocessing function
# The train set carries the Transported target (training=True); the test set does not.
# NOTE(review): the two sets are one-hot encoded independently, so their dummy
# columns match only if both contain the same category levels - confirm.
spaceship_train_data = preprocessing(spaceship_train_data, True)
spaceship_test_data = preprocessing(spaceship_test_data)

# Exploratory Data Analysis

In [None]:
# Display the first rows of the preprocessed train dataset as an HTML table
display(HTML(spaceship_train_data.head().to_html()))

In [None]:
# View Dataset Statistics: describe() summary of the train set, rendered as HTML
display(HTML(spaceship_train_data.describe().to_html()))

In [None]:
# Make sure the target (Transported) is stored as integers, 1: True and 0: False.
# The result is assigned back; casting an already-integer column is a no-op,
# so re-running this cell is safe.
spaceship_train_data['Transported'] = spaceship_train_data['Transported'].astype(int)

# Positional indices of the rows whose target equals 1
transported_true = np.flatnonzero(spaceship_train_data['Transported'].to_numpy() == 1).tolist()
n_vertical = 5 #vertical resolution of the contour data
X = spaceship_train_data.drop('Transported', axis=1)

# plot_countor_map is not defined in this notebook; it presumably comes from
# the visualizationfunctions star import.
plot_countor_map(spaceship_train_data, transported_true, n_vertical, X)

In [None]:
# Heatmap of the pairwise correlations of the train set.
# plot_heatmap is not defined in this notebook; presumably it comes from the
# visualizationfunctions star import.
sns.set(style = 'whitegrid', rc = {'figure.figsize': (20,15)})
plot_heatmap(
    height=15,
    data = spaceship_train_data.corr(),
    title = 'Spaceship Train Dataset Correlation Overview',
    subtitle = 'Method of correlation: Pearson Correlation Coefficient'
);

In [None]:
# Silence NumPy divide-by-zero and invalid-operation warnings for the analysis below
np.seterr(divide='ignore', invalid='ignore')
sns.set(style = 'whitegrid',
            rc = {'figure.figsize': (20,10)})

# anova is not defined in this notebook (presumably from visualizationfunctions);
# it is called with the train set and the target column name.
anv = anova(spaceship_train_data, 'Transported', title='Feature Disparity')

In [None]:
# Spearman correlation of the features with the target.
sns.set(style = 'whitegrid',
            rc = {'figure.figsize': (20, 10)})
sns.despine(left=True, bottom=True)
    
# Despite its name, `qualitative` holds every column except the target
qualitative = spaceship_train_data.drop('Transported', axis=1).columns.to_list()
target = 'Transported'
# spearman is not defined in this notebook (presumably from visualizationfunctions)
spearman(spaceship_train_data, 
         qualitative, target, 
         'Spearman Correlation',
         'Correlation analysis for spaceship titanic features with target variable (Transported)')

# Training and Evaluation using Different Estimators

## Splitting Spaceship Train Data into Training and Validation sets

In [None]:
# Separate the features from the target
X = spaceship_train_data.drop('Transported', axis=1)
y = spaceship_train_data['Transported']

# Hold out 20% of the rows for validation, keeping the class ratio of the
# target in both parts (stratify). The fixed random_state makes the split -
# and therefore every score and "best hyperparameter" choice below -
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X,
                                                  y,
                                                  test_size=0.2,
                                                  stratify=y,
                                                  random_state=42)

In [None]:
# Class balance of the target in the training split.
# The 0/1 labels are mapped to readable names before plotting; plot_countplot
# is not defined in this notebook (presumably from visualizationfunctions).
sns.set(style = 'whitegrid',
            rc = {'figure.figsize': (20,5)})
plot_countplot(y = y_train.astype(str).replace({"0": "Not Transported [0]", "1": "Transported [1]"}),
               title = 'Countplot of Target variable (Transported) from the spaceship train dataset',
               height = 3);

In [None]:
# Class balance of the target in the validation split.
# The 0/1 labels are mapped to readable names before plotting; plot_countplot
# is not defined in this notebook (presumably from visualizationfunctions).
sns.set(style = 'whitegrid',
            rc = {'figure.figsize': (20,5)})
plot_countplot(y = y_val.astype(str).replace({"0": "Not Transported [0]", "1": "Transported [1]"}), 
               title = 'Countplot of Target variable (Transported) from the spaceship validation dataset',
               height = 3);

## K Nearest Neighbors (KNN)
The k-nearest neighbors algorithm, also known as KNN or k-NN, is a non-parametric, supervised learning classifier, which uses proximity to make classifications or predictions about the grouping of an individual data point.

In [None]:
# Candidate numbers of neighbours; one model and one confusion matrix per value
n_neighbors = [4, 5, 6]
fig,axes = plt.subplots(1,3,figsize = (15,5)) #create a 1x3 grid of axes
perfs = []  # (training score, validation score) per candidate
predictions = []  # validation-set predictions per candidate

for i,(n,ax) in enumerate(zip(n_neighbors,axes)):

    #create and train the model
    model = KNeighborsClassifier(n_neighbors=n)
    model.fit(X_train,y_train)

    #predict the validation set and store the predictions
    y_pred = model.predict(X_val)
    predictions.append(y_pred)

    #score the model on the training and the validation split
    train_score = model.score(X_train,y_train)
    val_score =  model.score(X_val,y_val) 

    perfs.append((train_score,val_score))
    # plot_confusion_matrix is not imported in this notebook; presumably it
    # comes from the visualizationfunctions star import
    plot_confusion_matrix(y_val, y_pred, train_score, val_score, hp_name=f'Neighbors = {n}', ax=ax, i=i)

plt.suptitle(f"Confusion Matrices (K-Nearest Neighbors)")
plt.tight_layout();

In [None]:
# Predict using KNN Classifier
knn_model = KNeighborsClassifier(n_neighbors=5) # Initialize KNN with the hyperparameters from the best performing model
y_pred = knn_model.fit(X_train, y_train).predict(X_val)

sns.set(style = 'whitegrid', rc = {'figure.figsize': (20,15)})
sns.set_palette('hls')
    
knn_results = graph_classification_reports(y_pred, y_val, title='KNN Model Classification Reports')

In [None]:
# Compare the training and validation scores collected in `perfs` for each
# n_neighbors value of the KNN sweep.
# NOTE(review): `perfs` is overwritten by every sweep cell, so this must run
# right after the KNN sweep.
plot_model_performance(parameters=n_neighbors,
                       title='Training and Testing Model Performances',
                       perfs=perfs,
                       x_labels='KNN Model')

## Logistic Regression
In statistics, the logistic model is a statistical model that models the probability of one event taking place by having the log-odds for the event be a linear combination of one or more independent variables. In regression analysis, logistic regression is estimating the parameters of a logistic model.

In [None]:
# Candidate values of the C hyperparameter; one model and one confusion matrix per value
C = [0.1, 1, 10]
fig,axes = plt.subplots(1,3,figsize = (15,5)) #create a 1x3 grid of axes
perfs = []  # (training score, validation score) per candidate
predictions = []  # validation-set predictions per candidate

for i,(c,ax) in enumerate(zip(C,axes)):

    #create and train the model
    model = LogisticRegression(C=c, max_iter=10000)
    model.fit(X_train,y_train)

    #predict the validation set and store the predictions
    y_pred = model.predict(X_val)
    predictions.append(y_pred)

    #score the model on the training and the validation split
    train_score = model.score(X_train,y_train)
    val_score =  model.score(X_val,y_val) 

    perfs.append((train_score,val_score))
    # plot_confusion_matrix is not imported in this notebook; presumably it
    # comes from the visualizationfunctions star import
    plot_confusion_matrix(y_val, y_pred, train_score, val_score, hp_name=f'C = {c}', ax=ax, i=i)

plt.suptitle(f"Confusion Matrices (Logistic Regression)")
plt.tight_layout();

In [None]:
# Predict using Logistic Regression
log_reg = LogisticRegression(C=0.1, max_iter=10000) # Initialize Logistic Regression with the hyperparameters from the best performing model
y_pred = log_reg.fit(X_train, y_train).predict(X_val)

log_reg_results = graph_classification_reports(y_pred, y_val, title='Logistic Regression Model Classification Reports')

In [None]:
# Compare the training and validation scores collected in `perfs` for each
# C value of the logistic-regression sweep.
# NOTE(review): `perfs` is overwritten by every sweep cell, so this must run
# right after the logistic-regression sweep.
plot_model_performance(parameters=C,
                       title='Training and Testing Model Performances',
                       perfs=perfs,
                       x_labels='Logistic Regression Model')

## Random Forest Classifier
Random forests or random decision forests is an ensemble learning method for classification, regression and other tasks that operates by constructing a multitude of decision trees at training time. For classification tasks, the output of the random forest is the class selected by most trees.

In [None]:
# Candidate forest sizes; one model and one confusion matrix per value
n_estimators = [100, 500, 1000]
fig,axes = plt.subplots(1,3,figsize = (15,5)) #create a 1x3 grid of axes
perfs = []  # (training score, validation score) per candidate
predictions = []  # validation-set predictions per candidate

for i,(n,ax) in enumerate(zip(n_estimators,axes)):

    #create and train the model; the fixed seed makes the forest - and hence
    #the comparison between sizes - reproducible
    model = RandomForestClassifier(n_estimators=n, random_state=42)
    model.fit(X_train,y_train)

    #predict the validation set and store the predictions
    y_pred = model.predict(X_val)
    predictions.append(y_pred)

    #score the model on the training and the validation split
    train_score = model.score(X_train,y_train)
    val_score = model.score(X_val,y_val)

    perfs.append((train_score,val_score))
    # plot_confusion_matrix is not imported in this notebook; presumably it
    # comes from the visualizationfunctions star import
    plot_confusion_matrix(y_val, y_pred, train_score, val_score, hp_name=f'n_estimators = {n}', ax=ax, i=i)

plt.suptitle(f"Confusion Matrices (Random Forest Classifier)")
plt.tight_layout();

In [None]:
# Predict using Random Forest Classifier
rf = RandomForestClassifier(n_estimators=500) # Initialize Random Forest with the hyperparameters from the best performing model
y_pred = rf.fit(X_train, y_train).predict(X_val)

rf_results = graph_classification_reports(y_pred, y_val, title='Logistic Regression Model Classification Reports')

In [None]:
# Compare the training and validation scores collected in `perfs` for each
# n_estimators value of the random-forest sweep.
# NOTE(review): `perfs` is overwritten by every sweep cell, so this must run
# right after the random-forest sweep.
plot_model_performance(parameters=n_estimators,
                       title='Training and Testing Model Performances',
                       perfs=perfs,
                       x_labels='Random Forest Model')

## XGBoost Classifier
XGBoost, which stands for Extreme Gradient Boosting, is a scalable, distributed gradient-boosted decision tree (GBDT) machine learning library. It provides parallel tree boosting and is the leading machine learning library for regression, classification, and ranking problems.

In [None]:
# Candidate numbers of boosting rounds; one model and one confusion matrix per value
n_estimators_xgb = [50, 100, 150]
fig,axes = plt.subplots(1,3,figsize = (15,5)) #create a 1x3 grid of axes
perfs = []  # (training score, validation score) per candidate
predictions = []  # validation-set predictions per candidate

for i,(n,ax) in enumerate(zip(n_estimators_xgb,axes)):

    #create and train the model
    model = xgb.XGBClassifier(n_estimators=n)
    model.fit(X_train,y_train)

    #predict the validation set and store the predictions
    y_pred = model.predict(X_val)
    predictions.append(y_pred)

    #score the model on the training and the validation split
    train_score = model.score(X_train,y_train)
    val_score =  model.score(X_val,y_val) 

    perfs.append((train_score,val_score))
    # plot_confusion_matrix is not imported in this notebook; presumably it
    # comes from the visualizationfunctions star import
    plot_confusion_matrix(y_val, y_pred, train_score, val_score, hp_name=f'n_estimators = {n}', ax=ax, i=i)

plt.suptitle(f"Confusion Matrices (XGBoost Classifier)")
plt.tight_layout();

In [None]:
# Predict using XGBoost Classifier
xgboost = xgb.XGBClassifier() # Initialize Random Forest with the hyperparameters from the best performing model
y_pred = xgboost.fit(X_train, y_train).predict(X_val)

xgb_results = graph_classification_reports(y_pred, y_val, title='Logistic Regression Model Classification Reports')

In [None]:
# Compare the training and validation scores collected in `perfs` for each
# n_estimators value of the XGBoost sweep.
# NOTE(review): `perfs` is overwritten by every sweep cell, so this must run
# right after the XGBoost sweep.
plot_model_performance(parameters=n_estimators_xgb,
                       title='Training and Testing Model Performances',
                       perfs=perfs,
                       x_labels='XGBoost Classifier Model')

In [None]:
# Combine model results into a DataFrame
all_model_results = pd.DataFrame({"K-Nearest Neighbors": knn_results,
                                  "Logistic Regression": log_reg_results,
                                  "Random Forest Classifier": rf_results,
                                  "XGBoost Classifier": xgb_results})
all_model_results = all_model_results.T
all_model_results

In [None]:
# Reduce the accuracy to the same scale as the other metrics 
# NOTE(review): this assumes graph_classification_reports reports accuracy as a
# percentage (0-100) - confirm. The division is not idempotent: re-running this
# cell divides the column by 100 again.
all_model_results["accuracy"] = all_model_results["accuracy"]/100
all_model_results

In [None]:
# Plot and compare all of the model results
sns.set(style = 'whitegrid',
            rc = {'figure.figsize': (20, 10)})
sns.set_palette('Reds_r')
all_model_results.plot(kind="bar", figsize=(10, 7)).legend(bbox_to_anchor=(1.0, 1.0))

plt.xlabel('\nModels', fontsize=15, fontweight='bold')
plt.ylabel('Metric Values', fontsize=15, fontweight='bold');