In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in names]
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Spaceship Titanic Passenger Prediction

## Steps
* Problem Definition
* Exploratory Data Analysis
* Data Preprocessing
* Model Building
* Predictions & Submission

### Problem Definition
* The task is to build a model that predicts which passengers were transported by the anomaly, using records recovered from the spaceship’s damaged computer system.

### Exploratory Data Analysis

In [None]:
# Load the competition's training and test CSV files into DataFrames
passenger_train_data, passenger_test_data = map(
    pd.read_csv,
    (
        '/kaggle/input/spaceship-titanic/train.csv',
        '/kaggle/input/spaceship-titanic/test.csv',
    ),
)

In [None]:
# Passenger train data sample
# random_state is pinned so the same five rows are shown on every run and the
# observations written below the output keep matching what is displayed.
passenger_train_data.sample(5, random_state=42)

* Cabin to be split into its parts (deck, deck number and side)
* Age, RoomService, FoodCourt, ShoppingMall, VRDeck and Spa to be converted to numeric
* Other columns to be encoded with Ordinal Encoder/Label Encoder

In [None]:
# Helper used to break a delimited string column into its separate parts
def split_columns(df,delim):
    """Split every string of a Series on ``delim`` and return the parts as columns.

    Parameters
    ----------
    df : pandas.Series
        Series of strings. Despite the parameter name, the ``.str`` accessor
        is used, so a Series (not a DataFrame) is expected.
    delim : str
        Separator handed to ``Series.str.split``.

    Returns
    -------
    pandas.DataFrame
        One column per part (``expand=True``), with no limit on the number
        of splits performed.
    """
    parts = df.str.split(pat=delim, expand=True)
    return parts

# Labels for the pieces obtained from 'Cabin' (split on '/') and 'PassengerId' (split on '_')
cabin_columns = ['Deck','Deck Number','Side']
id_columns = ['Passenger Group','Passenger Number']

# Training frame: split both columns and label the resulting parts
split_columns_cabin_train = split_columns(passenger_train_data['Cabin'],'/').set_axis(cabin_columns, axis=1)
split_columns_id_train = split_columns(passenger_train_data['PassengerId'],'_').set_axis(id_columns, axis=1)

# Test frame: same treatment
split_columns_cabin_test = split_columns(passenger_test_data['Cabin'],'/').set_axis(cabin_columns, axis=1)
split_columns_id_test = split_columns(passenger_test_data['PassengerId'],'_').set_axis(id_columns, axis=1)

In [None]:
# Swap the raw 'PassengerId' and 'Cabin' columns for the parts split out of them.
# The split frames are appended column-wise after the remaining original columns.
passenger_train_data = pd.concat(
    [
        passenger_train_data.drop(columns=['PassengerId','Cabin']),
        split_columns_cabin_train,
        split_columns_id_train,
    ],
    axis=1,
)
passenger_test_data = pd.concat(
    [
        passenger_test_data.drop(columns=['PassengerId','Cabin']),
        split_columns_cabin_test,
        split_columns_id_test,
    ],
    axis=1,
)

In [None]:
# Count of missing entries per column in the training frame
passenger_train_data.isnull().sum()

In [None]:
# Count of missing entries per column in the test frame
passenger_test_data.isnull().sum()

* There are missing values in training and test data

In [None]:
# imputing missing data with mode value
from sklearn.impute import SimpleImputer

In [None]:
# Keep each frame's column labels; the imputer returns plain arrays, so these
# labels are needed to rebuild the DataFrames afterwards.
columns_train, columns_test = passenger_train_data.columns, passenger_test_data.columns

In [None]:
# Imputing values
# Every missing entry is replaced by the most frequent value of its column.
# The fill values are learned from the training frame only and then re-used
# for the test frame, so both frames are filled with the same statistics
# (previously the imputer was re-fitted on the test set itself).
imputer = SimpleImputer(missing_values = np.nan, strategy='most_frequent')

# fit_transform runs on the whole training frame, so every column present
# there (including 'Transported') is imputed.
train_imputed = imputer.fit_transform(passenger_train_data)

# The two frames do not share the same column set, so a second imputer is
# fitted on the training rows restricted to the test frame's columns.
# Assumes every test column also exists in the training frame - TODO confirm.
feature_imputer = SimpleImputer(missing_values = np.nan, strategy='most_frequent')
feature_imputer.fit(passenger_train_data[columns_test])
test_imputed = feature_imputer.transform(passenger_test_data[columns_test])

# The imputer returns arrays; restore the column labels.
passenger_train_data = pd.DataFrame(train_imputed,columns = columns_train)
passenger_test_data = pd.DataFrame(test_imputed,columns = columns_test)

In [None]:
# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Basic Info: column names, non-null counts and dtypes of the training frame
# as it stands after imputation.
passenger_train_data.info()

In [None]:
# Distribution of passenger ages: histogram with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(data=passenger_train_data, x='Age', kde=True, ax=ax)
ax.set_title("Histogram of Age")
plt.show()

* Most passengers are in the 20–30 year age range

In [None]:
# Passenger count per home planet, with the count written on each bar
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=passenger_train_data, x='HomePlanet', ax=ax)
ax.bar_label(ax.containers[0])
ax.set_title("Number of Travellers based on Home Planet")
plt.show()

In [None]:
# Passenger count per destination, with the count written on each bar
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=passenger_train_data, x='Destination', ax=ax)
ax.bar_label(ax.containers[0])
ax.set_title("Number of Travellers based on Destination")
plt.show()

In [None]:
# Passenger count per VIP status, with the count written on each bar
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=passenger_train_data, x='VIP', ax=ax)
ax.bar_label(ax.containers[0])
ax.set_title("VIP Count")
plt.show()

In [None]:
# Number of Travellers Transported
# Left panel: share of transported vs. not transported passengers (pie).
# Right panel: the same split broken down by home planet (count plot).
fig, (ax_pie, ax_bar) = plt.subplots(1, 2, figsize=(12, 8))

travelled_count = passenger_train_data['Transported'].value_counts()
# '%%' renders a literal percent sign after each share
ax_pie.pie(travelled_count, autopct='%.2f%%', labels=travelled_count.index)

ax = sns.countplot(data=passenger_train_data, x='Transported', hue='HomePlanet', ax=ax_bar)
# One container per hue level; label the bars of each
for container in ax.containers:
    ax.bar_label(container)

fig.suptitle("Number of Travellers Transported")
plt.show()

In [None]:
# Passenger count per cabin deck, split by whether the passenger was transported
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=passenger_train_data, x='Deck', hue='Transported', ax=ax)
# One container per hue level; write the count on every bar
for bars in ax.containers:
    ax.bar_label(bars)
ax.set_title("Deck")
plt.show()

### Data Preprocessing

In [None]:
# importing encoders
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder

In [None]:
# Separating categorical and numerical columns.
# NOTE(review): 'VIP' appears in neither list, so it is not used as a model
# feature - confirm this is intentional.
num_columns = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
    'Deck Number',
    'Passenger Group',
    'Passenger Number',
]
cat_columns = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'Deck',
    'Side',
]

In [None]:
# Converting to numeric
# Only the training frame is converted here. The test frame keeps its original
# values, and the submission cell rebuilds 'PassengerId' from the test frame's
# 'Passenger Group' / 'Passenger Number' columns.
passenger_train_data = passenger_train_data.assign(
    **{name: pd.to_numeric(passenger_train_data[name]) for name in num_columns}
)

In [None]:
# Encoding categorical data
# A single encoder is fitted on the training frame and re-used for the test
# frame, so a given category is mapped to the same integer code in both.
# (Fitting a separate encoder on the test set can shift the codes whenever the
# two frames do not contain exactly the same set of categories.)
# Categories seen only in the test frame are encoded as -1 instead of raising.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

encoder_train = ordinal_encoder.fit_transform(passenger_train_data[cat_columns])
encoder_train = pd.DataFrame(encoder_train,columns = cat_columns)

encoder_test = ordinal_encoder.transform(passenger_test_data[cat_columns])
encoder_test = pd.DataFrame(encoder_test,columns = cat_columns)

In [None]:
# Final dataframes: numeric columns followed by the ordinal-encoded categoricals
X_tr = pd.concat(
    [passenger_train_data[num_columns], encoder_train],
    axis=1,
)
X_test = pd.concat(
    [passenger_test_data[num_columns], encoder_test],
    axis=1,
)

# Target: 'Transported' mapped to integer class labels.
# Note that `label_encoder` holds the encoded array, not a fitted encoder object.
label_encoder = LabelEncoder().fit_transform(passenger_train_data['Transported'])
y_tr = pd.DataFrame({'Transported': label_encoder})

### Model Building

In [None]:
# Model-building imports: train/validation splitting, the four candidate
# classifiers and the evaluation metrics used below.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score,confusion_matrix,ConfusionMatrixDisplay,f1_score
from warnings import filterwarnings
# Installs a blanket "ignore" filter: from here on every warning is hidden for
# the rest of the session, not only those raised in this cell.
filterwarnings("ignore")

In [None]:
# Hold out 25% of the labelled rows for validation; the seed keeps the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(
    X_tr,
    y_tr,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Decision Tree Classifier
# random_state is pinned (as for the other models) so the fitted tree and the
# scores printed below are reproducible between runs.
model_dt = DecisionTreeClassifier(max_depth=10, random_state=42)
# np.ravel gives the estimator a 1-D target whether y_train is a
# single-column frame or already one-dimensional.
model_dt.fit(X_train, np.ravel(y_train))
y_pred = model_dt.predict(X_valid)

print("Accuracy Score: ",accuracy_score(y_valid,y_pred))
print("F1 Score: ",f1_score(y_valid,y_pred))
print("Confusion Matrix")

cm = confusion_matrix(y_valid,y_pred,labels = model_dt.classes_)
# Named cm_display so IPython's built-in display() is not overwritten.
cm_display = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = model_dt.classes_)
cm_display.plot()
plt.show()

In [None]:
# Random Forest Classifier
model_rf = RandomForestClassifier(max_depth=10,random_state=42)
# np.ravel gives the estimator a 1-D target whether y_train is a
# single-column frame or already one-dimensional (a column-vector target
# makes scikit-learn emit a DataConversionWarning).
model_rf.fit(X_train, np.ravel(y_train))
y_pred = model_rf.predict(X_valid)

print("Accuracy Score: ",accuracy_score(y_valid,y_pred))
print("F1 Score: ",f1_score(y_valid,y_pred))
print("Confusion Matrix")

cm = confusion_matrix(y_valid,y_pred,labels = model_rf.classes_)
# Named cm_display so IPython's built-in display() is not overwritten.
cm_display = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = model_rf.classes_)
cm_display.plot()
plt.show()

In [None]:
# XGBoost Classifier
# Seed pinned for consistency with the other models in this notebook.
model_xgb = XGBClassifier(random_state=42)
# np.ravel gives the estimator a 1-D target whether y_train is a
# single-column frame or already one-dimensional.
model_xgb.fit(X_train, np.ravel(y_train))
y_pred = model_xgb.predict(X_valid)

print("Accuracy Score: ",accuracy_score(y_valid,y_pred))
print("F1 Score: ",f1_score(y_valid,y_pred))
print("Confusion Matrix")

cm = confusion_matrix(y_valid,y_pred,labels = model_xgb.classes_)
# Named cm_display so IPython's built-in display() is not overwritten.
cm_display = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = model_xgb.classes_)
cm_display.plot()
plt.show()

In [None]:
# Support Vector Classifier
# NOTE(review): no feature-scaling step is applied to X_train anywhere in this
# notebook, and SVC is sensitive to feature scale - consider adding a scaler.
model_sv = SVC(random_state=42)
# np.ravel gives the estimator a 1-D target whether y_train is a
# single-column frame or already one-dimensional.
model_sv.fit(X_train, np.ravel(y_train))
y_pred = model_sv.predict(X_valid)

print("Accuracy Score: ",accuracy_score(y_valid,y_pred))
print("F1 Score: ",f1_score(y_valid,y_pred))
print("Confusion Matrix")

cm = confusion_matrix(y_valid,y_pred,labels = model_sv.classes_)
# Named cm_display so IPython's built-in display() is not overwritten.
cm_display = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = model_sv.classes_)
cm_display.plot()
plt.show()

### Predictions and Submission

In [None]:
# Predict the test set with the random forest model and keep the labels
# in a one-column frame named 'Transported'.
y_final_pred = pd.Series(model_rf.predict(X_test), name='Transported').to_frame()

In [None]:
# Rebuild 'PassengerId' as '<group>_<number>' from the two parts it was split into.
# The parts are normalised to integers and zero-padded again (4 digits for the
# group, 2 for the number, matching the competition's gggg_pp id format), so the
# ids come out right whether the columns still hold the original strings or
# have been converted to numbers.
group_part = pd.to_numeric(X_test['Passenger Group']).astype(int).astype(str).str.zfill(4)
number_part = pd.to_numeric(X_test['Passenger Number']).astype(int).astype(str).str.zfill(2)

# Building the frame from a dict names the column explicitly instead of relying
# on the combined Series being unnamed.
submission = pd.DataFrame({'PassengerId': group_part + '_' + number_part})
submission = pd.concat([submission,y_final_pred],axis=1)

# The model predicts 0/1 labels; the submission holds booleans.
submission['Transported'] = submission['Transported'].astype(bool)

# Displayed for inspection only: the result is not stored, so 'PassengerId'
# remains a regular column of `submission`.
submission.set_index('PassengerId')

In [None]:
# Write the submission file to the working directory, without the row index
submission.to_csv(path_or_buf='submission.csv', index=False)