# Spaceship Titanic Survival Prediction

## ToDo
- [x] Prepare Notebook
- [x] Exploratory Data Analysis
- [ ] Data Preprocessing
- [ ] Training Model
- [ ] Evaluating Model

### Prepare Notebook

In [None]:
# Import Packages

import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and 3.8 removed the old names, so use whichever spelling this install has.
_darkgrid_style = (
    "seaborn-v0_8-darkgrid"
    if "seaborn-v0_8-darkgrid" in plt.style.available
    else "seaborn-darkgrid"
)
plt.style.use([_darkgrid_style])
# `sns.color_palette(...)` only *returns* a palette; `sns.set_palette(...)` is
# the call that makes "pastel" the default colour cycle. It runs after the
# style sheet so a style that defines its own colour cycle cannot override it.
sns.set_palette("pastel")
# Default figure size (inches) and font for every plot in the notebook.
plt.rcParams["figure.figsize"] = (12, 9)
plt.rcParams["font.family"] = "DejaVu Sans"

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

from core import downloadDataset

In [None]:
# Download Dataset

# Kaggle competition page for the Spaceship Titanic data files.
DATA_URL = 'https://www.kaggle.com/competitions/spaceship-titanic/data'

# `downloadDataset` is imported from the project-local `core` module.
# Presumably it stores the files under ./dataset/<datasetName>/ (the loading
# cell reads from that folder) -- confirm against `core`.
downloadDataset(datasetName='spaceship-titanic', url=DATA_URL)

In [None]:
# Load Dataset

# Train and test frames are indexed by `PassengerId`; the sample submission
# is read with the default integer index.
train_df = pd.read_csv(
    './dataset/spaceship-titanic/train.csv',
    index_col='PassengerId',
)
test_df = pd.read_csv(
    './dataset/spaceship-titanic/test.csv',
    index_col='PassengerId',
)
sample_df = pd.read_csv('./dataset/spaceship-titanic/sample_submission.csv')

### Exploratory Data Analysis

In [None]:
# Preview the first three rows of the training frame.
train_df.head(3)

In [None]:
# (number of rows, number of columns) of the training frame.
train_df.shape

In [None]:
# Summary statistics; with default arguments pandas reports numeric columns only.
train_df.describe()

In [None]:
# Correlate numeric/boolean columns only. Since pandas 2.0 `DataFrame.corr()`
# no longer drops string columns silently and raises on them instead;
# `select_dtypes` gives the same column set on old and new pandas versions.
corr_matrix = train_df.select_dtypes(include=['number', 'bool']).corr()
# Rows are ordered by their signed correlation with `Transported` (most
# positive first); columns keep their original order.
sns.heatmap(
    corr_matrix.sort_values(by='Transported', ascending=False),
    annot=True,
)
plt.title("Correlation Map")
plt.show()

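After `Transported` itself, `FoodCourt` has the highest (most positive) correlation with `Transported`. The rows are sorted by signed value, so features with a strong negative correlation end up at the bottom of the map rather than next to it.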

In [None]:
# Passenger counts per home planet, split by the `Transported` outcome.
sns.histplot(
    data=train_df,
    x='HomePlanet',
    hue='Transported',
    legend=True,
    palette="flare",
)
# Title added so the figure stands alone, like the other plots in this notebook.
plt.title("Histogram of HomePlanet Feature by Transported")
plt.show()

In [None]:
# Share of all passengers that fall in the second-ranked `Transported` class.
# `value_counts()` orders classes by descending frequency, so for a two-class
# column position 1 is the less frequent class. `.iloc[1]` makes the positional
# lookup explicit: a plain integer key on this non-integer index relied on a
# positional fallback that pandas deprecated in 2.1.
train_df.Transported.value_counts().iloc[1] / train_df.shape[0]

In [None]:
# Share of all passengers whose `HomePlanet` is the second most frequent value.
# The planet is chosen by position in the frequency ranking, not by name;
# `.iloc[1]` states that explicitly instead of relying on the positional
# fallback for integer keys that pandas deprecated in 2.1.
train_df.HomePlanet.value_counts().iloc[1] / train_df.shape[0]

In [None]:
# Count transported passengers per home planet, take the planet ranked second
# (selected by position, not by label) and divide by the number of ALL
# passengers. `.iloc[1]` replaces the integer key, whose positional fallback
# pandas deprecated in 2.1.
# NOTE(review): this is a share of the whole dataset, not a per-planet
# survival rate; for that, divide by the passengers of that planet only.
train_df.HomePlanet.loc[train_df.Transported == True].value_counts().iloc[1] / train_df.shape[0]

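About `16 %` of all passengers were transported *and* came from the home planet ranked second among transported passengers. Caveat: the cell above picks that planet by its position in the counts rather than by the `Earth` label, and it divides by every passenger, so this figure is not the survival rate of Earth passengers.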

In [None]:
# Distribution of passenger age, drawn with 80 bins.
age_ax = sns.histplot(train_df["Age"], bins=80)
age_ax.set_title("Histogram of Age Feature")
plt.show()

In [None]:
# `VIP` value counts restricted to rows where `Transported` is True.
transported_mask = train_df["Transported"] == True
train_df.loc[transported_mask, "VIP"].value_counts()

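Even `VIP` passengers are rarely among the survivors. Note that this count only covers transported passengers, so it should be compared with the total number of VIPs before concluding that VIP status lowers the chance of survival.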


### Data Preprocessing

In [None]:
# Encoding Boolean values

def changeBool(df):
    """Return a copy of ``df`` with its boolean columns encoded as integers.

    ``CryoSleep`` and ``VIP`` are always encoded; ``Transported`` is encoded
    only when present (the test split has no target column). The encoding is
    ``True -> 1`` and ``False -> 2``.

    Only genuine boolean values are replaced. Missing values and values that
    were already encoded pass through untouched, so re-running the notebook
    cell is harmless. With the previous dict-based ``map`` a second run turned
    every encoded ``2`` into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``CryoSleep`` and ``VIP`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's ``df`` is left unmodified.

    Raises
    ------
    KeyError
        If ``CryoSleep`` or ``VIP`` is missing from ``df``.
    """
    # NOTE(review): False is encoded as 2 (not the conventional 0); kept as-is
    # because later cells may rely on it -- confirm before changing.
    encoding = {True: 1, False: 2}

    def _encode(value):
        # `1 == True` in Python, so test the type instead of relying on a
        # dict lookup, which would also match already-encoded numbers.
        if isinstance(value, (bool, np.bool_)):
            return encoding[bool(value)]
        return value

    # Work on a copy so the caller's frame is never mutated behind its back.
    encoded = df.copy()
    for column in ('CryoSleep', 'VIP'):
        encoded[column] = encoded[column].map(_encode)
    if 'Transported' in encoded.columns:
        encoded['Transported'] = encoded['Transported'].map(_encode)
    return encoded

# Encode the boolean columns of both splits and rebind each name to the frame
# returned by `changeBool`.
train_df, test_df = changeBool(train_df), changeBool(test_df)

In [None]:
# Dropping columns

# `Cabin` and `Name` are removed from both the train and the test frame.
_dropped_columns = ['Cabin', 'Name']
train_df = train_df.drop(columns=_dropped_columns)
test_df = test_df.drop(columns=_dropped_columns)

In [None]:
def encodeValues(df):
    """Return a copy of ``df`` with the categorical columns label-encoded.

    ``HomePlanet`` is encoded as ``Europa -> 0, Earth -> 1, Mars -> 2`` and
    ``Destination`` as ``TRAPPIST-1e -> 0, 55 Cancri e -> 1,
    PSO J318.5-22 -> 2``. A string that is not a known category becomes NaN.

    Non-string values (missing values, codes from a previous run) pass through
    untouched, so applying the function twice gives the same result as
    applying it once. With the previous dict-based ``map`` a second run turned
    every code into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``HomePlanet`` and ``Destination`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's ``df`` is left unmodified.

    Raises
    ------
    KeyError
        If ``HomePlanet`` or ``Destination`` is missing from ``df``.
    """
    category_codes = {
        'HomePlanet': {'Europa': 0, 'Earth': 1, 'Mars': 2},
        'Destination': {'TRAPPIST-1e': 0, '55 Cancri e': 1, 'PSO J318.5-22': 2},
    }

    # Work on a copy so the caller's frame is never mutated behind its back.
    encoded = df.copy()
    for column, codes in category_codes.items():
        # `codes=codes` binds the current mapping to the lambda at definition time.
        encoded[column] = encoded[column].map(
            lambda value, codes=codes: (
                codes.get(value, float('nan')) if isinstance(value, str) else value
            )
        )
    return encoded

# train_df = encodeValues(train_df)
# test_df = encodeValues(test_df)

In [None]:
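# # Imputing Null Values
# NOTE: disabled draft. `SimpleImputer.fit_transform` returns a NumPy array by
# default, so the `train_df.isna()` call below would fail after the
# reassignment, and `strategy = 'mean'` needs every column to be numeric
# (`HomePlanet` / `Destination` are still strings while `encodeValues` is
# not applied).

# imputer = SimpleImputer(strategy = 'mean')
# train_df = imputer.fit_transform(train_df)
# train_df.isna().sum()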