In [1]:
#Importing libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import seaborn as sn
import math
import cv2
import os
import PIL
import pathlib

from tensorflow import keras
from sklearn.metrics import confusion_matrix , classification_report
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
#Importing training, test, sample submission datasets
df_train = pd.read_csv('train_titanic_spaceship.csv')
df_test = pd.read_csv('test_titanic_spaceship.csv')
df_sample_submission = pd.read_csv('sample_submission.csv')

In [3]:
#Reading training dataset
df_train.sample(15)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
8459,9034_03,Europa,False,D/288/P,55 Cancri e,27.0,False,0.0,984.0,0.0,13995.0,312.0,Phecca Headmish,False
1734,1847_01,Earth,False,G/290/S,TRAPPIST-1e,65.0,False,5.0,317.0,,0.0,123.0,Jaimez Tuckers,False
4128,4403_02,Earth,False,G/717/S,,41.0,False,7.0,0.0,297.0,108.0,361.0,Kylen Peter,False
8533,9111_01,Mars,False,E/598/S,TRAPPIST-1e,38.0,False,1478.0,0.0,631.0,0.0,0.0,Chars Gecre,False
2747,2949_01,Earth,True,G/482/P,TRAPPIST-1e,22.0,False,0.0,0.0,0.0,,0.0,Florey Boltertley,False
6886,7290_04,Europa,False,D/227/P,TRAPPIST-1e,36.0,False,83.0,1186.0,0.0,2160.0,736.0,Altara Bruthydre,False
7110,7578_01,Earth,False,F/1572/P,55 Cancri e,14.0,False,653.0,0.0,4.0,0.0,0.0,Idace Popelacruz,False
2567,2753_01,Earth,False,E/188/S,PSO J318.5-22,39.0,False,0.0,0.0,0.0,0.0,907.0,Wandy Summington,False
517,0553_02,Europa,False,C/19/P,TRAPPIST-1e,37.0,False,0.0,6506.0,0.0,764.0,0.0,Gluinol Valindle,True
7263,7768_01,Earth,False,F/1484/S,TRAPPIST-1e,14.0,False,0.0,268.0,239.0,127.0,0.0,Vivia Ocherman,True


In [4]:
#Reading test datasets
df_test.sample(15)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
2275,4953_01,Europa,True,,TRAPPIST-1e,49.0,False,0.0,0.0,0.0,0.0,0.0,Solam Slable
3127,6849_01,Europa,False,D/207/S,TRAPPIST-1e,21.0,False,0.0,633.0,0.0,5865.0,145.0,Sargin Slavested
809,1654_01,Earth,False,E/121/S,TRAPPIST-1e,17.0,False,980.0,0.0,7.0,0.0,0.0,Clarry Bardinard
2194,4747_01,Europa,True,D/148/S,TRAPPIST-1e,43.0,False,0.0,0.0,0.0,0.0,0.0,Mesarga Ingascomet
2709,5961_01,Europa,True,B/204/P,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Avijava Extrazy
2614,5731_02,Mars,False,D/181/P,TRAPPIST-1e,23.0,False,60.0,0.0,946.0,0.0,0.0,Cor Lake
3729,8132_01,Earth,False,E/538/S,55 Cancri e,49.0,False,2.0,0.0,0.0,263.0,2219.0,Dondy Meyersones
2476,5382_01,Europa,True,C/211/S,55 Cancri e,34.0,False,0.0,,0.0,0.0,0.0,Tabdhib Cralinal
3258,7135_02,Earth,False,F/1369/S,55 Cancri e,17.0,False,0.0,0.0,799.0,2.0,0.0,Stace Gainebergan
1831,3892_02,Earth,False,F/802/P,TRAPPIST-1e,38.0,False,1542.0,0.0,9.0,0.0,1.0,Milda Gletonerson


In [5]:
#Checking for all unique values in the column
df_train['HomePlanet'].unique()

array(['Europa', 'Earth', 'Mars', nan], dtype=object)

In [6]:
#Checking for all unique values in the column
df_train['Destination'].unique()

array(['TRAPPIST-1e', 'PSO J318.5-22', '55 Cancri e', nan], dtype=object)

In [7]:
#Checking the data type
df_train.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

In [8]:
#Checking for number of NaN values
df_train.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [9]:
df_train.shape

(8693, 14)

In [10]:
#Dropping all the rows with NaN
df_train = df_train.dropna()
df_train.isnull().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
dtype: int64

In [11]:
df_train.shape

(6606, 14)

In [12]:
df_test.shape

(4277, 13)

In [13]:
df_train.sample(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
7628,8147_01,Earth,False,G/1317/P,TRAPPIST-1e,20.0,False,0.0,142.0,0.0,3528.0,92.0,Everly Pittler,False
4885,5207_02,Earth,False,F/998/S,TRAPPIST-1e,31.0,False,1364.0,0.0,0.0,0.0,21.0,Rossie Chanan,False
5599,5960_01,Europa,False,C/222/S,TRAPPIST-1e,28.0,False,0.0,4780.0,1761.0,1.0,247.0,Eltares Dister,True
725,0768_01,Europa,False,B/32/P,55 Cancri e,36.0,False,29.0,12180.0,0.0,93.0,17306.0,Maiam Oilloody,False
2840,3064_01,Earth,True,G/482/S,TRAPPIST-1e,21.0,False,0.0,0.0,0.0,0.0,0.0,Thewis Spencervan,True


In [14]:
df_train.isnull().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
dtype: int64

In [15]:
#Mapping objects to int
df_train['HomePlanet'] = df_train['HomePlanet'].map({'Earth': 0, 'Europa': 1, 'Mars': 2})
df_train['Destination'] = df_train['Destination'].map({'TRAPPIST-1e': 0, 'PSO J318.5-22': 1, '55 Cancri e': 2})
df_test['HomePlanet'] = df_test['HomePlanet'].map({'Earth': 0, 'Europa': 1, 'Mars': 2})
df_test['Destination'] = df_test['Destination'].map({'TRAPPIST-1e': 0, 'PSO J318.5-22': 1, '55 Cancri e': 2})

In [16]:
#Converting all Trues to 1 and all Falses to 0
for column in df_train.columns:
    df_train[column] = df_train[column].replace({True: 1, False: 0})
for column in df_train.columns:
    df_train[column] = df_train[column].replace({True: 1, False: 0})
    
df_train.dtypes

PassengerId      object
HomePlanet        int64
CryoSleep         int64
Cabin            object
Destination       int64
Age             float64
VIP               int64
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported       int64
dtype: object

In [17]:
df_train.isnull().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
dtype: int64

In [18]:
df_train.sample(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
1408,1489_01,0,0,G/229/S,0,19.0,0,73.0,0.0,0.0,813.0,2.0,Katiey Gallencis,0
6346,6713_01,0,0,F/1395/P,2,24.0,0,700.0,0.0,0.0,0.0,0.0,Raulia Sweene,0
5209,5556_01,0,0,E/361/S,0,27.0,0,0.0,1.0,667.0,897.0,43.0,Jona Hanner,0
2832,3060_02,2,0,D/98/S,0,45.0,0,3309.0,0.0,10.0,0.0,0.0,Duckes Chman,0
4217,4493_01,0,0,F/845/S,0,26.0,0,42.0,65.0,895.0,407.0,0.0,Sus Coolez,1


In [19]:
#Correlation checking among the columns
a = df_train['Transported']
b = df_train['Transported']
r = np.corrcoef(a, b)[0, 1]
r

1.0

In [20]:
#Scaling the useful features and defining the training and test data
features = ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'RoomService', 'Spa', 'VRDeck' ]

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_train[features] = scaler.fit_transform(df_train[features])
df_test[features] = scaler.fit_transform(df_test[features])

x_train = df_train[features]
y_train = df_train['Transported']
x_test = df_test[features]

x_train.sample(10)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,RoomService,Spa,VRDeck
4532,1.0,1.0,0.0,0.253165,0.0,0.0,0.0
7829,0.5,1.0,0.0,0.21519,0.0,0.0,0.0
2656,0.0,0.0,1.0,0.556962,0.067742,0.0,0.000983
498,0.0,0.0,0.0,0.35443,0.0,0.0,0.000246
7850,0.0,0.0,0.0,0.164557,0.0,0.125402,0.000443
1501,0.0,0.0,0.0,0.291139,0.000504,0.032667,0.0
7489,1.0,1.0,0.0,0.35443,0.0,0.0,0.0
2512,0.5,0.0,0.0,0.632911,0.0,0.006828,0.086103
3764,1.0,0.0,0.0,0.405063,0.185786,0.0,0.0
3531,1.0,0.0,0.0,0.088608,0.0,0.0,0.0


In [21]:
x_train.shape

(6606, 7)

In [22]:
#Defining the ANN Model
def ANN(x_train, y_train, x_test, loss, weights):
    model = keras.Sequential([
        keras.layers.Dense(500,input_shape=(7,), activation='relu'),
        keras.layers.Dense(15, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid') #SIGMOID FOR BINARY CLASSIFICATION AND SOFTMAX FOR MULTICLASS CLASSIFICATION
    ])

    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    
    if weights == -1:
        model.fit(x_train, y_train, epochs=100)
    else:
        model.fit(x_train, y_train, epochs=100, class_weight = weights)
    
    y_preds = model.predict(x_test)
    y_preds = np.round(y_preds)
    
    return y_preds

In [23]:
y_preds = ANN(x_train, y_train, x_test, 'binary_crossentropy', -1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [24]:
#Converting the float to bool
def float_to_int(float_number):
    return int(float_number)
y_preds_int = list(map(float_to_int, y_preds))
def int_to_bool(int_number):
    return bool(int_number)
y_preds_bool = list(map(int_to_bool, y_preds_int))
y_preds_bool

[True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 False,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 False,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 False,
 True,
 False,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 False,
 True,
 True,
 False,
 True,
 False,
 True,
 False,
 False,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 False,
 True,
 False,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 False,
 True,
 False,
 

In [25]:
#Getting the output csv
output = pd.DataFrame({'PassengerId': df_test.PassengerId, 'Transported': y_preds_bool})
output.to_csv('spaceship_titanic_prediction.csv', index=False)

In [26]:
#Importing and reading the csv
df = pd.read_csv('spaceship_titanic_prediction.csv')
df.sample(25)

Unnamed: 0,PassengerId,Transported
1030,2160_01,False
2988,6559_02,False
3961,8630_01,True
3909,8531_01,True
1857,3953_01,True
2427,5309_01,True
566,1172_01,True
3946,8578_01,False
246,0530_01,False
2850,6278_02,True


In [27]:
df.shape

(4277, 2)