In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from torch import optim
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, classification_report, confusion_matrix

import sklearn
import seaborn as sns
import pandas as pd
import numpy as np


%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv('C:/datasets/spaceship-titanic/train.csv')
test = pd.read_csv('C:/datasets/spaceship-titanic/test.csv')

In [3]:
target = train['Transported']
train = train.drop(['PassengerId','Name','Transported'], axis=1)
test = test.drop(['PassengerId','Name'], axis=1)

In [4]:
cat_columns = train.dtypes[(train.dtypes == "object") == True].index.tolist()

for col in cat_columns:
    if col != 'Cabin':
        train[col] = train[col].fillna(f'None{col}')
        
train['Cabin'] = train['Cabin'].fillna(train['Cabin'].mode()[0])

num_columns = train.dtypes[(train.dtypes != "object") == True].index.tolist()

for col in num_columns:
    train[col] = train[col].fillna(train[col].median())

In [5]:
cat_columns = test.dtypes[(test.dtypes == "object") == True].index.tolist()

for col in cat_columns:
    if col != 'Cabin':
        test[col] = test[col].fillna(f'None{col}')
        
test['Cabin'] = test['Cabin'].fillna(test['Cabin'].mode()[0])

num_columns = test.dtypes[(test.dtypes != "object") == True].index.tolist()

for col in num_columns:
    test[col] = test[col].fillna(test[col].median())

In [6]:
target[target == False] = 0
target[target == True] = 1

In [7]:
train = pd.concat((train, pd.get_dummies(train[['HomePlanet', 'CryoSleep', 'Destination', 'VIP']])), axis=1)
train = train.drop(['HomePlanet', 'CryoSleep', 'Destination', 'VIP'],axis=1)
test = pd.concat((test, pd.get_dummies(test[['HomePlanet', 'CryoSleep', 'Destination', 'VIP']])), axis=1)
test = test.drop(['HomePlanet', 'CryoSleep', 'Destination', 'VIP'],axis=1)

In [8]:
train = train.drop(train['Cabin'][train['Cabin'] == 'NoneCabin'].index, axis=0)
test = test.drop(test['Cabin'][test['Cabin'] == 'NoneCabin'].index, axis=0)

In [9]:
first, sec, thrd = [], [], []
mem = []
for i in range(train['Cabin'].shape[0]):
    mem = train['Cabin'].iloc[i].split('/')
    first.append(mem[0])
    sec.append(mem[1])
    thrd.append(mem[2])
first = pd.Series(first)
first.index = train['Cabin'].index
sec = pd.Series(sec)
sec.index = train['Cabin'].index
thrd = pd.Series(thrd) 
thrd.index = train['Cabin'].index
train['CabinClass'] = first
train['CabinNumber'] = sec
train['CabinSeat'] = thrd

In [10]:
first, sec, thrd = [], [], []
mem = []
for i in range(test['Cabin'].shape[0]):
    mem = test['Cabin'].iloc[i].split('/')
    first.append(mem[0])
    sec.append(mem[1])
    thrd.append(mem[2])
first = pd.Series(first)
first.index = test['Cabin'].index
sec = pd.Series(sec)
sec.index = test['Cabin'].index
thrd = pd.Series(thrd) 
thrd.index = test['Cabin'].index
test['CabinClass'] = first
test['CabinNumber'] = sec
test['CabinSeat'] = thrd

In [11]:
train = train.drop('Cabin',axis=1)
test = test.drop('Cabin',axis=1)

In [12]:
test.dtypes[(test.dtypes == "object") == True].index.tolist()

['CabinClass', 'CabinNumber', 'CabinSeat']

In [13]:
test.dtypes[(test.dtypes != "object") == True].index.tolist() == train.dtypes[(train.dtypes != "object") == True].index.tolist()

True

In [14]:
train = pd.concat((train, pd.get_dummies(train['CabinClass'])), axis=1)
train = pd.concat((train, pd.get_dummies(train['CabinSeat'])), axis=1)
train = train.drop(['CabinClass','CabinSeat'],axis=1)

In [15]:
test = pd.concat((test, pd.get_dummies(test['CabinClass'])), axis=1)
test = pd.concat((test, pd.get_dummies(test['CabinSeat'])), axis=1)
test = test.drop(['CabinClass','CabinSeat'],axis=1)

In [16]:
train.columns

Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
       'HomePlanet_NoneHomePlanet', 'CryoSleep_False', 'CryoSleep_True',
       'CryoSleep_NoneCryoSleep', 'Destination_55 Cancri e',
       'Destination_NoneDestination', 'Destination_PSO J318.5-22',
       'Destination_TRAPPIST-1e', 'VIP_False', 'VIP_True', 'VIP_NoneVIP',
       'CabinNumber', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'P', 'S'],
      dtype='object')

In [17]:
test.columns

Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
       'HomePlanet_NoneHomePlanet', 'CryoSleep_False', 'CryoSleep_True',
       'CryoSleep_NoneCryoSleep', 'Destination_55 Cancri e',
       'Destination_NoneDestination', 'Destination_PSO J318.5-22',
       'Destination_TRAPPIST-1e', 'VIP_False', 'VIP_True', 'VIP_NoneVIP',
       'CabinNumber', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'P', 'S'],
      dtype='object')

In [18]:
pd.set_option('display.max_columns', 35)

In [19]:
train = train.astype('float64')
test = test.astype('float64')
target = target.astype('float64')

In [172]:
lst_for_scale = []
lst_not_fot_scale = []
for col in train.columns:
    if len(train[col].value_counts()) != 2:
        lst_for_scale.append(col)
    elif len(train[col].value_counts()) == 2:
        lst_not_fot_scale.append(col)

In [175]:
train[lst_not_fot_scale] = train[lst_not_fot_scale].replace(0, 0.1)
train[lst_not_fot_scale] = train[lst_not_fot_scale].replace(1, 0.9)

In [177]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaled_train = scaler.fit_transform(train[lst_for_scale])

scaled_test = scaler.transform(test[lst_for_scale])

In [178]:
scaled_train = pd.DataFrame(scaled_train)

for i in range(0,7):
    scaled_train = scaled_train.rename(columns={i:scaler.feature_names_in_[i]})

copy_train = train.drop(scaler.feature_names_in_,axis=1).copy()
train_final = pd.concat((scaled_train, copy_train), axis=1)

In [179]:
scaled_test = pd.DataFrame(scaled_test)

for i in range(0,7):
    scaled_test = scaled_test.rename(columns={i:scaler.feature_names_in_[i]})

copy_test = test.drop(scaler.feature_names_in_,axis=1).copy()

test_final = pd.concat((scaled_test, copy_test), axis=1)

In [180]:
# for col in train.columns:
#     sns.distplot(train[col])
#     plt.show()
#     sns.distplot(train_final[col])
#     plt.show()
#     print('===========================')
train_final.Age.describe()

count    8.693000e+03
mean    -2.947653e-17
std      1.000058e+00
min     -2.007610e+00
25%     -6.129662e-01
50%     -1.248409e-01
75%      5.724810e-01
max      3.501233e+00
Name: Age, dtype: float64

In [241]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
kfold= KFold(n_splits=10,random_state=42,shuffle=True) #kfold cross validation, 10-subsets
X_train, X_test, y_train, y_test = train_test_split(train_final, target, test_size=0.26377,random_state=17)

In [242]:
X_train = torch.tensor(X_train.to_numpy()).float().cuda(); X_train.shape

torch.Size([6400, 31])

In [243]:
y_train = torch.tensor(y_train.to_numpy()).view(-1,1).float().cuda(); y_train.shape

torch.Size([6400, 1])

In [244]:
X_test = torch.tensor(X_test.to_numpy()).float().cuda(); X_test.shape

torch.Size([2293, 31])

In [245]:
y_test = torch.tensor(y_test.to_numpy()).view(-1,1).float().cuda(); y_test.shape

torch.Size([2293, 1])

In [283]:
model = nn.Sequential(nn.Linear(31,512),
                      nn.ReLU(),
                      nn.Linear(512,256),
                      nn.ReLU(),
                      nn.Linear(256,128),
                      nn.ReLU(),
                      nn.Linear(128,64),
                      nn.ReLU(),
                      nn.Linear(64,1),
                      nn.Sigmoid()).to('cuda')

In [290]:
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.0005, momentum=0.9)

In [289]:
# TEST ACCURACY
probs = model(X_test).cpu().data.numpy()
fpr, tpr, thr = roc_curve(y_test.cpu(), probs)
roc = pd.DataFrame({'fpr':fpr, 'tpr':tpr, 'thr':thr});
roc['random'] = roc['fpr'].copy()
roc['diff'] = roc['tpr'] - roc['fpr']
roc = roc.sort_values('diff', ascending=False).reset_index().drop('index', axis=1)
threshold = roc['thr'].loc[0]
preds_chosen = (probs >= threshold).astype('int64')
preds_auto = (probs >= 0.5).astype('int64')
accuracy_chosen = accuracy_score(y_test.cpu(), preds_chosen)
accuracy_auto = accuracy_score(y_test.cpu(), preds_auto)

if accuracy_chosen > accuracy_auto:
    preds = preds_chosen
elif accuracy_auto > accuracy_chosen:
    preds = preds_auto
    
accuracy_ = accuracy_score(y_test.cpu(), preds)
print(f'accuracy: {accuracy_}') 

accuracy: 0.8172699520279111


In [291]:
accuracy_min = 0.8172699520279111
epochs = 500
for e in range(epochs):    
    model.train()
    counter = 0
    for i in range(0,100):
        counter += 64
        X_batch = X_train[counter-64:counter]
        y_batch = y_train[counter-64:counter]
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()

    model.eval()
    loss = criterion(model(X_test), y_test)
    probs = model(X_test).cpu().data.numpy()
    fpr, tpr, thr = roc_curve(y_test.cpu(), probs)
    roc = pd.DataFrame({'fpr':fpr, 'tpr':tpr, 'thr':thr});
    roc['random'] = roc['fpr'].copy()
    roc['diff'] = roc['tpr'] - roc['fpr']
    roc = roc.sort_values('diff', ascending=False).reset_index().drop('index', axis=1)
    threshold = roc['thr'].loc[0]
    preds_chosen = (probs >= threshold).astype('int64')
    preds_auto = (probs >= 0.5).astype('int64')
    accuracy_chosen = accuracy_score(y_test.cpu(), preds_chosen)
    accuracy_auto = accuracy_score(y_test.cpu(), preds_auto)

    if accuracy_chosen > accuracy_auto:
        preds = preds_chosen
    elif accuracy_auto > accuracy_chosen:
        preds = preds_auto

    accuracy_ = accuracy_score(y_test.cpu(), preds)
  
    print(f"test loss of {e}th epoch number: {loss}, accuracy_: {accuracy_}")
    if accuracy_ >= accuracy_min:
        print('f1 score increased ({:.6f} --> {:.6f}).  Saving model ...'.format(accuracy_min, accuracy_))
        torch.save(model.state_dict(), 'model_space.pt')
        accuracy_min = accuracy_
    

test loss of 0th epoch number: 0.3972495496273041, accuracy_: 0.8146532926297427
test loss of 1th epoch number: 0.39735573530197144, accuracy_: 0.8137810728303533
test loss of 2th epoch number: 0.3975241184234619, accuracy_: 0.8137810728303533
test loss of 3th epoch number: 0.3977000117301941, accuracy_: 0.8137810728303533
test loss of 4th epoch number: 0.39782851934432983, accuracy_: 0.8146532926297427
test loss of 5th epoch number: 0.3979710042476654, accuracy_: 0.8146532926297427
test loss of 6th epoch number: 0.3981323540210724, accuracy_: 0.814217182730048
test loss of 7th epoch number: 0.3982450067996979, accuracy_: 0.8146532926297427
test loss of 8th epoch number: 0.3983105719089508, accuracy_: 0.814217182730048
test loss of 9th epoch number: 0.39846500754356384, accuracy_: 0.8137810728303533
test loss of 10th epoch number: 0.3985578119754791, accuracy_: 0.814217182730048
test loss of 11th epoch number: 0.3986853063106537, accuracy_: 0.814217182730048
test loss of 12th epoch num

test loss of 100th epoch number: 0.4201448857784271, accuracy_: 0.812472743131269
test loss of 101th epoch number: 0.4206503629684448, accuracy_: 0.8120366332315744
test loss of 102th epoch number: 0.421110063791275, accuracy_: 0.8116005233318796
test loss of 103th epoch number: 0.4217349886894226, accuracy_: 0.8116005233318796
test loss of 104th epoch number: 0.4220866858959198, accuracy_: 0.812472743131269
test loss of 105th epoch number: 0.4225769340991974, accuracy_: 0.8120366332315744
test loss of 106th epoch number: 0.42305994033813477, accuracy_: 0.8116005233318796
test loss of 107th epoch number: 0.42341363430023193, accuracy_: 0.8120366332315744
test loss of 108th epoch number: 0.42401713132858276, accuracy_: 0.8116005233318796
test loss of 109th epoch number: 0.42445844411849976, accuracy_: 0.8116005233318796
test loss of 110th epoch number: 0.42488646507263184, accuracy_: 0.8107283035324901
test loss of 111th epoch number: 0.4252952039241791, accuracy_: 0.8111644134321849
te

KeyboardInterrupt: 

In [287]:
# LOAD MODEL
model.load_state_dict(torch.load('model_space.pt'))

<All keys matched successfully>

In [288]:
# TEST ACCURACY
probs = model(X_test).cpu().data.numpy()
fpr, tpr, thr = roc_curve(y_test.cpu(), probs)
roc = pd.DataFrame({'fpr':fpr, 'tpr':tpr, 'thr':thr});
roc['random'] = roc['fpr'].copy()
roc['diff'] = roc['tpr'] - roc['fpr']
roc = roc.sort_values('diff', ascending=False).reset_index().drop('index', axis=1)
threshold = roc['thr'].loc[0]
preds_chosen = (probs >= threshold).astype('int64')
preds_auto = (probs >= 0.5).astype('int64')
accuracy_chosen = accuracy_score(y_test.cpu(), preds_chosen)
accuracy_auto = accuracy_score(y_test.cpu(), preds_auto)

if accuracy_chosen > accuracy_auto:
    preds = preds_chosen
elif accuracy_auto > accuracy_chosen:
    preds = preds_auto
    
accuracy_ = accuracy_score(y_test.cpu(), preds)
print(f'accuracy: {accuracy_}') 

accuracy: 0.8172699520279111


In [267]:
probs = model(torch.tensor(test_final.to_numpy()).float().cuda()).cpu().data.numpy()

In [268]:
probs, threshold

(array([[0.5688352 ],
        [0.03965732],
        [0.99987364],
        ...,
        [0.9931312 ],
        [0.9558919 ],
        [0.6411717 ]], dtype=float32),
 0.4192648)

In [269]:
predictions = (probs >= threshold).astype('int64')

In [270]:
predictions.squeeze(1)

array([1, 0, 1, ..., 1, 1, 1], dtype=int64)

In [271]:
submission['Transported'] = predictions.squeeze(1)
submission['Transported'] = submission['Transported'].replace(0, False)
submission['Transported'] = submission['Transported'].replace(1, True)
submission.to_csv('C:/datasets/spaceship-titanic/torch_spaceship.csv', index=False)

pd.read_csv('C:/datasets/spaceship-titanic/torch_spaceship.csv')

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True
