In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
from torch import tensor
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import train_test_split
from datetime import datetime

In [3]:
df = pd.read_csv("/kaggle/input/titanic/train.csv")
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
modes = df.mode().iloc[0]
df.fillna(modes, inplace=True)
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [5]:
df['LogFare'] = np.log(df['Fare']+1)

In [6]:
df = pd.get_dummies(df, columns=["Sex","Pclass","Embarked"])
added_cols = ['Sex_male', 'Sex_female', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S']

In [7]:
t_dep = tensor(df.Survived)
indep_cols = ['Age', 'SibSp', 'Parch', 'LogFare'] + added_cols
df[indep_cols] = df[indep_cols].apply(lambda col: col.astype(int) if col.dtypes == 'bool' else col)
t_indep = tensor(df[indep_cols].values, dtype=torch.float)

In [8]:
trn_indep, val_indep, trn_dep, val_dep = train_test_split(
    t_indep, 
    t_dep, 
    test_size=0.2,
    random_state=42,
)

In [9]:
class titanicModel(torch.nn.Module):
    def __init__(self, input_dim):
        super(titanicModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()

        )

    def forward(self, x):
        return self.model(x)

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = titanicModel(trn_indep.shape[1]).to(device)

In [11]:
loss_fn = nn.MSELoss()

optimizer = optim.Adam(model.parameters(), lr=0.001)

In [12]:
def one_epoch():
    inputs = trn_indep.to(device)
    labels = trn_dep.float()[:,None].to(device)

    
    optimizer.zero_grad()
    
    outputs = model(inputs)
    loss = loss_fn(outputs, labels)
    loss.backward()
    optimizer.step()

    return loss.item()

In [13]:
def acc(labels, outputs): return (labels.bool()==(outputs>0.5)).float().mean()

In [14]:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('runs/titanicM_{}'.format(timestamp))

def train(epochs=75):
    losses = []

    for epoch in range(epochs):
        print(f'EPOCH {epoch + 1}')

        model.train()
        trnloss = one_epoch()

        losses.append(trnloss)


        model.eval()
        with torch.no_grad():
            vinputs = val_indep
            vlabels = val_dep.float()[:,None]
            voutputs = model(vinputs)
            vloss = loss_fn(voutputs, vlabels)

            accuracy = acc(vlabels, voutputs)


        print(f'LOSS train {trnloss} valid {vloss} vaccuarcy {accuracy}')
        
        writer.add_scalars('Training vs. Validation Loss',
                           { 'Training' : trnloss, 'Validation' : vloss, 'Validation accuracy': accuracy },
                           epoch + 1,)
        writer.flush()
        
    return losses

In [15]:
train()

EPOCH 1
LOSS train 0.23450608551502228 valid 0.23581378161907196 vaccuarcy 0.6033519506454468
EPOCH 2
LOSS train 0.2321273237466812 valid 0.23394189774990082 vaccuarcy 0.6033519506454468
EPOCH 3
LOSS train 0.23061829805374146 valid 0.2322687953710556 vaccuarcy 0.6145251393318176
EPOCH 4
LOSS train 0.2294193059206009 valid 0.23090769350528717 vaccuarcy 0.6145251393318176
EPOCH 5
LOSS train 0.22850961983203888 valid 0.22976267337799072 vaccuarcy 0.6145251393318176
EPOCH 6
LOSS train 0.22762347757816315 valid 0.22870109975337982 vaccuarcy 0.6145251393318176
EPOCH 7
LOSS train 0.2266264706850052 valid 0.22765256464481354 vaccuarcy 0.6145251393318176
EPOCH 8
LOSS train 0.225544273853302 valid 0.22654293477535248 vaccuarcy 0.6145251393318176
EPOCH 9
LOSS train 0.22444692254066467 valid 0.2254246175289154 vaccuarcy 0.6145251393318176
EPOCH 10
LOSS train 0.22342315316200256 valid 0.22426219284534454 vaccuarcy 0.6145251393318176
EPOCH 11
LOSS train 0.22242926061153412 valid 0.2231031209230423 v

[0.23450608551502228,
 0.2321273237466812,
 0.23061829805374146,
 0.2294193059206009,
 0.22850961983203888,
 0.22762347757816315,
 0.2266264706850052,
 0.225544273853302,
 0.22444692254066467,
 0.22342315316200256,
 0.22242926061153412,
 0.22139932215213776,
 0.22032426297664642,
 0.21922087669372559,
 0.2180919051170349,
 0.21693117916584015,
 0.21571534872055054,
 0.21443377435207367,
 0.21309511363506317,
 0.21172092854976654,
 0.21034032106399536,
 0.2089727520942688,
 0.20747381448745728,
 0.20592889189720154,
 0.2043580859899521,
 0.20276230573654175,
 0.20113207399845123,
 0.19943250715732574,
 0.19767062366008759,
 0.195859894156456,
 0.19398224353790283,
 0.19202296435832977,
 0.18987543880939484,
 0.18766291439533234,
 0.18556320667266846,
 0.18363845348358154,
 0.18174010515213013,
 0.17983686923980713,
 0.17792606353759766,
 0.1760212928056717,
 0.1741214096546173,
 0.1722317337989807,
 0.17034973204135895,
 0.16846594214439392,
 0.16659320890903473,
 0.16471391916275024,
 

In [16]:
tst_df = pd.read_csv("/kaggle/input/titanic/test.csv")
tst_df['Fare'] = tst_df.Fare.fillna(0)
tst_df.fillna(modes, inplace=True)
tst_df['LogFare'] = np.log(tst_df['Fare']+1)
tst_df = pd.get_dummies(tst_df, columns=["Sex","Pclass","Embarked"])
tst_df[indep_cols] = tst_df[indep_cols].apply(lambda col: col.astype(int) if col.dtypes == 'bool' else col)

tst_indep = tensor(tst_df[indep_cols].values, dtype=torch.float)
vals,indices = t_indep.max(dim=0)
tst_indep = tst_indep / vals

In [17]:
tst_df['Survived'] = (model(tst_indep)>0.5).int()
sub_df = tst_df[['PassengerId','Survived']]
sub_df.to_csv('submission.csv', index=False)

In [18]:
!head submission.csv

PassengerId,Survived
892,0
893,1
894,1
895,0
896,1
897,0
898,1
899,1
900,1


  pid, fd = os.forkpty()
