In [1]:
import pandas as pd
import numpy as np

In [2]:
train_data = pd.read_csv("./data/train.csv")
test_data = pd.read_csv("./data/test.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

In [4]:
features = ["Pclass", "Sex", "SibSp", "Parch", "Age", "Fare"]
X = train_data[features]
X.loc[X['Sex'] == 'male', ['Sex']] = 1
X.loc[X['Sex'] == 'female', ['Sex']] = -1
mean_age = X["Age"].mean(0)
X["Age"] = X["Age"].fillna(mean_age)
y = train_data["Survived"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Age"] = X["Age"].fillna(mean_age)


In [5]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, list_features, labels):
        self.labels = labels
        self.features = list_features

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        X = self.features[index]
        y = self.labels[index]

        return X, y

In [6]:
def normalize(arr):
    # arr - np.array
    # output - np.array
    
    mean = arr.mean()
    std = arr.std()
    
    output = arr - mean
    output /= std
    
    return output

In [25]:
class TitanicNet(nn.Module):
    def __init__(self):
        super(TitanicNet, self).__init__()
        self.fc1 = nn.Linear(len(X[0]), 1000)
        self.fc2 = nn.Linear(1000, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = nn.ReLU()(x)
        x = self.fc2(x)
        output = nn.Sigmoid()(x)
        return output

In [26]:
for col in ["Pclass", "SibSp", "Parch", "Age", "Fare"]:
    X[col] = normalize(X[col].to_numpy(np.float32))

In [27]:
X = X.to_numpy(np.float32)
y = y.to_numpy(np.float32)
train_dataset = Dataset(torch.Tensor(X[:700]),torch.Tensor(y[:700]))
val_dataset = Dataset(torch.Tensor(X[700:]),torch.Tensor(y[700:]))

In [28]:
training_generator = torch.utils.data.DataLoader(train_dataset, batch_size=61, num_workers=0)
valid_generator = torch.utils.data.DataLoader(val_dataset, batch_size=1, num_workers=0)

In [48]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TitanicNet()
model.to(device)

optimizer = optim.SGD(model.parameters(), lr=3e-4)
loss_fn = nn.MSELoss()
#scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')

In [50]:
for epoch in range(1000):
    model.train()
    for batch_idx, (data, target) in enumerate(training_generator):
        target = target.unsqueeze(1)
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = loss_fn(output, target)
        loss.backward()
        optimizer.step()
        
    model.eval()
    total = 0
    correct = 0
    for batch_idx, (data, target) in enumerate(valid_generator):
            target = target.unsqueeze(0)
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            correct += ((output > 0.5) == target)
            total += 1
    print(correct/total)

tensor([[0.7801]], device='cuda:0')
tensor([[0.7853]], device='cuda:0')
tensor([[0.7853]], device='cuda:0')
tensor([[0.7853]], device='cuda:0')
tensor([[0.7853]], device='cuda:0')
tensor([[0.7906]], device='cuda:0')
tensor([[0.7853]], device='cuda:0')
tensor([[0.7906]], device='cuda:0')
tensor([[0.7906]], device='cuda:0')
tensor([[0.7958]], device='cuda:0')
tensor([[0.7958]], device='cuda:0')
tensor([[0.7958]], device='cuda:0')
tensor([[0.7958]], device='cuda:0')
tensor([[0.8010]], device='cuda:0')
tensor([[0.8010]], device='cuda:0')
tensor([[0.8063]], device='cuda:0')
tensor([[0.8063]], device='cuda:0')
tensor([[0.8063]], device='cuda:0')
tensor([[0.8063]], device='cuda:0')
tensor([[0.8063]], device='cuda:0')
tensor([[0.8063]], device='cuda:0')
tensor([[0.8063]], device='cuda:0')
tensor([[0.8063]], device='cuda:0')
tensor([[0.8063]], device='cuda:0')
tensor([[0.8063]], device='cuda:0')
tensor([[0.8063]], device='cuda:0')
tensor([[0.8063]], device='cuda:0')
tensor([[0.8063]], device='c

In [None]:
model.train()
for batch_idx, (data, target) in enumerate(training_generator):
    #print(target)
    target = target.unsqueeze(1)
    #print(target)
    data, target = data.to(device), target.to(device)
    optimizer.zero_grad()
    output = model(data)
    loss = loss_fn(output, target)
    print(output)
    print(target)
    print(loss)
    loss.backward()
    optimizer.step()
    break

In [None]:
model.eval()
total = 0
correct = 0
for batch_idx, (data, target) in enumerate(training_generator):
        target = target.unsqueeze(0)
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        correct += ((output > 0.5) == target)
        total += 1
print(correct/total)

In [None]:
# output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
# output.to_csv('submission.csv', index=False)
# print("Your submission was successfully saved!")

In [57]:
from sklearn.ensemble import RandomForestClassifier

y = train_data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch", "Age", "Fare"]
X = pd.get_dummies(train_data[features])
mean_age = X["Age"].mean(0)
X["Age"] = X["Age"].fillna(mean_age)
X_test = pd.get_dummies(test_data[features])
X_test["Age"] = X_test["Age"].fillna(mean_age)
mean_fare = X["Fare"].mean(0)
X_test["Fare"] = X_test["Fare"].fillna(mean_fare)

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [46]:
model = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=60)
model.fit(X[:600], y[:600])
predictions = model.predict(X[600:])
sum(y[600:] == predictions)/len(y[600:])

0.8350515463917526

In [9]:
from sklearn.tree import DecisionTreeRegressor

# Define model. Specify a number for random_state to ensure same results each run
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit model
melbourne_model.fit(X[:600], y[:600])
predictions = melbourne_model.predict(X[600:])
sum(y[600:] == predictions)/len(y[600:])

0.7216494845360825

In [10]:
from sklearn import linear_model
reg = linear_model.LinearRegression()
reg.fit(X[:600], y[:600])
predictions = reg.predict(X[600:])
sum(y[600:] == predictions)/len(y[600:])

0.0

In [70]:
 from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(hidden_layer_sizes=(60, 240),random_state=3, max_iter=300)

clf.fit(X[:700], y[:700])
predictions = clf.predict(X[700:])
sum(y[700:] == (predictions > 0.5))/len(y[700:])

0.8324607329842932

In [71]:
predictions = clf.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [12]:
from sklearn import svm 

svc_clf = svm.LinearSVC()

svc_clf.fit(X[:600], y[:600])
predictions = svc_clf.predict(X[600:])
sum(y[600:] == (predictions > 0.5))/len(y[600:])




0.8006872852233677

In [76]:
from sklearn import neighbors

nbrs = neighbors.KNeighborsClassifier(n_neighbors=5)

nbrs.fit(X[:600], y[:600])
predictions = nbrs.predict(X[600:])
sum(y[600:] == (predictions > 0.5))/len(y[600:])

0.7457044673539519

In [23]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(loss="hinge", max_iter=90000000)

sgd_clf.fit(X[:600], y[:600])
predictions = sgd_clf.predict(X[600:])
sum(y[600:] == (predictions > 0.5))/len(y[600:])

0.6357388316151202

In [24]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

qda_clf = QuadraticDiscriminantAnalysis()

qda_clf.fit(X[:600], y[:600])
predictions = qda_clf.predict(X[600:])
sum(y[600:] == (predictions > 0.5))/len(y[600:])



0.6426116838487973