In [None]:
import pandas as pd

# Load the training CSV from the working directory and print a summary of its
# columns (dtypes and non-null counts) so missing values are visible up front.
titanic = pd.read_csv('train.csv')
titanic.info()

In [None]:
# Keep only the modelling columns (label last), discard every row that has a
# missing value in any of them, and renumber the surviving rows from 0.
titanic = (
    titanic[["Pclass", "Age", "SibSp", "Parch", "Fare", "Sex", "Embarked", "Survived"]]
    .dropna()
    .reset_index(drop=True)
)

In [None]:
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder

# Split the frame by dtype: object-typed columns are treated as categorical,
# everything else (minus the target column) as numerical.
categorical_features = titanic.select_dtypes(include=['object'])
numerical_features = titanic.select_dtypes(exclude=['object']).drop(columns='Survived')
label_features = titanic['Survived']

In [None]:
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder

# Rescale every numerical column with MinMaxScaler, fit on this same data.
# NOTE(review): the scaler and the encoder are fit on the full dataset, before
# the train/test split made in a later cell, so held-out rows influence the
# scaling — confirm that this leakage is acceptable.
numerical_features_arr = MinMaxScaler().fit_transform(numerical_features)
print (categorical_features)
# One-hot encode the categorical columns; .toarray() densifies the encoder output.
categorical_features_arr = OneHotEncoder().fit_transform(categorical_features).toarray()
print (categorical_features_arr)

In [None]:
# Assemble one frame in a single concat: the scaled numerical columns (named),
# the one-hot columns (left with default integer column labels), and the
# Survived label as the last column.
combined_features = pd.concat(
    [
        pd.DataFrame(data=numerical_features_arr, columns=numerical_features.columns),
        pd.DataFrame(data=categorical_features_arr),
        label_features,
    ],
    axis=1,
).reset_index(drop=True)

print (combined_features)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run. The print shows the two partition sizes.
train_data, test_data = train_test_split(combined_features, test_size=0.2, random_state=42)
print (len(train_data), len(test_data))

In [None]:
import torch.nn as nn
import torch.optim as optim

class TwoLayerNet(nn.Module):
    """Binary classifier with one hidden layer: Linear -> ReLU -> Linear -> Sigmoid.

    Args:
        D_in: number of input features per sample.
        H1: width of the hidden layer.
        D_out: number of outputs (1 for a single probability).
    """

    def __init__(self, D_in, H1, D_out):
        super().__init__()
        self.linear1 = nn.Linear(D_in, H1)   # input -> hidden (weight + bias)
        self.linear2 = nn.Linear(H1, D_out)  # hidden -> output (weight + bias)
        self.sigmoid = nn.Sigmoid()          # squashes the output into (0, 1) for BCELoss

    def forward(self, x):
        """Return sigmoid outputs of shape (..., D_out) for input of shape (..., D_in)."""
        # Go through nn.functional directly so the class does not depend on the
        # `F` alias, which is only imported in a later cell.
        hidden = nn.functional.relu(self.linear1(x))
        return self.sigmoid(self.linear2(hidden))

# Layer sizes: 10 input features -> 8 hidden units -> 1 output.
D_in = 10
H1 = 8
D_out = 1
lr = 0.01  # Adam step size

network = TwoLayerNet(D_in, H1, D_out)
criterion = nn.BCELoss()  # binary cross-entropy on the sigmoid outputs
optimizer = optim.Adam(network.parameters(), lr=lr)

In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Re-wrap the split result as a standalone DataFrame (same rows and columns).
train_data = pd.DataFrame(data=train_data, columns=train_data.columns)
loss_array = []

# Every step trains on the same first 500 training rows, so the tensors are
# built once here instead of being rebuilt on each of the 5000 iterations.
df_selected = train_data.iloc[:500]
X = df_selected.iloc[:, :-1].values  # every column except the last is a feature
y = df_selected.iloc[:, -1].values   # the last column is the Survived label
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

network.train()  # make sure training-mode behaviour is active
for i in range(5000):
    optimizer.zero_grad()
    # Call the module itself (not .forward) so nn.Module hooks are honoured.
    pred_y = network(X_tensor)
    # squeeze() drops the trailing size-1 dimension to match y_tensor's shape.
    loss = criterion(pred_y.squeeze(), y_tensor)
    loss_array.append(loss.item())

    # Backward propagation
    loss.backward()   # compute the gradients of the loss
    optimizer.step()  # update the parameters

plt.plot(loss_array)
plt.show()

In [None]:
# Score the trained network on the first 100 held-out rows.
# NOTE(review): test rows beyond the first 100 are not scored.
df_selected = test_data.iloc[:100]
X = df_selected.iloc[:, :-1].values
y = df_selected.iloc[:, -1].values
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

network.eval()         # inference-mode behaviour for layers such as dropout
with torch.no_grad():  # gradients are not needed for scoring
    pred_y = network(X_tensor)
# Since the answers are either 0 or 1, threshold at 0.5: >= 0.5 is 1 and < 0.5 is 0.
pred = (pred_y >= 0.5).float().squeeze(-1)

# Count matches over however many rows were actually selected; a hard-coded
# range(100) would raise IndexError on a smaller test set.
correct = int((pred == y_tensor).sum().item())
wrong = len(y_tensor) - correct

print (correct/(correct + wrong))

In [None]:
import torch.nn as nn
import torch.optim as optim

class MultiLayerNet(nn.Module):
    """Four-layer MLP: three ReLU hidden layers followed by a sigmoid output.

    Args:
        D_in: number of input features.
        H1, H2, H3: widths of the three hidden layers, in order.
        D_out: number of outputs.
    """

    def __init__(self, D_in, H1, H2, H3, D_out):
        super().__init__()
        # Layers are created in forward order.
        self.linear1 = nn.Linear(D_in, H1)
        self.linear2 = nn.Linear(H1, H2)
        self.linear3 = nn.Linear(H2, H3)
        self.linear4 = nn.Linear(H3, D_out)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Run x through the three hidden layers and return the sigmoid output."""
        hidden = x
        for layer in (self.linear1, self.linear2, self.linear3):
            hidden = F.relu(layer(hidden))
        return self.sigmoid(self.linear4(hidden))

# Layer sizes: 10 inputs -> 32 -> 16 -> 8 hidden units -> 1 output.
D_in = 10
H1, H2, H3 = 32, 16, 8
D_out = 1
lr = 0.01  # Adam step size

network = MultiLayerNet(D_in, H1, H2, H3, D_out)
criterion = nn.BCELoss()  # binary cross-entropy on the sigmoid outputs
optimizer = optim.Adam(network.parameters(), lr=lr)

In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Re-wrap the split result as a standalone DataFrame (same rows and columns).
train_data = pd.DataFrame(data=train_data, columns=train_data.columns)
loss_array = []

# Every step trains on the same first 500 training rows, so the tensors are
# built once here instead of being rebuilt on each of the 1000 iterations.
df_selected = train_data.iloc[:500]
X = df_selected.iloc[:, :-1].values  # every column except the last is a feature
y = df_selected.iloc[:, -1].values   # the last column is the Survived label
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

network.train()  # make sure training-mode behaviour is active
for i in range(1000):
    optimizer.zero_grad()
    # Call the module itself (not .forward) so nn.Module hooks are honoured.
    pred_y = network(X_tensor)
    # squeeze() drops the trailing size-1 dimension to match y_tensor's shape.
    loss = criterion(pred_y.squeeze(), y_tensor)
    loss_array.append(loss.item())

    # Backward propagation
    loss.backward()   # compute the gradients of the loss
    optimizer.step()  # update the parameters

plt.plot(loss_array)
plt.show()

In [None]:
import torch.nn as nn
import torch.optim as optim

class MultiLayerNet(nn.Module):
    """MLP with three ReLU hidden layers and a sigmoid output unit.

    Args:
        D_in: size of each input sample.
        H1, H2, H3: sizes of the successive hidden layers.
        D_out: size of the output.
    """

    def __init__(self, D_in, H1, H2, H3, D_out):
        super().__init__()
        self.linear1 = nn.Linear(D_in, H1)    # input    -> hidden 1
        self.linear2 = nn.Linear(H1, H2)      # hidden 1 -> hidden 2
        self.linear3 = nn.Linear(H2, H3)      # hidden 2 -> hidden 3
        self.linear4 = nn.Linear(H3, D_out)   # hidden 3 -> output
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid of the final linear layer applied to the ReLU stack."""
        first = F.relu(self.linear1(x))
        second = F.relu(self.linear2(first))
        third = F.relu(self.linear3(second))
        output = self.linear4(third)
        return self.sigmoid(output)

# Same architecture as the previous experiment, with a smaller learning rate.
D_in = 10
H1, H2, H3 = 32, 16, 8
D_out = 1
lr = 0.005  # Adam step size

network = MultiLayerNet(D_in, H1, H2, H3, D_out)
criterion = nn.BCELoss()  # binary cross-entropy on the sigmoid outputs
optimizer = optim.Adam(network.parameters(), lr=lr)

import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Re-wrap the split result as a standalone DataFrame (same rows and columns).
train_data = pd.DataFrame(data=train_data, columns=train_data.columns)
loss_array = []

# Every step trains on the same first 500 training rows, so the tensors are
# built once here instead of being rebuilt on each of the 1000 iterations.
df_selected = train_data.iloc[:500]
X = df_selected.iloc[:, :-1].values  # every column except the last is a feature
y = df_selected.iloc[:, -1].values   # the last column is the Survived label
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

network.train()  # make sure training-mode behaviour is active
for i in range(1000):
    optimizer.zero_grad()
    # Call the module itself (not .forward) so nn.Module hooks are honoured.
    pred_y = network(X_tensor)
    # squeeze() drops the trailing size-1 dimension to match y_tensor's shape.
    loss = criterion(pred_y.squeeze(), y_tensor)
    loss_array.append(loss.item())

    # Backward propagation
    loss.backward()   # compute the gradients of the loss
    optimizer.step()  # update the parameters

plt.plot(loss_array)
plt.show()

In [None]:
# Score the trained network on the first 100 held-out rows.
# NOTE(review): test rows beyond the first 100 are not scored.
df_selected = test_data.iloc[:100]
X = df_selected.iloc[:, :-1].values
y = df_selected.iloc[:, -1].values
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

network.eval()         # inference-mode behaviour for layers such as dropout
with torch.no_grad():  # gradients are not needed for scoring
    pred_y = network(X_tensor)
# Since the answers are either 0 or 1, threshold at 0.5: >= 0.5 is 1 and < 0.5 is 0.
pred = (pred_y >= 0.5).float().squeeze(-1)

# Count matches over however many rows were actually selected; a hard-coded
# range(100) would raise IndexError on a smaller test set.
correct = int((pred == y_tensor).sum().item())
wrong = len(y_tensor) - correct

print (correct/(correct + wrong))

In [None]:
import torch.nn as nn
import torch.optim as optim

class MultiLayerNetDP(nn.Module):
    """Four-layer MLP with dropout (p=0.3) after the second and third hidden layers.

    Args:
        D_in: number of input features.
        H1, H2, H3: widths of the three hidden layers, in order.
        D_out: number of outputs.
    """

    def __init__(self, D_in, H1, H2, H3, D_out):
        super().__init__()
        self.linear1 = nn.Linear(D_in, H1)
        self.linear2 = nn.Linear(H1, H2)
        self.linear3 = nn.Linear(H2, H3)
        self.linear4 = nn.Linear(H3, D_out)
        self.sigmoid = nn.Sigmoid()
        # A single Dropout module is shared by both dropout points in forward().
        self.dropout = nn.Dropout(p=0.3)

    def forward(self, x):
        """Return the sigmoid output; dropout follows hidden layers 2 and 3 only."""
        hidden = F.relu(self.linear1(x))
        hidden = self.dropout(F.relu(self.linear2(hidden)))
        hidden = self.dropout(F.relu(self.linear3(hidden)))
        return self.sigmoid(self.linear4(hidden))

D_in, H1, H2, H3, D_out = 10, 32, 16, 8, 1
lr = 0.005

# Instantiate the dropout variant defined in this cell (MultiLayerNetDP), not
# the plain MultiLayerNet, so that this experiment actually trains with dropout.
# NOTE: the model contains nn.Dropout; call network.eval() before scoring it.
network = MultiLayerNetDP(D_in, H1, H2, H3, D_out)
optimizer = optim.Adam(network.parameters(), lr=lr)
criterion = nn.BCELoss() # Define the loss function as Binary Cross-Entropy Loss

In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Re-wrap the split result as a standalone DataFrame (same rows and columns).
train_data = pd.DataFrame(data=train_data, columns=train_data.columns)
loss_array = []

# Every step trains on the same first 500 training rows, so the tensors are
# built once here instead of being rebuilt on each of the 5000 iterations.
df_selected = train_data.iloc[:500]
X = df_selected.iloc[:, :-1].values  # every column except the last is a feature
y = df_selected.iloc[:, -1].values   # the last column is the Survived label
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

network.train()  # training mode, so any dropout layers are active while fitting
for i in range(5000):
    optimizer.zero_grad()
    # Call the module itself (not .forward) so nn.Module hooks are honoured.
    pred_y = network(X_tensor)
    # squeeze() drops the trailing size-1 dimension to match y_tensor's shape.
    loss = criterion(pred_y.squeeze(), y_tensor)
    loss_array.append(loss.item())

    # Backward propagation
    loss.backward()   # compute the gradients of the loss
    optimizer.step()  # update the parameters

plt.plot(loss_array)
plt.show()

In [None]:
# Score the trained network on the first 100 held-out rows.
# NOTE(review): test rows beyond the first 100 are not scored.
df_selected = test_data.iloc[:100]
X = df_selected.iloc[:, :-1].values
y = df_selected.iloc[:, -1].values
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

# eval() switches dropout layers off so predictions are deterministic;
# no_grad() skips building the autograd graph, which scoring does not need.
network.eval()
with torch.no_grad():
    pred_y = network(X_tensor)
# Since the answers are either 0 or 1, threshold at 0.5: >= 0.5 is 1 and < 0.5 is 0.
pred = (pred_y >= 0.5).float().squeeze(-1)

# Count matches over however many rows were actually selected; a hard-coded
# range(100) would raise IndexError on a smaller test set.
correct = int((pred == y_tensor).sum().item())
wrong = len(y_tensor) - correct

print (correct/(correct + wrong))

## Feature Engineering

In [None]:
import seaborn as sns  # provides the heatmap / distribution plots used from this cell on

print (train_data)

# Pairwise correlations between the label, the numerical features and the
# five one-hot columns (integer-labelled 0-4), annotated to two decimals.
sns.heatmap(train_data[['Survived', 'Pclass', 'SibSp', 'Parch', 'Age', 'Fare', 0, 1, 2, 3, 4]].corr(), annot = True, fmt = '.2f', cmap = 'coolwarm')

#### Make fare less skewed

In [None]:
# Plot the distribution of the training Fare column; the legend label reports
# the column's skewness to two decimals.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases (histplot /
# displot are the replacements) — confirm against the installed version.
sns.distplot(train_data['Fare'], label = 'Skewness: %.2f'%(train_data['Fare'].skew()))
plt.legend(loc = 'best')
plt.title('Passenger Fare Distribution')

In [None]:
import numpy as np

# Replace Fare with its natural log (non-positive values are mapped to 0), then
# re-plot the distribution with the updated skewness in the legend.
# NOTE(review): Fare was already min-max scaled into [0, 1] in an earlier cell,
# so np.log(x) is <= 0 here and rows with a scaled fare of 0 (mapped to 0) end
# up with the largest transformed value — np.log1p may be what was intended;
# confirm. Re-running this cell applies the log a second time.
train_data['Fare'] = train_data['Fare'].map(lambda x: np.log(x) if x > 0 else 0)
sns.distplot(train_data['Fare'], label = 'Skewness: %.2f'%(train_data['Fare'].skew()))
plt.legend(loc = 'best')
plt.title('Passenger Fare Distribution')

In [None]:
from collections import Counter

def detect_outliers(df, n, features):
    """Return the index labels of rows that are IQR outliers in more than n features.

    For each column in `features`, a value is an outlier when it lies below
    Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR (Tukey's rule). The number of
    columns in which each row is flagged is counted, and the labels flagged
    more than `n` times are returned.

    Args:
        df: DataFrame holding the columns named in `features`.
        n: threshold; a row is returned only if it is an outlier in strictly
            more than `n` of the features.
        features: iterable of column names to inspect.

    Returns:
        List of index labels, in order of first detection.
    """
    outlier_counts = Counter()
    for col in features:
        values = df[col]
        # nanpercentile ignores missing values; plain percentile would return
        # NaN for such a column and silently flag nothing.
        q1, q3 = np.nanpercentile(values, [25, 75])
        outlier_step = 1.5 * (q3 - q1)
        is_outlier = (values < q1 - outlier_step) | (values > q3 + outlier_step)
        outlier_counts.update(values[is_outlier].index)
    return [label for label, count in outlier_counts.items() if count > n]

# Flag training rows that are IQR outliers in more than 2 of the listed features.
outliers_to_drop = detect_outliers(train_data, 2, ['Age', 'SibSp', 'Parch', 'Fare'])
print("We will drop these {} indices: ".format(len(outliers_to_drop)), outliers_to_drop)

# Show the flagged rows before removing them. A bare expression in the middle
# of a cell is evaluated and discarded, so it has to be printed explicitly.
print(train_data.loc[outliers_to_drop, :])
# NOTE: this reassigns train_data and renumbers it, so the cell is not idempotent.
train_data = train_data.drop(outliers_to_drop, axis = 0).reset_index(drop = True)

In [None]:
# Reduced training frame: remove the one-hot columns labelled 2, 3 and 4 and
# the SibSp column in a single drop call.
new_train_data = train_data.drop(columns=[2, 3, 4, 'SibSp'])


import torch.nn as nn
import torch.optim as optim

class TwoLayerNet(nn.Module):
    """Single-hidden-layer network with a ReLU hidden layer and a sigmoid output.

    Args:
        D_in: number of input features.
        H1: number of hidden units.
        D_out: number of outputs.
    """

    def __init__(self, D_in, H1, D_out):
        super().__init__()
        # Each nn.Linear owns its own weight matrix and bias vector.
        self.linear1 = nn.Linear(D_in, H1)
        self.linear2 = nn.Linear(H1, D_out)
        # Sigmoid activation for binary classification.
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid of the output layer applied to the ReLU hidden layer."""
        hidden_activation = F.relu(self.linear1(x))
        output_logit = self.linear2(hidden_activation)
        return self.sigmoid(output_logit)

# Layer sizes for the reduced feature set: 6 inputs -> 3 hidden units -> 1 output.
D_in = 6
H1 = 3
D_out = 1
lr = 0.005  # Adam step size

network = TwoLayerNet(D_in, H1, D_out)
criterion = nn.BCELoss()  # binary cross-entropy on the sigmoid outputs
optimizer = optim.Adam(network.parameters(), lr=lr)

import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Re-wrap the training frame as a standalone DataFrame (same rows and columns).
train_data = pd.DataFrame(data=train_data, columns=train_data.columns)
loss_array = []

# Every step trains on the same first 500 rows of the reduced frame, so the
# tensors are built once instead of being rebuilt on each of the 10000 iterations.
df_selected = new_train_data.iloc[:500]
X = df_selected.iloc[:, :-1].values  # every column except the last is a feature
y = df_selected.iloc[:, -1].values   # the last column is the Survived label
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

network.train()  # make sure training-mode behaviour is active
for i in range(10000):
    optimizer.zero_grad()
    # Call the module itself (not .forward) so nn.Module hooks are honoured.
    pred_y = network(X_tensor)
    # squeeze() drops the trailing size-1 dimension to match y_tensor's shape.
    loss = criterion(pred_y.squeeze(), y_tensor)
    loss_array.append(loss.item())

    # Backward propagation
    loss.backward()   # compute the gradients of the loss
    optimizer.step()  # update the parameters

plt.plot(loss_array)
plt.show()

In [None]:
# Apply to the test Fare column the same log transform used on the training
# data. assign() builds a new frame rather than writing a column into the
# frame returned by train_test_split, which can trigger pandas'
# SettingWithCopyWarning.
# NOTE: re-running this cell applies the log a second time.
test_data = test_data.assign(Fare=test_data['Fare'].map(lambda x: np.log(x) if x > 0 else 0))

# Remove the same columns that were dropped from the reduced training frame.
new_test_data = test_data.drop(columns=[2, 3, 4, 'SibSp'])


# Score the trained network on the first 100 rows of the reduced test frame.
# NOTE(review): test rows beyond the first 100 are not scored.
df_selected = new_test_data.iloc[:100]
X = df_selected.iloc[:, :-1].values
y = df_selected.iloc[:, -1].values
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

network.eval()         # inference-mode behaviour for layers such as dropout
with torch.no_grad():  # gradients are not needed for scoring
    pred_y = network(X_tensor)
# Since the answers are either 0 or 1, threshold at 0.5: >= 0.5 is 1 and < 0.5 is 0.
pred = (pred_y >= 0.5).float().squeeze(-1)

# Count matches over however many rows were actually selected; a hard-coded
# range(100) would raise IndexError on a smaller test set.
correct = int((pred == y_tensor).sum().item())
wrong = len(y_tensor) - correct

print (correct/(correct + wrong))

In [None]:
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier


# Build the train/test inputs shared by the scikit-learn baselines in this and
# the following cells. At this point train_data has had its Fare log-transformed
# and outlier rows removed, and test_data has had its Fare log-transformed,
# by earlier cells.
df_selected = train_data.iloc[:500]
X = df_selected.iloc[:, :-1].values
y = df_selected.iloc[:, -1].values
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

df_test_selected = test_data.iloc[:100]
X_test = df_test_selected.iloc[:, :-1].values
y_test = df_test_selected.iloc[:, -1].values
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)


# NOTE(review): torch tensors are handed straight to scikit-learn here; the
# NumPy arrays X / y / X_test / y_test above hold the same data — confirm
# whether the tensors are intentional.
logreg = LogisticRegression()
logreg.fit(X_tensor, y_tensor)
Y_pred = logreg.predict(X_test_tensor)
# Test accuracy as a percentage rounded to two decimals; shown as the cell output.
acc_log = round(logreg.score(X_test_tensor, y_test_tensor) * 100, 2)
acc_log

In [None]:
# Baseline: perceptron fitted on the same training tensors.
perceptron = Perceptron()
perceptron.fit(X_tensor, y_tensor)
# Predict on the held-out inputs, matching the logistic-regression cell.
Y_pred = perceptron.predict(X_test_tensor)
# Test accuracy as a percentage rounded to two decimals; shown as the cell output.
acc_perceptron = round(perceptron.score(X_test_tensor, y_test_tensor) * 100, 2)
acc_perceptron

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Baseline: decision tree. random_state is fixed so the reported accuracy is
# reproducible from run to run.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_tensor, y_tensor)
# Predict on the held-out inputs, matching the logistic-regression cell.
Y_pred = decision_tree.predict(X_test_tensor)
# Test accuracy as a percentage rounded to two decimals; shown as the cell output.
acc_decision_tree = round(decision_tree.score(X_test_tensor, y_test_tensor) * 100, 2)
acc_decision_tree

In [None]:
# Baseline: 100-tree random forest. random_state is fixed so the reported
# accuracy is reproducible from run to run.
random_forest = RandomForestClassifier(n_estimators = 100, random_state = 42)
random_forest.fit(X_tensor, y_tensor)
# Predict on the held-out inputs, matching the logistic-regression cell.
Y_pred = random_forest.predict(X_test_tensor)
# Test accuracy as a percentage rounded to two decimals; shown as the cell output.
acc_random_forest = round(random_forest.score(X_test_tensor, y_test_tensor) * 100, 2)
acc_random_forest