In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Load the training and test splits from CSV files in the working directory.
data_train, data_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [None]:
# Preview the first two rows of the raw training frame.
data_train.head(n=2)

In [None]:
# Preview the first two rows of the raw test frame.
data_test.head(n=2)

In [3]:
# Target labels for supervised training: the "Survived" column of the training frame.
y_train = data_train.loc[:, "Survived"]

In [4]:
# Columns that are not used as model features in either split.
test_cols_to_drop = ["PassengerId", "Name", "Ticket", "Cabin", "Embarked"]

# The training split additionally carries the label column, which must not
# remain in the feature matrix.
train_cols_to_drop = test_cols_to_drop.copy()
train_cols_to_drop.insert(1, "Survived")

In [5]:
# Build the feature matrices by removing the non-feature columns from each split.
X_train = data_train.drop(columns=train_cols_to_drop)
X_test = data_test.drop(columns=test_cols_to_drop)

In [6]:
# Encode sex as an integer feature: 1 for "male", 0 for anything else.
# The raw column is read from the loaded frames rather than from X_train/X_test
# themselves so the cell is idempotent: the previous form compared the already
# encoded integers against "male" on a second run and silently zeroed the column.
# X_train/X_test were produced by dropping columns only, so their row index
# still lines up with data_train/data_test.
X_train["Sex"] = data_train["Sex"].eq("male").astype(int)
X_test["Sex"] = data_test["Sex"].eq("male").astype(int)

In [7]:
# Impute missing values column by column using training-set means.
#
# Passing a single scalar to DataFrame.fillna fills the NaNs of *every* column
# with that scalar, so the previous form wrote the mean age into any other
# column that had gaps. A Series of per-column means fills each column with its
# own statistic instead.
#
# The same training-derived values are applied to the test split so both
# splits are preprocessed identically and no test-set statistics are used.
impute_values = X_train.mean(numeric_only=True)
X_train = X_train.fillna(impute_values)
X_test = X_test.fillna(impute_values)

In [None]:
# Preview the first two rows of the preprocessed training features.
X_train.head(n=2)

In [None]:
# Preview the first two rows of the preprocessed test features.
X_test.head(n=2)

## test with logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression baseline.
# The features are fed in unscaled, and the solver's default budget of 100
# iterations is often too small in that situation (it stops early with a
# ConvergenceWarning), so allow more iterations.
lr_model = LogisticRegression(max_iter=1000)

In [None]:
# Fit the logistic-regression model on the training features and labels.
lr_model.fit(X=X_train, y=y_train)

In [None]:
# Predict a label for every row of the test features.
lr_ans = lr_model.predict(X=X_test)

In [None]:
# Show the logistic-regression predictions as the cell output for a quick visual check.
lr_ans

In [None]:
# Pair each test PassengerId with its predicted "Survived" label.
lr_ans_df = data_test[["PassengerId"]].assign(Survived=lr_ans)

In [None]:
# Write the logistic-regression predictions to CSV without the row index.
# to_csv does not create missing directories, so make sure the target folder
# exists first; otherwise the write fails on a fresh checkout.
Path("outputs").mkdir(parents=True, exist_ok=True)
lr_ans_df.to_csv("outputs/ans_lr_1.csv", index=False)

## test with random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random-forest baseline.
# The forest is built with random bootstrapping and feature sampling, so pin
# the seed to make the fitted model and its predictions repeatable across runs.
rf_model = RandomForestClassifier(random_state=42)

In [None]:
# Fit the random-forest model on the training features and labels.
rf_model.fit(X=X_train, y=y_train)

In [None]:
# Predict a label for every row of the test features.
rf_ans = rf_model.predict(X=X_test)

In [None]:
# Show the random-forest predictions as the cell output for a quick visual check.
rf_ans

In [None]:
# Pair each test PassengerId with its predicted "Survived" label.
rf_ans_df = data_test[["PassengerId"]].assign(Survived=rf_ans)

In [None]:
# Write the random-forest predictions to CSV without the row index.
# to_csv does not create missing directories, so make sure the target folder
# exists first; otherwise the write fails on a fresh checkout.
Path("outputs").mkdir(parents=True, exist_ok=True)
rf_ans_df.to_csv("outputs/ans_rf_1.csv", index=False)

## test with TensorFlow

In [8]:
# Quieten TensorFlow's log output. This cell runs before the cell that
# imports tensorflow, so the setting is already in the environment by then.
import os

os.environ.update({"TF_CPP_MIN_LOG_LEVEL": "2"})

In [9]:
import tensorflow as tf

In [10]:
# Re-check the first two rows of the feature matrix before converting it to an array.
X_train.head(n=2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,1,22.0,1,0,7.25
1,1,0,38.0,1,0,71.2833


In [11]:
# Training features as a plain NumPy array for the Keras model.
X_train_arr = X_train.to_numpy(copy=False)

In [12]:
# Training labels as a plain NumPy array for the Keras model.
y_train_arr = y_train.to_numpy(copy=False)

In [13]:
# Fix TensorFlow's global seed so weight initialisation (and any TF-driven
# shuffling during training) starts from the same random state on every run.
tf.random.set_seed(42)

# Small feed-forward classifier: one 256-unit ReLU hidden layer followed by a
# two-unit softmax output. The predicted class is later taken as the argmax
# over those two outputs.
tf_model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation="relu"),
    tf.keras.layers.Dense(2, activation="softmax"),
])

In [14]:
# Configure training: Adam optimiser, sparse categorical cross-entropy loss
# (labels are passed as integer class ids), and accuracy as the reported metric.
tf_model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [15]:
# Train the network on the training arrays for 20 epochs.
tf_model.fit(x=X_train_arr, y=y_train_arr, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fdf9c18f670>

In [16]:
# Predict class scores for the test set and keep the index of the larger score
# in each row as the 0/1 label.
# The model was trained on a NumPy array, so feed it the test features in the
# same form instead of passing the DataFrame directly.
tf_ans = np.argmax(tf_model.predict(X_test.to_numpy()), axis=1)



In [17]:
# Show the TensorFlow model's predicted labels as the cell output for a quick visual check.
tf_ans

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [18]:
# Pair each test PassengerId with its predicted "Survived" label.
tf_ans_df = data_test[["PassengerId"]].assign(Survived=tf_ans)

In [19]:
# Write the TensorFlow predictions to CSV without the row index.
# to_csv does not create missing directories, so make sure the target folder
# exists first; otherwise the write fails on a fresh checkout.
Path("outputs").mkdir(parents=True, exist_ok=True)
tf_ans_df.to_csv("outputs/ans_tf_8.csv", index=False)