In [1]:
import pandas as pd

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Functions

# Do data cleaning
def clean_data(data):

    # Remove irrelevant data
    data = data.drop(["Ticket", "Cabin", "Name", "PassengerId"], axis=1)

    # Handle missing values
    cols = ["SibSp", "Parch", "Fare", "Age"]
    for col in cols:
        data[col].fillna(data[col].median(), inplace=True)

    # Remove duplicate values
    
    data.Embarked.fillna("U", inplace=True)

    return data

In [3]:
# Load data

train_data = pd.read_csv("./data/train.csv")
test_data = pd.read_csv("./data/test.csv")
test_ids = test_data["PassengerId"]

In [4]:
# Clean data
train_data = clean_data(train_data)
test_data = clean_data(test_data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

In [5]:
# Preprocessing

label_encoder = preprocessing.LabelEncoder()
cols = ["Sex", "Embarked"]

for col in cols:
    train_data[col] = label_encoder.fit_transform(train_data[col])
    test_data[col] = label_encoder.transform(test_data[col])

In [6]:
# Split Data for Train Set

y = train_data["Survived"]
x = train_data.drop("Survived", axis=1)

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)

In [7]:
# Create Machine Learning Classifier
clf = LogisticRegression(random_state=0, max_iter=1000).fit(x_train, y_train)

In [8]:
# Get predictions and Accuracy Score of Classifier in Train/Val set
predictions = clf.predict(x_val)
accuracy_score(y_val, predictions)

0.8100558659217877

In [9]:
submissions_preds = clf.predict(test_data)

In [10]:
test_data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,34.5,0,0,7.8292,1
1,3,0,47.0,1,0,7.0000,2
2,2,1,62.0,0,0,9.6875,1
3,3,1,27.0,0,0,8.6625,2
4,3,0,22.0,1,1,12.2875,2
...,...,...,...,...,...,...,...
413,3,1,27.0,0,0,8.0500,2
414,1,0,39.0,0,0,108.9000,0
415,3,1,38.5,0,0,7.2500,2
416,3,1,27.0,0,0,8.0500,2


In [11]:
df = pd.DataFrame({"PassengerId": test_ids.values,
                   "Survived": submissions_preds,
                  })

In [12]:
df.to_csv("submission.csv", index=False)