In [56]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.linear_model import LogisticRegression

# Step 1: Download Data

In [1]:
# !kaggle competitions download -c titanic

Downloading titanic.zip to C:\Users\peter\Documents\GitHub\Kaggle-Competition\Titanic




  0%|          | 0.00/34.1k [00:00<?, ?B/s]
100%|██████████| 34.1k/34.1k [00:00<00:00, 18.0MB/s]


# Step 2: Data Exploration

In [70]:
# Load the Kaggle training and test sets from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [71]:
# Preview the first five rows of the training set.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [73]:
# Count the missing (NaN) entries in each column of the training set.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [74]:
# Count the missing (NaN) entries in each column of the test set.
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

# Step 3: Data Cleaning

#### Drop the columns "Age", "Name", "SibSp", "Parch", "Ticket", "Cabin" and "Embarked" from both the training and test sets

In [75]:
# Drop the features this first model does not use, from both frames.
# The columns are passed via the `columns=` keyword: giving `axis` positionally
# (`drop(labels, 1)`) was deprecated in pandas 1.3 and raises a TypeError in
# pandas 2.0+.
columns_to_drop = ['Age', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked']
train = train.drop(columns=columns_to_drop)
test = test.drop(columns=columns_to_drop)

In [76]:
# Sanity-check the cleaned training frame: row count, remaining columns,
# and the number of missing values left in each column.
print(len(train), train.columns, train.isna().sum(), sep='\n')

891
Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Fare'], dtype='object')
PassengerId    0
Survived       0
Pclass         0
Sex            0
Fare           0
dtype: int64


# Step 4: Training

In [77]:
# Split the training set into a label vector and a feature matrix.
# 'Survived' is the target; 'PassengerId' is dropped with it so that only the
# remaining columns of `train` are used as features.
# `columns=` is used because pandas 2.0+ no longer accepts `axis` positionally.
train_label = train['Survived'].to_numpy()
train_data = train.drop(columns=['Survived', 'PassengerId']).to_numpy()

[[3 'male' 7.25]
 [1 'female' 71.2833]
 [3 'female' 7.925]
 ...
 [3 'female' 23.45]
 [1 'male' 30.0]
 [3 'male' 7.75]]


In [78]:
# Encode column 1 of the feature matrix in place: 'male' -> 0, 'female' -> 1.
# Both masks are computed before any value is overwritten; entries matching
# neither string are left untouched.
male_rows = train_data[:, 1] == 'male'
female_rows = train_data[:, 1] == 'female'
train_data[male_rows, 1] = 0
train_data[female_rows, 1] = 1

[[3 0 7.25]
 [1 1 71.2833]
 [3 1 7.925]
 ...
 [3 1 23.45]
 [1 0 30.0]
 [3 0 7.75]]


In [79]:
# Fit a logistic-regression classifier on the encoded training features.
# `LogisticRegression` lives in `sklearn.linear_model`; it must be imported in
# the imports cell at the top of the notebook, otherwise this cell raises a
# NameError on a fresh kernel (Restart & Run All).
# `random_state=0` is passed so repeated runs construct the estimator identically.
clf = LogisticRegression(random_state=0).fit(train_data, train_label)

In [80]:
# Score the fitted model on the same data it was trained on. This is an
# optimistic estimate: no held-out validation split is used here.
train_accuracy = clf.score(X=train_data, y=train_label)
train_accuracy

0.7867564534231201

# Step 5: Testing

In [82]:
# Build the test feature matrix from every column of `test` except the
# 'PassengerId' identifier. Rows stay in the same order as `test`.
# `columns=` is used because pandas 2.0+ no longer accepts `axis` positionally.
test_data = test.drop(columns=['PassengerId']).to_numpy()

[[3 'male' 7.8292]
 [3 'female' 7.0]
 [2 'male' 9.6875]
 ...
 [3 'male' 7.25]
 [3 'male' 8.05]
 [3 'male' 22.3583]]


In [85]:
# Encode column 1 of the test feature matrix in place: 'male' -> 0, 'female' -> 1.
# Both masks are computed before any value is overwritten; entries matching
# neither string are left untouched.
male_rows = test_data[:, 1] == 'male'
female_rows = test_data[:, 1] == 'female'
test_data[male_rows, 1] = 0
test_data[female_rows, 1] = 1

[[3 0 7.8292]
 [3 1 7.0]
 [2 0 9.6875]
 ...
 [3 0 7.25]
 [3 0 8.05]
 [3 0 22.3583]]


In [132]:
# Fill missing fares in the test feature matrix (column 2 of `test_data`).
#
# Each missing fare is replaced by the mean fare of the test passengers that
# share the same Pclass and Sex as that row, rather than always using the
# 3rd-class-male mean: that hard-coded choice is only right if every missing
# fare happens to belong to a 3rd-class male. For such a row the result is the
# same group mean as before.
#
# `test_data` was built from `test` without reordering, so row i of
# `test_data` corresponds to row i of `test`.
fare_group_mean = test.groupby(['Pclass', 'Sex'])['Fare'].transform('mean')

# If a whole (Pclass, Sex) group has no known fare, its mean is NaN; fall back
# to the overall mean fare so no NaN reaches the classifier.
fare_fill = fare_group_mean.fillna(test['Fare'].mean()).to_numpy()

# `pd.isna` is used because `test_data` has object dtype, which `np.isnan`
# does not accept.
fare_missing = pd.isna(test_data[:, 2])
test_data[fare_missing, 2] = fare_fill[fare_missing]

In [137]:
# Build the Kaggle submission file: one row per test passenger, holding the
# passenger id and the model's predicted 'Survived' value.
ids = test['PassengerId'].to_numpy()
results = clf.predict(test_data)
df = pd.DataFrame({'PassengerId': ids, 'Survived': results})
df.to_csv('submission.csv', index=False)

In [138]:
# !kaggle competitions submit -c titanic -f submission.csv -m "First Submission"

Successfully submitted to Titanic: Machine Learning from Disaster



  0%|          | 0.00/3.18k [00:00<?, ?B/s]
100%|██████████| 3.18k/3.18k [00:00<00:00, 12.1kB/s]
100%|██████████| 3.18k/3.18k [00:01<00:00, 2.65kB/s]
