In [60]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.linear_model as lm
import sklearn.metrics as sm


# STEP 1: Load training data
titanic_data = pd.read_csv ("./Data/train.csv")

In [61]:
# STEP 2: Data Exploration

# shape
titanic_data.shape

(891, 12)

In [62]:
# names of variables
titanic_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [63]:
# missing data
titanic_data.isnull().sum ()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [64]:
# summary
titanic_data.describe ()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [65]:
# peek
titanic_data.head ()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [66]:
# Dealing with missing values
## Age
mean_age = titanic_data.Age.mean ()
titanic_data.Age.fillna (round(mean_age), inplace=True)

## Cabin
mode_cabin = titanic_data.Cabin.mode ()
titanic_data.Cabin.fillna (mode_cabin[2], inplace=True)

## Embarked
mode_embarked = titanic_data.Embarked.mode ()
titanic_data.Embarked.fillna (mode_embarked[0], inplace=True)
titanic_data.isnull().sum ()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [67]:
# Handle Categorical Data
# pd.get_dummies (titanic_data, prefix="Cat", columns=["Sex", "Embarked"])
titanic_data["Sex_factorized"] = pd.factorize (titanic_data["Sex"])[0]
titanic_data["Cabin_factorized"] = pd.factorize (titanic_data["Cabin"])[0]
titanic_data["Embarked_factorized"] = pd.factorize (titanic_data["Embarked"])[0]

titanic_data.head ()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_factorized,Cabin_factorized,Embarked_factorized
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,G6,S,0,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,1,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,G6,S,1,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,2,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,G6,S,0,0,0


In [68]:
# STEP 3: Identify prediction target and features
titanic_features = ["Pclass",
                    "Sex_factorized",
                    "Age",
                    "SibSp",
                    "Parch",
                    "Fare",
                    "Cabin_factorized",
                    "Embarked_factorized"]
y = titanic_data.Survived
X = titanic_data [titanic_features]


# STEP 4: Create a model
titanic_model = lm.LogisticRegression ()

# STEP 5: Train the model
titanic_model.fit (X, y)

titanic_data.head ()
titanic_model.predict (X.head ())

# STEP 5: Model Validation

# MAE
sm.mean_absolute_error (titanic_data.Survived, titanic_model.predict (X))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.20089786756453423

In [70]:
# STEP 6: underfitting and overfitting
def get_mae (C, train_X, train_y):
  model = lm.LogisticRegression (random_state=0, C=C)
  model.fit (train_X, train_y)
  train_pred = model.predict (train_X)
  mae = sm.mean_absolute_error (train_y, train_pred)
  return mae

for c in [5, 50, 500, 5000, 50000]:
  print (f"mae: ", get_mae (c, X, y))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

mae:  0.20426487093153758
mae:  0.19977553310886645
mae:  0.20426487093153758
mae:  0.20089786756453423
mae:  0.2031425364758698


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [74]:

titanic_model = lm.LogisticRegression (random_state=0, C=50)

titanic_model.fit (X, y)
# validation data
val_data = pd.read_csv ("./Data/test.csv")


mean_age = val_data.Age.mean ()
val_data.Age.fillna (round(mean_age), inplace=True)

mode_cabin = val_data.Cabin.mode ()
val_data.Cabin.fillna (mode_cabin[0], inplace=True)

mean_fare = val_data.Fare.mean ()
val_data.Fare.fillna (mean_fare, inplace=True)


def factorize_categorical_data (data):
  data["Sex_factorized"] = pd.factorize (data["Sex"])[0]
  data["Cabin_factorized"] = pd.factorize (data["Cabin"])[0]
  data["Embarked_factorized"] = pd.factorize (data["Embarked"])[0]
  
  return data


val_data = factorize_categorical_data (val_data)

val_X = val_data [titanic_features]

val_data.head ()
titanic_model.predict (val_X.head ())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([0, 0, 0, 0, 1])

In [91]:
val_data ["Survived"] = titanic_model.predict (val_X)
# titanic_model.predict (val_X)
val_data[["PassengerId", "Survived"]].to_csv ("./Solution/titanic_submission.csv", index=False)