In [13]:
#Kaggle competition "Titanic - Machine Learning from Disaster"

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split


FEATURES = [
    "PassengerId","Pclass","Sex","Age","SibSp","Parch","Fare",
]


def preprocess(alldata):
    alldata["Sex"] = alldata["Sex"].apply(lambda x: 1 if x in ["female"] else 0)
    alldata["is_miss"] = alldata["Name"].apply(
    lambda x: 1 if "miss" in x.lower() else 0)

    alldata["is_mrs"] = alldata["Name"].apply(
        lambda x: 1 if "mrs" in x.lower() else 0)
    
    alldata=alldata.fillna(value={'Age':alldata["Age"].mean()})
    
    alldata["Cabin"] = alldata["Cabin"].apply(str)

    deck_types = ['A','B','C','D','E','F','G','T']
    select_column_names = []

    for i in deck_types:
        new_column_name = "deck{}".format(i)
        alldata[new_column_name] = alldata["Cabin"].apply(
        lambda x: 1 if i in x else 0)
        select_column_names.append(new_column_name)
    
    alldata.dropna(subset=["Embarked"], inplace=True)
    
    alldata["Embarked"] = alldata["Embarked"].apply(str)

    port_types = ['C','Q','S']

    for i in port_types:
        new_column_name = "is {}".format(i)
        alldata[new_column_name] = alldata["Embarked"].apply(
        lambda x: 1 if i in x else 0)
        
    alldata.drop("Name", axis=1, inplace=True)
    alldata.drop("Ticket", axis=1, inplace=True)
    alldata.drop("Cabin", axis=1, inplace=True)
    alldata.drop("Embarked", axis=1, inplace=True)
    
    return alldata

dataset_filename = "titanic.tsv"
  
alldata = pd.read_csv(
    dataset_filename,
    header=0,
    sep="\t",
    usecols=[
        "Survived","PassengerId","Pclass","Name","Sex","Age","SibSp","Parch","Ticket","Fare","Cabin","Embarked",
    ],
)

columns = alldata.columns[1:]
alldata = preprocess(alldata)
alldata = alldata[FEATURES + ["Survived"]]  

data_train, data_test = train_test_split(alldata, test_size=0.2)

y_train = pd.DataFrame(data_train["Survived"])
x_train = pd.DataFrame(data_train[FEATURES])
model = LogisticRegression() 
model.fit(x_train, y_train)

y_expected = pd.DataFrame(data_test["Survived"])
x_test = pd.DataFrame(data_test[FEATURES])
y_predicted = model.predict(x_test)

print(y_predicted) 


[1 0 0 0 1 1 1 0 0 0 0 0 1 1 1 0 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 1 0 1
 0 0 1 0 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0
 0 1 0 0 1 1 0 0 1 1 0 1 0 1 1 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 1 1
 0 1 1 0 0 0 1 1 1 0 1 0 0 0]


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
precision, recall, fscore, support = precision_recall_fscore_support(
    y_expected, y_predicted, average="micro"
)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F-score: {fscore}")

score = model.score(x_test, y_expected)

print(f"Model score: {score}")

Precision: 0.808
Recall: 0.808
F-score: 0.808
Model score: 0.808
