### Titanic ML competition at Kaggle (https://www.kaggle.com/c/titanic/overview)

In [255]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [256]:
data = pd.read_csv("/Volumes/Docs/Programming/Kaggle/train.csv")

In [257]:
# посмотрим на данные
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [258]:
# удалим неинтересную информацию
# закодируем категориальные признаки
data = data.drop(["Name", "Cabin", "Ticket"], axis = 1)
data.Sex = data.Sex.map({'female':0, 'male':1})
data.Embarked = data.Embarked.map({'S':0, 'C':1, 'Q':2})

In [259]:
# посмотрим, что осталось
data

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.2500,0.0
1,2,1,1,0,38.0,1,0,71.2833,1.0
2,3,1,3,0,26.0,0,0,7.9250,0.0
3,4,1,1,0,35.0,1,0,53.1000,0.0
4,5,0,3,1,35.0,0,0,8.0500,0.0
...,...,...,...,...,...,...,...,...,...
886,887,0,2,1,27.0,0,0,13.0000,0.0
887,888,1,1,0,19.0,0,0,30.0000,0.0
888,889,0,3,0,,1,2,23.4500,0.0
889,890,1,1,1,26.0,0,0,30.0000,1.0


In [260]:
# посмотрим, сбалансированы ли классы
data.Survived.value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [261]:
# Посмотрим, есть ли пропуски данных
for column in data.columns:
    print('Столбец = {:<12}: количество пропусков = {}'.format(column, data[column].isna().sum()))

Столбец = PassengerId : количество пропусков = 0
Столбец = Survived    : количество пропусков = 0
Столбец = Pclass      : количество пропусков = 0
Столбец = Sex         : количество пропусков = 0
Столбец = Age         : количество пропусков = 177
Столбец = SibSp       : количество пропусков = 0
Столбец = Parch       : количество пропусков = 0
Столбец = Fare        : количество пропусков = 0
Столбец = Embarked    : количество пропусков = 2


In [262]:
# заполним пропуски - средним возрастом и модой порта
data.Age = data.Age.fillna(data.Age.mean())
# data.Embarked = data.Embarked.fillna(data.Embarked.mode()) - почему-то не работает
data.Embarked = data.Embarked.fillna(0) # мода столбца = 0

In [263]:
# Посмотрим, остались ли пропуски данных
for column in data.columns:
    print('Столбец = {:<12}: количество пропусков = {}'.format(column, data[column].isna().sum()))

Столбец = PassengerId : количество пропусков = 0
Столбец = Survived    : количество пропусков = 0
Столбец = Pclass      : количество пропусков = 0
Столбец = Sex         : количество пропусков = 0
Столбец = Age         : количество пропусков = 0
Столбец = SibSp       : количество пропусков = 0
Столбец = Parch       : количество пропусков = 0
Столбец = Fare        : количество пропусков = 0
Столбец = Embarked    : количество пропусков = 0


In [264]:
X = data.drop("Survived", axis = 1)
y = data.Survived

In [265]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, stratify = y)

### Применим LogisticRegression

In [266]:
lr = LogisticRegression(max_iter = 500)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.84      0.84       182
           1       0.75      0.75      0.75       113

    accuracy                           0.81       295
   macro avg       0.80      0.80      0.80       295
weighted avg       0.81      0.81      0.81       295

[[153  29]
 [ 28  85]]


### Сделаем предикт на отложенных данных - test.csv

In [267]:
data_test = pd.read_csv("/Volumes/Docs/Programming/Kaggle/test.csv")

In [268]:
# сделаем такой же препроцессинг данных
data_test = data_test.drop(["Name", "Cabin", "Ticket"], axis = 1)
data_test.Sex = data_test.Sex.map({'female':0, 'male':1})
data_test.Embarked = data_test.Embarked.map({'S':0, 'C':1, 'Q':2})
data_test.Age = data_test.Age.fillna(data_test.Age.mean())
data_test.Embarked = data_test.Embarked.fillna(0)
data_test.Fare = data_test.Fare.fillna(data_test.Fare.mean())

In [269]:
# Посмотрим, остались ли пропуски данных
for column in data_test.columns:
    print('Столбец = {:<12}: количество пропусков = {}'.format(column, data_test[column].isna().sum()))

Столбец = PassengerId : количество пропусков = 0
Столбец = Pclass      : количество пропусков = 0
Столбец = Sex         : количество пропусков = 0
Столбец = Age         : количество пропусков = 0
Столбец = SibSp       : количество пропусков = 0
Столбец = Parch       : количество пропусков = 0
Столбец = Fare        : количество пропусков = 0
Столбец = Embarked    : количество пропусков = 0


In [270]:
data_test['Survived'] = lr.predict(data_test)

In [271]:
data_test

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,892,3,1,34.50000,0,0,7.8292,2,0
1,893,3,0,47.00000,1,0,7.0000,0,0
2,894,2,1,62.00000,0,0,9.6875,2,0
3,895,3,1,27.00000,0,0,8.6625,0,0
4,896,3,0,22.00000,1,1,12.2875,0,1
...,...,...,...,...,...,...,...,...,...
413,1305,3,1,30.27259,0,0,8.0500,0,0
414,1306,1,0,39.00000,0,0,108.9000,1,1
415,1307,3,1,38.50000,0,0,7.2500,0,0
416,1308,3,1,30.27259,0,0,8.0500,0,0


In [272]:
data_test_dump = data_test.drop(['Pclass', 'Sex', 'Age', 'SibSp','Parch', 'Fare', 'Embarked'], axis = 1)

In [273]:
data_test_dump

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [274]:
data_test_dump.to_csv("/Volumes/Docs/Programming/Kaggle/data_dump.csv", index = False)