In [63]:
import os
import mlflow
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix



In [64]:
# Load the Titanic data from the CSV file
df = pd.read_csv('titanic.csv')

## Предварительная обработка данных

In [None]:
# Show the last five rows of the raw frame
df.tail(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [None]:
# Inspect the second-to-last column, then the frame's dimensions.
# NOTE(review): a notebook cell renders only its final expression, so the
# unique() and value_counts() results below are computed and discarded;
# only df.shape is displayed.
df.iloc[:,-2].unique()
df.iloc[:,-2].value_counts()
df.shape

(891, 12)

In [None]:
# Print each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
# Drop the columns that carry no useful information for us
uninformative_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
df.drop(columns=uninformative_columns, inplace=True)

In [None]:
# Show the first three rows
df.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S


In [None]:
# Turn positive and negative infinity into NaN, modifying the frame in place
infinite_values = [np.inf, -np.inf]
df.replace(infinite_values, np.nan, inplace=True)

In [None]:
# Missing-value counts for Fare and Embarked, followed by the frame's shape
df['Fare'].isna().sum(), df['Embarked'].isna().sum(), df.shape

(np.int64(0), np.int64(2), (891, 8))

In [None]:
# Keep only the rows where both Age and Embarked are present
df = df.dropna(subset=['Age', 'Embarked'])
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,29.1250,Q
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
889,1,1,male,26.0,0,0,30.0000,C


In [None]:
# Round the fare to a whole number and store it as an integer
# (not to two decimal places: round() is called without a digits argument).
df['Fare'] = round(df['Fare']).astype(int)
# astype(int) drops the fractional part of Age instead of rounding it
df['Age'] = df['Age'].astype(int)
df.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22,1,0,7,S
1,1,1,female,38,1,0,71,C
2,1,3,female,26,0,0,8,S
3,1,1,female,35,1,0,53,S
4,0,3,male,35,0,0,8,S


In [None]:
# Print each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Survived  712 non-null    int64 
 1   Pclass    712 non-null    int64 
 2   Sex       712 non-null    object
 3   Age       712 non-null    int64 
 4   SibSp     712 non-null    int64 
 5   Parch     712 non-null    int64 
 6   Fare      712 non-null    int64 
 7   Embarked  712 non-null    object
dtypes: int64(6), object(2)
memory usage: 50.1+ KB


Осталось закодировать числами категориальные столбцы Sex и Embarked (сейчас они имеют тип object).

In [None]:
# Distinct values present in the two remaining text columns
df['Sex'].unique(), df['Embarked'].unique()

(array(['male', 'female'], dtype=object), array(['S', 'C', 'Q'], dtype=object))

In [None]:
# Boolean masks are taken from the text columns before they are overwritten
sex_is_female = df['Sex'] == "female"
boarded_at_s = df['Embarked'] == "S"
boarded_at_c = df['Embarked'] == "C"

# Sex: "female" -> 0, any other value -> 1
df['Sex'] = np.where(sex_is_female, 0, 1)

# Embarked: "S" -> 0, "C" -> 1, any other value -> 2
code_when_not_s = np.where(boarded_at_c, 1, 2)
df['Embarked'] = np.where(boarded_at_s, 0, code_when_not_s)
print(df)

     Survived  Pclass  Sex  Age  SibSp  Parch  Fare  Embarked
0           0       3    1   22      1      0     7         0
1           1       1    0   38      1      0    71         1
2           1       3    0   26      0      0     8         0
3           1       1    0   35      1      0    53         0
4           0       3    1   35      0      0     8         0
..        ...     ...  ...  ...    ...    ...   ...       ...
885         0       3    0   39      0      5    29         2
886         0       2    1   27      0      0    13         0
887         1       1    0   19      0      0    30         0
889         1       1    1   26      0      0    30         1
890         0       3    1   32      0      0     8         2

[712 rows x 8 columns]


In [None]:
# Print each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   Survived  712 non-null    int64
 1   Pclass    712 non-null    int64
 2   Sex       712 non-null    int64
 3   Age       712 non-null    int64
 4   SibSp     712 non-null    int64
 5   Parch     712 non-null    int64
 6   Fare      712 non-null    int64
 7   Embarked  712 non-null    int64
dtypes: int64(8)
memory usage: 50.1 KB


In [None]:
# Write the cleaned frame to disk; the row index is not written to the file
clean_csv_path = 'titanic_clear.csv'
df.to_csv(clean_csv_path, index=False, encoding='utf-8', sep=',')

In [None]:
# Read the saved file back and display it
pd.read_csv('titanic_clear.csv')

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22,1,0,7,0
1,1,1,0,38,1,0,71,1
2,1,3,0,26,0,0,8,0
3,1,1,0,35,1,0,53,0
4,0,3,1,35,0,0,8,0
...,...,...,...,...,...,...,...,...
707,0,3,0,39,0,5,29,2
708,0,2,1,27,0,0,13,0
709,1,1,0,19,0,0,30,0
710,1,1,1,26,0,0,30,1


In [None]:
# Pairwise correlation between all columns of df
corr = df.corr()
# Render the matrix with a 'coolwarm' background gradient applied to the cells
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
Survived,1.0,-0.356462,-0.536762,-0.082377,-0.015523,0.095265,0.265888,0.108517
Pclass,-0.356462,1.0,0.150826,-0.366125,0.065187,0.023666,-0.553044,-0.108502
Sex,-0.536762,0.150826,1.0,0.098535,-0.106296,-0.249543,-0.18204,-0.097129
Age,-0.082377,-0.366125,0.098535,1.0,-0.30705,-0.188084,0.093519,0.011393
SibSp,-0.015523,0.065187,-0.106296,-0.30705,1.0,0.383338,0.139785,0.004021
Parch,0.095265,0.023666,-0.249543,-0.188084,0.383338,1.0,0.206402,-0.014082
Fare,0.265888,-0.553044,-0.18204,0.093519,0.139785,0.206402,1.0,0.176977
Embarked,0.108517,-0.108502,-0.097129,0.011393,0.004021,-0.014082,0.176977,1.0


Выживаемость (Survived) сильнее всего коррелирует с полом (−0,54), классом билета (−0,36) и стоимостью билета (0,27).

In [None]:
# Print each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   Survived  712 non-null    int64
 1   Pclass    712 non-null    int64
 2   Sex       712 non-null    int64
 3   Age       712 non-null    int64
 4   SibSp     712 non-null    int64
 5   Parch     712 non-null    int64
 6   Fare      712 non-null    int64
 7   Embarked  712 non-null    int64
dtypes: int64(8)
memory usage: 50.1 KB


Survived — факт выживания (1 = выжил, 0 = погиб).
Pclass — класс билета/социально-экономический статус (1 = высший, 2 = средний, 3 = низший).
Sex — пол пассажира (0 = женский, 1 = мужской).
Age — возраст пассажира в полных годах (строки с пропущенным возрастом удалены).
SibSp — число родных братьев/сестёр или супругов на борту.
Parch — число родителей/детей на борту.
Fare — стоимость проезда.
Embarked — порт посадки (0 = S, Саутгемптон; 1 = C, Шербур; 2 = Q, Куинстаун).

Все данные подготовлены и сохранены в файл titanic_clear.csv.