In [1]:
import pandas as pd
import numpy as np
import re
 
from tensorflow import keras
 
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense , Dropout, Flatten,BatchNormalization
from tensorflow.keras.optimizers import Adam
 
import plotly.graph_objects as go
import plotly.express as px

 
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

- Данный ноутбук использует API kaggle.
- Подробнее об использовании API [на github](https://github.com/Kaggle/kaggle-api)
- Данные также можно скачать на [странице конкурса](https://www.kaggle.com/c/titanic/data)

In [3]:
# List competitions through the Kaggle command-line client.
!kaggle competitions list

ref                                            deadline             category            reward  teamCount  userHasEntered  
---------------------------------------------  -------------------  ---------------  ---------  ---------  --------------  
contradictory-my-dear-watson                   2030-07-01 23:59:00  Getting Started     Prizes        292           False  
gan-getting-started                            2030-07-01 23:59:00  Getting Started     Prizes        130           False  
tpu-getting-started                            2030-06-03 23:59:00  Getting Started  Knowledge        370           False  
digit-recognizer                               2030-01-01 00:00:00  Getting Started  Knowledge       2481           False  
titanic                                        2030-01-01 00:00:00  Getting Started  Knowledge      18445            True  
house-prices-advanced-regression-techniques    2030-01-01 00:00:00  Getting Started  Knowledge       4855            True  
connectx

In [4]:
# Download the Titanic competition archive (titanic.zip) through the Kaggle CLI.
# NOTE(review): train.csv / test.csv read below are presumably extracted from this archive by hand — confirm.
!kaggle competitions download -c titanic

titanic.zip: Skipping, found more recently modified local copy (use --force to force download)


In [5]:
# Load the train and test splits from the working directory and report their shapes.
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
for loaded_frame in (train_data, test_data):
    print(loaded_frame.shape)

(891, 12)
(418, 11)


In [6]:
# Preview the first ten training rows.
train_data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


#### Описание полей:
- PassengerId - номер пассажира
- Survived - 1(выжил),0(утонул)
- Pclass - класс, которым путешествовал пассажир (1,2,3)
- Name - имя
- Sex - male, female
- Age - возраст
- SibSp - это число братьев, сестер или супругов на борту у этого пассажира
- Parch - количество родителей, детей (в том числе приемных) на борту у этого пассажира
- Ticket - номер билета
- Fare - плата за билет
- Cabin - номер каюты
- Embarked - порт посадки (C — Шербур; Q — Квинстаун; S — Саутгемптон)

In [7]:
# Column dtypes and non-null counts of the training frame.
train_data.info()

&lt;class &#39;pandas.core.frame.DataFrame&#39;&gt;
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
# Column dtypes and non-null counts of the test frame.
test_data.info()

&lt;class &#39;pandas.core.frame.DataFrame&#39;&gt;
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [9]:
# Remember the number of training rows and pull the target column out as a NumPy array.
len_train_data = train_data.shape[0]
y_train = train_data['Survived'].to_numpy()  # target variable
print(y_train.shape[0])
print(y_train[:10])

891
[0 1 1 1 0 0 0 0 1 1]


In [10]:
# Use PassengerId as the row index in both frames and remove the target column from the features.
train_data = train_data.set_index('PassengerId').drop(columns=['Survived'])

test_data = test_data.set_index('PassengerId')

train_data.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
# Preview the re-indexed test frame.
test_data.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [12]:
# Stack train and test into one frame so both receive identical preprocessing;
# Ticket and Cabin are not used as features.
all_data = pd.concat([train_data, test_data]).drop(columns=['Ticket', 'Cabin'])
print('Размер полученных данных:', all_data.shape)
all_data.head()

Размер полученных данных: (1309, 8)


Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


### Поиск признаков

In [13]:

# Family size: siblings/spouses + parents/children + the passenger themself
all_data['FamilySize'] = all_data[['SibSp', 'Parch']].sum(axis=1) + 1

# Travelling-alone flag: 1 when the family consists of the passenger only, otherwise 0
all_data['Is_alone'] = (~all_data['FamilySize'].gt(1)).astype('int64')

In [14]:
# Preview the frame with the new FamilySize / Is_alone columns.
all_data.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,Is_alone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,2,0
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,2,0
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,1,1
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,2,0
5,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,1,1


In [15]:
# Use a regular expression to pull the honorific (Mr, Mrs, Miss, ...) out of a passenger name.

def get_title(name):
    """Return the honorific embedded in ``name``, or "" when none is found.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g.
    "Braund, Mr. Owen Harris" -> "Mr".
    """
    # Raw string: "\." inside a normal literal is an invalid escape sequence
    # and triggers a SyntaxWarning on recent Python versions.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""

# Derive the Title column from every passenger name
passenger_names = all_data['Name']
all_data['Title'] = passenger_names.map(get_title)
all_data.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,Is_alone,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,2,0,Mr
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,2,0,Mrs
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,1,1,Miss
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,2,0,Mrs
5,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,1,1,Mr


In [16]:
# Inspect how often each extracted title occurs
all_data['Title'].value_counts()

Mr          757
Miss        260
Mrs         197
Master       61
Dr            8
Rev           8
Col           4
Ms            2
Mlle          2
Major         2
Dona          1
Jonkheer      1
Capt          1
Don           1
Mme           1
Sir           1
Countess      1
Lady          1
Name: Title, dtype: int64

In [17]:
# Collapse rare honorifics into one 'Rare' bucket and map alternate spellings onto the common titles
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_aliases = {rare_title: 'Rare' for rare_title in rare_titles}
title_aliases.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
all_data['Title'] = all_data['Title'].replace(title_aliases)

In [18]:
# Preview the frame after the titles were normalised.
all_data.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,Is_alone,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,2,0,Mr
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,2,0,Mrs
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,1,1,Miss
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,2,0,Mrs
5,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,1,1,Mr


In [19]:
# Remove the raw columns that the engineered features (FamilySize, Is_alone, Title) were built from
processed_columns = ['Parch', 'SibSp', 'Name']
all_data = all_data.drop(columns=processed_columns)

In [20]:
# Preview the frame after the processed columns were dropped.
all_data.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Fare,Embarked,FamilySize,Is_alone,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3,male,22.0,7.25,S,2,0,Mr
2,1,female,38.0,71.2833,C,2,0,Mrs
3,3,female,26.0,7.925,S,1,1,Miss
4,1,female,35.0,53.1,S,2,0,Mrs
5,3,male,35.0,8.05,S,1,1,Mr


In [21]:
# Distinct family sizes present in the data.
all_data['FamilySize'].unique()

array([ 2,  1,  5,  3,  7,  6,  4,  8, 11], dtype=int64)

In [22]:
# Distinct titles that remain after grouping.
all_data['Title'].unique()

array([&#39;Mr&#39;, &#39;Mrs&#39;, &#39;Miss&#39;, &#39;Master&#39;, &#39;Rare&#39;], dtype=object)

In [23]:
# Non-null counts per column: shows which columns still contain gaps.
all_data.info()

&lt;class &#39;pandas.core.frame.DataFrame&#39;&gt;
Int64Index: 1309 entries, 1 to 1309
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      1309 non-null   int64  
 1   Sex         1309 non-null   object 
 2   Age         1046 non-null   float64
 3   Fare        1308 non-null   float64
 4   Embarked    1307 non-null   object 
 5   FamilySize  1309 non-null   int64  
 6   Is_alone    1309 non-null   int64  
 7   Title       1309 non-null   object 
dtypes: float64(2), int64(3), object(3)
memory usage: 92.0+ KB


### Заполнение пропусков

In [24]:
# Fill gaps: arithmetic mean for Age, median for Fare
age_mean = all_data['Age'].mean()
fare_median = all_data['Fare'].median()
all_data = all_data.fillna({'Age': age_mean, 'Fare': fare_median})

In [25]:
# Encode the port of embarkation as a numeric code (S=0, C=1, Q=2).
map_dict_embarked = {'S': 0 , 'C': 1, 'Q':2}
all_data['Embarked'] = all_data['Embarked'].map(map_dict_embarked)
# Embarked is a nominal category, so the gaps are filled with the most frequent
# port (mode); a median over arbitrary integer codes carries no meaning.
most_common_port = all_data['Embarked'].mode().iloc[0]
all_data['Embarked']  = all_data['Embarked'].fillna(value = most_common_port)
all_data.info()

&lt;class &#39;pandas.core.frame.DataFrame&#39;&gt;
Int64Index: 1309 entries, 1 to 1309
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      1309 non-null   int64  
 1   Sex         1309 non-null   object 
 2   Age         1309 non-null   float64
 3   Fare        1309 non-null   float64
 4   Embarked    1309 non-null   float64
 5   FamilySize  1309 non-null   int64  
 6   Is_alone    1309 non-null   int64  
 7   Title       1309 non-null   object 
dtypes: float64(3), int64(3), object(2)
memory usage: 92.0+ KB


In [26]:
# Preview the frame after imputation and Embarked encoding.
all_data.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Fare,Embarked,FamilySize,Is_alone,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3,male,22.0,7.25,0.0,2,0,Mr
2,1,female,38.0,71.2833,1.0,2,0,Mrs
3,3,female,26.0,7.925,0.0,1,1,Miss
4,1,female,35.0,53.1,0.0,2,0,Mrs
5,3,male,35.0,8.05,0.0,1,1,Mr


### Нормирование

In [27]:
def my_scaler(df):
    """Min-max scale ``df`` into the [0, 1] range.

    Parameters
    ----------
    df : object exposing ``min()`` / ``max()`` and arithmetic operators
        (a pandas Series or DataFrame in this notebook).

    Returns
    -------
    ``(df - min) / (max - min)``; constant input (max == min) is mapped to 0
    instead of producing NaN from a 0/0 division.
    """
    lowest = df.min()
    span = df.max() - lowest
    if np.ndim(span) == 0:
        # Scalar range (Series / whole-array reduction)
        if span == 0:
            span = 1
    else:
        # Per-column ranges (DataFrame): neutralise only the zero-width columns
        span = span.where(span != 0, 1)
    return (df - lowest) / span
# Wrap the min-max function in a scikit-learn transformer so it exposes fit_transform
nn_scaler = preprocessing.FunctionTransformer(func=my_scaler)

In [28]:
# Rescale the continuous features with the min-max transformer
for numeric_feature in ('Age', 'Fare'):
    all_data[numeric_feature] = nn_scaler.fit_transform(all_data[numeric_feature])

In [29]:
# Preview the frame after Age and Fare were rescaled.
all_data.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Fare,Embarked,FamilySize,Is_alone,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3,male,0.273456,0.014151,0.0,2,0,Mr
2,1,female,0.473882,0.139136,1.0,2,0,Mrs
3,3,female,0.323563,0.015469,0.0,1,1,Miss
4,1,female,0.436302,0.103644,0.0,2,0,Mrs
5,3,male,0.436302,0.015713,0.0,1,1,Mr


### One-hot кодирование категориальных переменных

In [30]:
# One-hot encode the categorical features.
# dtype is pinned to uint8: newer pandas versions default to bool dummies, which would turn
# the mixed float/bool frame into an object array once it is converted for the network.
res_data_all = pd.get_dummies(
    all_data,
    columns=['Pclass', 'Sex', 'Embarked', 'FamilySize', 'Title'],
    dtype='uint8',
)

In [31]:
# Preview the one-hot encoded feature table.
res_data_all.head(10)

Unnamed: 0_level_0,Age,Fare,Is_alone,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_0.0,Embarked_1.0,...,FamilySize_5,FamilySize_6,FamilySize_7,FamilySize_8,FamilySize_11,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.273456,0.014151,0,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0
2,0.473882,0.139136,0,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,0.323563,0.015469,1,0,0,1,1,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,0.436302,0.103644,0,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,1,0
5,0.436302,0.015713,1,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0
6,0.37218,0.01651,1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
7,0.674308,0.101229,1,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0
8,0.022924,0.041136,0,0,0,1,0,1,1,0,...,1,0,0,0,0,1,0,0,0,0
9,0.336089,0.021731,0,0,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,1,0
10,0.173243,0.058694,0,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0


In [32]:
# Split the encoded matrix back into the original partitions (train, test).
# The explicit float64 conversion guarantees a purely numeric array even when
# the dummy columns are stored as bool.
X = res_data_all.to_numpy(dtype='float64')
x_train = X[:len_train_data]
x_test = X[len_train_data:]
# The training features must line up row-for-row with the target vector
assert len(x_train) == len(y_train), 'feature/target row counts differ'
print(x_train.shape)
print(x_test.shape)

(891, 25)
(418, 25)


In [33]:
# Cross-check the row counts against the original frames:
for original_frame in (train_data, test_data):
    print(original_frame.shape)

(891, 10)
(418, 10)


In [34]:
# x_train,x_test готовы для моделей машинного обучения

#### Сервисные функции для моделей ML

In [35]:
# Live training monitor: streams accuracy curves into a plotly figure widget
# (per the original note, the chart does not render in Colab yet).
class acc_mon(keras.callbacks.Callback):
    """Keras callback that plots train/validation accuracy and aborts hopeless runs.

    Parameters
    ----------
    f : figure object offering ``add_scatter``, ``update_layout`` and ``data``
        (a plotly FigureWidget in this notebook); it receives two line traces.
    e_stop : epoch index after which the accuracy floor is enforced.
    a_stop : minimum acceptable validation accuracy once past ``e_stop``;
        training is stopped when the value falls below it.
    """
    def __init__(self,f,e_stop=20,a_stop=0.60):
        # Let the base Callback initialise its own attributes
        super().__init__()
        self.x=[]    # epoch numbers seen so far
        self.y1=[]   # training accuracy per epoch
        self.y2=[]   # validation accuracy per epoch
        self.fw=f
        self.e_stop=e_stop
        self.a_stop=a_stop

    def _push_history(self):
        """Copy the accumulated epoch/accuracy lists into the two figure traces."""
        scat1=self.fw.data[0]
        scat1.y=self.y1
        scat1.x=self.x
        scat2=self.fw.data[1]
        scat2.y=self.y2
        scat2.x=self.x

    def on_train_begin(self, logs=None):
        """Create the two accuracy traces and style the figure."""
        print("Start training:")
        self.fw.add_scatter(x=self.x,y=self.y1,mode = 'lines',name = 'Train accuracy', line_color="green", showlegend = True)
        self.fw.add_scatter(x=self.x,y=self.y2,mode = 'lines',name = 'Validation accuracy', line_color="red", showlegend = True)
        self.fw.update_layout(title_text="Train and validation accuracy",
                  title_x = 0.45,
                  title_y= 0.9,
                  title_xanchor = "center",
                  title_yanchor = "bottom",template='gridon',
                  xaxis_title='Epoch', yaxis_title='Accuracy',width = 800, height = 500)
        self._push_history()

    def on_epoch_end(self, epoch, logs=None):
        """Log progress, enforce the accuracy floor and refresh the chart."""
        logs = logs or {}
        val_acc = logs.get('val_accuracy')
        # Guard against a missing metric: comparing None with a float raises TypeError
        if epoch > self.e_stop and val_acc is not None and val_acc < self.a_stop:
            print('Poor performance! Rebuild model!')
            print('-> epoch:',epoch,'acc:',round(logs['accuracy'],6),'val_acc:',round(val_acc,6))
            self.model.stop_training = True
        elif epoch > 0 and epoch % 10 == 0:
            # Every tenth epoch: dump all logged metrics on one line
            print('-> epoch:',epoch,' ',end='')
            for i in logs.keys():
                print(i,":",round(logs[i],8),' ',end='')
            print('')
        print('.', end='')
        self.x.append(epoch)
        self.y1.append(logs.get('accuracy'))
        self.y2.append(val_acc)
        self._push_history()
 


In [36]:
# Callback that lowers the learning rate when the validation loss stops improving
learn_rate = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=15,
)

In [37]:
# Callback that stops training when the validation loss has not improved for a long time
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=50,
    restore_best_weights=True,
)

In [38]:
def trainplot(hist): # plots the loss curves
  """Plot training and validation loss against the epoch number with plotly.

  hist: mapping or DataFrame providing the 'epoch', 'loss' and 'val_loss' sequences.
  The figure is displayed via ``fig.show()``; nothing is returned.
  """
  fig = go.Figure()

  # (history column, legend label, line colour) for each curve
  curves = (
    ('loss', 'Train error', 'green'),
    ('val_loss', 'Validation error', 'red'),
  )
  for column, label, colour in curves:
    fig.add_trace(
      go.Scatter(x=hist['epoch'],y=hist[column],mode = 'lines',name = label, line_color=colour, showlegend = True))

  # Title fixed: the curves show the training and the validation set
  # (the previous wording named the validation set twice).
  fig.update_layout(title_text="Зависимость ошибок на обучающей и проверочной <br> выборках от эпохи обучения",
                  title_x = 0.5,
                  title_y= 0.9,
                  title_xanchor = "center",
                  title_yanchor = "bottom",
                  xaxis_title='Эпоха', yaxis_title='Ошибка',width = 800, height = 500)
  fig.show()

### Построение моделей ML

In [39]:
# Builds a fresh, untrained network
def createModel(input_dim=25):
  """Create and compile the binary classifier.

  input_dim: number of input features; defaults to 25, the value that was
    previously hard-coded, so existing ``createModel()`` calls keep working.
    Pass ``x_train.shape[1]`` when the number of encoded columns changes.

  Returns the compiled Sequential model (binary cross-entropy loss, Adam
  optimizer, accuracy metric).
  """

  # Create the network
  model = Sequential()

  # Add the layers
  model.add(Dense(100, input_shape=(input_dim,), activation='sigmoid'))
  model.add(Dense(50,  activation='sigmoid'))

  model.add(Dropout(0.25))

  model.add(Dense(25,  activation='sigmoid'))

  # Single sigmoid unit: output is compared against the 0/1 target
  model.add(Dense(1,activation='sigmoid'))

  # Compile the network
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  return model # hand back the freshly built network

In [40]:
# Build an untrained network and print its layer summary
model = createModel()
model.summary()

Model: &quot;sequential&quot;
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               2600      
_________________________________________________________________
dense_1 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout (Dropout)            (None, 50)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 25)                1275      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 26        
Total params: 8,951
Trainable params: 8,951
Non-trainable params: 0
_________________________________________________________________


In [41]:
# Create the monitor widget that displays training progress
# (per the original note: does not work in Colab yet, works on a local machine)
fw1 = go.FigureWidget()
fw1

FigureWidget({
    &#39;data&#39;: [], &#39;layout&#39;: {&#39;template&#39;: &#39;...&#39;}
})

In [42]:
# Train the network
callbacks_list = [learn_rate, early_stop, acc_mon(fw1)]
fit_options = dict(
    batch_size=100,
    epochs=1000,
    validation_split=0.2,
    verbose=0,
    callbacks=callbacks_list,
)
report = model.fit(x_train, y_train, **fit_options)

Start training:
..........-&gt; epoch: 10  loss : 0.64121348  accuracy : 0.6320225  val_loss : 0.61602509  val_accuracy : 0.64245808  lr : 0.001  
..........-&gt; epoch: 20  loss : 0.49813625  accuracy : 0.79634833  val_loss : 0.44466162  val_accuracy : 0.82122904  lr : 0.001  
..........-&gt; epoch: 30  loss : 0.45055455  accuracy : 0.81320226  val_loss : 0.39400634  val_accuracy : 0.83798885  lr : 0.001  
..........-&gt; epoch: 40  loss : 0.43544114  accuracy : 0.81741571  val_loss : 0.3776333  val_accuracy : 0.83798885  lr : 0.001  
..........-&gt; epoch: 50  loss : 0.42301053  accuracy : 0.81741571  val_loss : 0.37099916  val_accuracy : 0.83798885  lr : 0.001  
..........-&gt; epoch: 60  loss : 0.41912362  accuracy : 0.82162923  val_loss : 0.36052766  val_accuracy : 0.84357542  lr : 0.001  
..........-&gt; epoch: 70  loss : 0.41923133  accuracy : 0.81460673  val_loss : 0.35920033  val_accuracy : 0.84916198  lr : 0.001  
..........-&gt; epoch: 80  loss : 0.41911423  accuracy : 0.825

In [43]:
# Store the training history in a DataFrame, with the epoch number as the last column
df_report = pd.DataFrame(report.history).assign(epoch=report.epoch)
df_report.tail()

Unnamed: 0,loss,accuracy,val_loss,val_accuracy,lr,epoch
172,0.402883,0.83427,0.344011,0.865922,1e-06,172
173,0.404267,0.837079,0.344013,0.865922,1e-06,173
174,0.398499,0.832865,0.344016,0.865922,1e-06,174
175,0.404728,0.827247,0.344016,0.865922,1e-06,175
176,0.401742,0.831461,0.344016,0.865922,1e-06,176


In [44]:
# Show the model's training curves
trainplot(df_report)

In [45]:
# Run the trained model on the test features.
y_predict = model.predict(x_test)

In [46]:
# Flatten the predictions to one value per passenger, then threshold at 0.5 to get 0/1 labels
flat_predictions = y_predict.reshape((len(y_predict),))
y_compet = (flat_predictions > 0.5).astype('int64')

In [47]:
# Display the predicted 0/1 labels for the test passengers.
y_compet

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [48]:
# Competition entry: pair each test PassengerId with its predicted label
pass_id = all_data.index.values[len_train_data:]
submit = pd.DataFrame({'PassengerId': pass_id, 'Survived': y_compet})
# Write the submission file
filename = 'team_submission.csv'
submit.to_csv(path_or_buf=filename, index=False)
print('Saved file: ' + filename)

Saved file: team_submission.csv


In [49]:
# Preview the first rows of the submission table.
submit.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [50]:
# Upload the prediction file to Kaggle
!kaggle competitions submit -c titanic -f team_submission.csv -m "test submission."

Successfully submitted to Titanic: Machine Learning from Disaster
  0%|          | 0.00/3.18k [00:00&lt;?, ?B/s]
100%|██████████| 3.18k/3.18k [00:00&lt;00:00, 19.3kB/s]
100%|██████████| 3.18k/3.18k [00:05&lt;00:00, 566B/s]  



Точность предсказания на kaggle составила: 0.77272