In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [90]:
import seaborn as sns
import io
import requests
import re
import warnings
import os
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

import matplotlib.pyplot as plt
%matplotlib inline
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*" and
# newer releases no longer ship the old names, so pick whichever one is installed
# and fall back to the default style if neither is available.
notebook_style = next(
    (name for name in ('seaborn-v0_8-notebook', 'seaborn-notebook')
     if name in plt.style.available),
    'default',
)
plt.style.use(notebook_style)

## ОБРАБОТКА ДАННЫХ

In [2]:
# Read the train and test CSV files and index both frames by PassengerId.
df_train = pd.read_csv(
    '../input/titanic-machine-learning-from-disaster/train.csv'
).set_index('PassengerId')
df_test = pd.read_csv(
    '../input/titanic-machine-learning-from-disaster/test.csv'
).set_index('PassengerId')

In [87]:
# Percentage of survivors among women, rounded to one decimal place.
women = df_train.loc[df_train['Sex'] == 'female', 'Survived']
rate_women = round(sum(women) * 100 / len(women), 1)

print("% of women survived:", rate_women)

# Same figure for men.
men = df_train.loc[df_train['Sex'] == 'male', 'Survived']
rate_men = round(sum(men) * 100 / len(men), 1)

print("% of men survived:", rate_men)

In [93]:
# Age histograms of survivors vs. non-survivors, one panel per sex.
# sns.distplot is deprecated, so the plain histograms are drawn with sns.histplot;
# alpha=0.4 keeps the two overlaid histograms readable.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))
women = df_train[df_train['Sex'] == 'female']
men = df_train[df_train['Sex'] == 'male']

for ax, group, title in zip(axes, (women, men), ('Female', 'Male')):
    sns.histplot(group.loc[group['Survived'] == 1, 'Age'].dropna(), bins=18,
                 label='survived', color="green", alpha=0.4, ax=ax)
    sns.histplot(group.loc[group['Survived'] == 0, 'Age'].dropna(), bins=40,
                 label='not_survived', color="red", alpha=0.4, ax=ax)
    ax.legend()
    ax.set_title(title)

In [3]:
# Columns kept for modelling.

target = ['Survived']
num_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
cat_cols = ['Sex', 'Embarked']
feature_cols = num_cols + cat_cols
cols = feature_cols + target

def create_data(df):
    """Return a copy of ``df`` restricted to the feature and target columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``cols``.

    Returns
    -------
    pandas.DataFrame
        Columns in the order ``num_cols + cat_cols + target``. The result is an
        independent copy, so assigning to its columns neither touches ``df`` nor
        raises SettingWithCopyWarning.
    """
    return df[cols].copy()

# Build the working frame from the training set (feature and target columns only).
data = create_data(df=df_train)

In [4]:
# Find the columns that contain missing values.
# The heading announces missing values, so print the number of NaNs per column
# rather than the number of non-null entries that .count() returns.

print('Пропуски данных в столбцах:')
for i in cols:
    print(i, data[i].isna().sum())

In [5]:
# Age and Embarked contain gaps.
# Missing ages are replaced with the column mean.
# Rows without an embarkation port are dropped (only two, per the original author's note).

mean_age = data['Age'].mean(axis=0)
# dropna() returns a new frame, so its result has to be assigned back; .assign()
# likewise builds a new frame instead of writing into a slice of df_train.
data = data.assign(Age=data['Age'].fillna(mean_age)).dropna(axis=0)
assert not data.isna().any().any(), "unexpected missing values after cleaning"
data.head()

In [6]:
# Build the feature matrix and extract the target variable.

def create_features(df):
    """Split ``df`` into ``(X, y)``.

    ``X`` holds the columns ``num_cols + cat_cols`` as a DataFrame and ``y`` is the
    ``target`` column flattened to a 1-D numpy array.
    """
    features = df[num_cols + cat_cols]
    labels = np.array(df[target]).ravel()
    return features, labels

X, y = create_features(df=data)
X.head()

## 1. ГРАДИЕНТНЫЙ БУСТИНГ ##

In [7]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

In [8]:
# One-hot encode the categorical columns (drop_first=True omits the first level of each).
# NOTE(review): X is overwritten, so re-running this cell on its own fails because the
# columns listed in cat_cols are no longer present after the first run.

X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

In [9]:
# Base gradient boosting classifier with a fixed seed; used as the grid-search estimator.
gbc = GradientBoostingClassifier(random_state=2)

In [18]:
# Hyperparameter grid explored for the gradient boosting model.
params = {
    'n_estimators': (7, 10, 15, 20, 30, 50),
    'max_depth': (3, 5, 6, 7, 8, 10),
}

# Exhaustive search over the grid with 5-fold cross-validation.
grid = GridSearchCV(estimator=gbc, param_grid=params, cv=5)
grid.fit(X, y)
grid.best_params_

In [19]:
# Train the model with the hyperparameters selected by the grid search. They are read
# from grid.best_params_ instead of being copied by hand, so this cell stays consistent
# with the search whenever the data or the grid changes.

gbc_best = GradientBoostingClassifier(random_state=2, **grid.best_params_)
gbc_best.fit(X, y)

# 5-fold cross-validation.

scores = cross_val_score(gbc_best, X, y, cv=5, scoring="accuracy")
print(scores.mean())
# estimator.score() is mean accuracy for a classifier, so it must not be reported as
# ROC AUC; the value printed here is the cross-validated ROC AUC.
roc_auc_scores = cross_val_score(gbc_best, X, y, cv=5, scoring="roc_auc")
print('ROC AUC: %0.3f' % roc_auc_scores.mean())

In [14]:
# NOTE(review): disabled hold-out evaluation, kept for reference only. It was stored as a
# bare string literal (which the cell echoed as its output) and uses X_train / y_train /
# y_test, none of which is defined in the visible part of this notebook.
# Pred_gbc_best_test = gbc_best.predict(X_test)
# Pred_gbc_best_train = gbc_best.predict(X_train)
# roc_auc_test = roc_auc_score(y_test, Pred_gbc_best_test)
# roc_auc_train = roc_auc_score(y_train, Pred_gbc_best_train)
# print('значение roc_auc на test: ', round(roc_auc_test, 4))
# print('значение roc_auc на train: ', round(roc_auc_train, 4))
# print('среднее значение roc_auc: ', round((roc_auc_test + roc_auc_train)/2, 4))

# ПРЕДСКАЗАНИЯ ДЛЯ ГРАДИЕНТНЫХ БУСТИНГОВ


In [20]:
# Model used by the gradient boosting submission cell below.
best_model = gbc_best

In [21]:
# Prepare the test features the same way as the training matrix X.
# .copy() makes X_test an independent frame, so the column assignments below do not
# write into a slice of df_test (which raises SettingWithCopyWarning).
X_test = df_test[num_cols + cat_cols].copy()
X_test['Age'] = X_test['Age'].fillna(mean_age)  # same mean age as used for training
X_test[num_cols] = X_test[num_cols].apply(pd.to_numeric, errors='coerce')
X_test = X_test.fillna(0)
X_test = pd.get_dummies(X_test, columns=cat_cols, drop_first=True)  # same encoding as X
# Guarantee the same columns, in the same order, as the matrix the model was fitted on.
X_test = X_test.reindex(columns=X.columns, fill_value=0)
#X_test_test = cat_to_int(X_test_test, cat_cols) # for CatBoost

In [22]:
# Assemble the gradient boosting submission (passenger id + prediction) and save it.
submission_gbc = pd.DataFrame({
    'PassengerId': X_test.index,
    'Survived': best_model.predict(X_test),
})
submission_gbc.to_csv('./submission_gbc.csv', index=False)

## 2.ЛОГИСТИЧЕСКАЯ РЕГРЕССИЯ

In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import roc_auc_score

In [24]:
# Scale the features.
# NOTE(review): despite its name, `pca` is a StandardScaler, not a PCA transform.

pca = StandardScaler()
X_ = pca.fit_transform(X)

In [25]:
# Candidate values for C: 100 evenly spaced points between 0.001 and 1000.

a = np.linspace(start=0.001, stop=1000, num=100)
logregcv = LogisticRegressionCV(
    Cs=a,
    cv=5,
    scoring='accuracy',
    fit_intercept=True,
    refit=True,
    max_iter=500,
    random_state=2,
)

In [26]:
# Run the cross-validated search over C on the standardized training features.
logregcv.fit(X_, y)

In [None]:
# Best value of C found by the cross-validated search.

c_best = logregcv.C_.max()
c_best

In [33]:
# Logistic regression configured with the selected regularisation strength c_best.
logreg_best = LogisticRegression(C=c_best, fit_intercept=True, random_state=2, max_iter=500)

In [44]:
# Cross-validation of the logistic regression with the selected C.

import time
import datetime

start_time = datetime.datetime.now()

# 5-fold cross-validation on the standardized matrix X_, the representation the model
# is tuned and fitted on. cross_val_score fits clones of the estimator, so this cell
# no longer requires logreg_best to be fitted beforehand.

scores = cross_val_score(logreg_best, X_, y, cv=5, scoring="accuracy")
print(scores.mean())
# estimator.score() is mean accuracy, not ROC AUC; report the cross-validated ROC AUC.
roc_auc_scores = cross_val_score(logreg_best, X_, y, cv=5, scoring="roc_auc")
print('ROC AUC: %0.3f' % roc_auc_scores.mean())

print('Time elapsed:', datetime.datetime.now() - start_time)

In [35]:
# Fit the final logistic regression on the standardized training features.
logreg_best.fit(X_, y)

## ПРЕДСКАЗАНИЯ ДЛЯ ЛОГ. РЕГРЕССИЙ


In [36]:
# Model used by the logistic regression submission cell below.
best_model = logreg_best

In [37]:
# Prepare the test features the same way as the training matrix X.
# .copy() avoids assigning into a slice of df_test (SettingWithCopyWarning).
X_test = df_test[num_cols + cat_cols].copy()
X_test['Age'] = X_test['Age'].fillna(mean_age)  # same mean age as used for training
X_test[num_cols] = X_test[num_cols].apply(pd.to_numeric, errors='coerce')
X_test = X_test.fillna(0)
X_test = pd.get_dummies(X_test, columns=cat_cols, drop_first=True)  # same encoding as X
# Guarantee the same columns, in the same order, as the matrix the scaler was fitted on.
X_test = X_test.reindex(columns=X.columns, fill_value=0)
# Reuse the scaler fitted on the training matrix. Calling fit_transform here would
# re-estimate mean/variance on the test set and feed the model differently scaled data.
X_test_ = pca.transform(X_test)
#X_test_test = cat_to_int(X_test_test, cat_cols) # for CatBoost

In [38]:
# Assemble the logistic regression submission (passenger id + prediction) and save it.
submission_logreg = pd.DataFrame({
    'PassengerId': X_test.index,
    'Survived': best_model.predict(X_test_),
})
submission_logreg.to_csv('./submission_logreg.csv', index=False)

## 3. МЕТОД БЛИЖАЙШИХ СОСЕДЕЙ

In [48]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

In [49]:
# Standardize the training features for the distance-based model.
# NOTE(review): despite its name, `pca` is a StandardScaler, not a PCA transform.
pca = StandardScaler()
X_ = pca.fit_transform(X)

In [57]:
# Hyperparameter grid and base estimator for k-nearest neighbours.

params = {
    'n_neighbors': (10, 14, 20, 30),
    'weights': ('distance', 'uniform'),
}
knnmodel = KNeighborsClassifier(n_jobs=-1)

In [58]:
# Grid search with 5-fold cross-validation on the standardized features.
grid_knn = GridSearchCV(estimator=knnmodel, param_grid=params, cv=5)
grid_knn.fit(X_, y)
grid_knn.best_params_

In [59]:
#Модель с найденными гиперпараметрами
knnmodel_best = KNeighborsClassifier(n_jobs = -1, n_neighbors=14, weights='uniform')
knnmodel_best.fit(X, y)

In [60]:
#Кросс-валидация по 5 фолдам

scores = cross_val_score(knnmodel_best, X, y, cv=5, scoring="accuracy")
print(scores.mean())
print ('ROC AUC: %0.3f' % knnmodel_best.score(X, y) )

## ПРЕДСКАЗАНИЕ ДЛЯ KNN

In [61]:
# Model used by the KNN submission cell below.
best_model = knnmodel_best

In [62]:
# Prepare the test features the same way as the training matrix X.
# .copy() avoids assigning into a slice of df_test (SettingWithCopyWarning).
X_test = df_test[num_cols + cat_cols].copy()
X_test['Age'] = X_test['Age'].fillna(mean_age)  # same mean age as used for training
X_test[num_cols] = X_test[num_cols].apply(pd.to_numeric, errors='coerce')
X_test = X_test.fillna(0)
X_test = pd.get_dummies(X_test, columns=cat_cols, drop_first=True)
# Guarantee the same columns, in the same order, as the matrix the scaler was fitted on.
X_test = X_test.reindex(columns=X.columns, fill_value=0)
# Reuse the scaler fitted on the training matrix. Calling fit_transform here would
# re-estimate mean/variance on the test set and feed the model differently scaled data.
X_test_ = pca.transform(X_test)

In [64]:
# Assemble the KNN submission (passenger id + prediction) and save it.
submission_knn = pd.DataFrame({
    'PassengerId': X_test.index,
    'Survived': best_model.predict(X_test_),
})
submission_knn.to_csv('./submission_knn.csv', index=False)

## 4. МЕТОД ОПОРНЫХ ВЕКТОРОВ

In [65]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler

In [66]:
# Standardize the training features for the SVM.
# NOTE(review): despite its name, `pca` is a StandardScaler, not a PCA transform.
pca = StandardScaler()
X_ = pca.fit_transform(X)

In [75]:
# Hyperparameter grid and base estimator for the support vector classifier.

params = {
    'C': (.01, .1, .5, .75, 1, 2, 3, 5, 10),
    'kernel': ('poly', 'rbf'),
    'degree': (1, 2, 3, 4),
}
svcmodel = SVC()

In [76]:
# Grid search with 5-fold cross-validation on the standardized features.
grid_svc = GridSearchCV(estimator=svcmodel, param_grid=params, cv=5)
grid_svc.fit(X_, y)
grid_svc.best_params_

In [77]:
#Модель с найденными гиперпараметрами
svcmodel_best = SVC(C=1, kernel='rbf', degree=1)
svcmodel_best.fit(X, y)

In [78]:
#Кросс-валидация по 5 фолдам

scores = cross_val_score(svcmodel_best, X, y, cv=5, scoring="accuracy")
print(scores.mean())
print ('ROC AUC: %0.3f' % svcmodel_best.score(X, y) )

## ПРЕДСКАЗАНИЯ ДЛЯ SVC

In [80]:
# Model used by the SVM submission cell below.
best_model = svcmodel_best

In [81]:
# Prepare the test features the same way as the training matrix X.
# .copy() avoids assigning into a slice of df_test (SettingWithCopyWarning).
X_test = df_test[num_cols + cat_cols].copy()
X_test['Age'] = X_test['Age'].fillna(mean_age)  # same mean age as used for training
X_test[num_cols] = X_test[num_cols].apply(pd.to_numeric, errors='coerce')
X_test = X_test.fillna(0)
X_test = pd.get_dummies(X_test, columns=cat_cols, drop_first=True)
# Guarantee the same columns, in the same order, as the matrix the scaler was fitted on.
X_test = X_test.reindex(columns=X.columns, fill_value=0)
# Reuse the scaler fitted on the training matrix. Calling fit_transform here would
# re-estimate mean/variance on the test set and feed the model differently scaled data.
X_test_ = pca.transform(X_test)

In [84]:
# Assemble the SVM submission (passenger id + prediction) and save it.
submission_scv = pd.DataFrame({
    'PassengerId': X_test.index,
    'Survived': best_model.predict(X_test_),
})
submission_scv.to_csv('./submission_scv.csv', index=False)