# Introduction #

Доброго дня!

Меня зовут **Васюхин Артём**, я студент XV когорты курса DS+ «Яндекс.Практикум». В этом проекте мы рассмотрим данные о сердечно-сосудистых заболеваниях (далее — ССЗ), которые продолжают оставаться главной причиной смертности во всём мире.

**Наша задача** — создать модель, которая на основе данных анализов предсказывала бы вероятность возникновения ССЗ у пациента.

## Import libraries and preliminary data check ##

In [None]:
from pickle import dump, load

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from tqdm import tqdm

In [None]:
# Read the train / test splits and the sample submission from the working directory.
train_data = pd.read_csv('train_hdp.csv')
test_data = pd.read_csv('test_hdp.csv')
subsample = pd.read_csv('sample_submission.csv')

# Preview the first rows of each split: train first, then test.
for frame in (train_data, test_data):
    display(frame.head())

In [None]:
# Histograms of the training columns as a first look at their distributions.
train_data.hist(figsize=(15,20));

# EDA #

In [None]:
# Column dtypes and non-null counts, followed by the first rows of the training frame.
train_data.info()
train_data.head()

In [None]:
# Cast the training `weight` column to int64 (if it is stored as float, the fractional part is truncated).
# NOTE(review): test_data['weight'] is not cast in any visible cell - confirm that is intended.
train_data['weight'] = train_data['weight'].astype('int64')
train_data.head()

In [None]:
# Box plots of all training columns on one axis, to spot gross outliers.
train_data.boxplot(figsize=(15,6));

## id ##

In [None]:
# The `id` column is not needed: remove it from both splits.
train_data = train_data.drop(columns='id')
test_data = test_data.drop(columns='id')

# Show the first five rows of each split after the drop.
for frame in (train_data, test_data):
    display(frame.head())

## age ##

In [None]:
def boxplot_col(column, data=None):
    """Draw a 10x5 box plot of `column` and return the axes it was drawn on.

    Parameters
    ----------
    column : str or list of str
        Column name(s) forwarded to ``DataFrame.boxplot``.
    data : pandas.DataFrame, optional
        Frame to plot. When omitted, the notebook-level ``train_data`` is
        looked up at call time, so the plot reflects the latest filtering.

    Returns
    -------
    The object returned by ``DataFrame.boxplot`` (a matplotlib Axes with the
    default ``return_type``), instead of the function object itself.
    """
    frame = train_data if data is None else data
    return frame.boxplot(column, figsize=(10, 5))

In [None]:
# Box plot of the raw `age` values (before the division by 365 further down).
boxplot_col('age');

In [None]:
# Inspect the rows whose raw age is below 12000.
train_data.query('age < 12000')

In [None]:
# Keep only rows with raw age strictly above 12000 (a row with age == 12000 is dropped as well).
train_data = train_data.query('age > 12000')

In [None]:
# Divide age by 365 and truncate to an integer in both splits
# (presumably days -> whole years; confirm the unit against the data description).
train_data['age'] = train_data['age'].div(365).astype('int')
test_data['age'] = test_data['age'].div(365).astype('int')
train_data

In [None]:
# Box plot of `age` after the conversion.
boxplot_col('age')

In [None]:
# Box plots of all training columns after the age step.
train_data.boxplot(figsize=(10,8))

## gender ##

In [None]:
# Recode gender from {1, 2} to {0, 1}.
# Both splits are recoded so the model sees the same encoding when it is
# trained and when it predicts; recoding only the training frame would leave
# the test frame on the old 1/2 codes.
# Run this cell once: a second run would map the new 1s to 0.
gender_codes = {1: 0, 2: 1}
train_data['gender'] = train_data['gender'].replace(gender_codes)
test_data['gender'] = test_data['gender'].replace(gender_codes)

In [None]:
# First 20 rows, to check the recoded gender column.
train_data.head(20)

## ap_hi ##

In [None]:
# Box plot of the raw `ap_hi` values.
boxplot_col('ap_hi');

In [None]:
# Inspect rows with ap_hi above 300.
train_data.query('ap_hi > 300')

In [None]:
# Rows with ap_hi below 300, largest values first.
train_data.query('ap_hi < 300').sort_values(by='ap_hi', ascending=False)

In [None]:
# Replace ap_hi with its absolute value in both splits, turning negative readings positive.
for frame in (train_data, test_data):
    frame['ap_hi'] = frame['ap_hi'].abs()

# List the rows that are still at or below zero after the sign fix.
train_data.query('ap_hi <= 0')

In [None]:
# Same view as before the sign fix: rows with ap_hi below 300, largest first.
train_data.query('ap_hi < 300').sort_values(by='ap_hi', ascending=False)

In [None]:
# Rows with ap_hi below 50, largest first.
train_data.query('ap_hi < 50').sort_values(by='ap_hi', ascending=False)

In [None]:
# Remove the three rows whose ap_hi values cannot be interpreted.
# The rows are dropped by index label from the whole frame; assigning
# `train_data['ap_hi'].drop(...)` back to the column would only write NaN
# into those rows instead of deleting them. errors='ignore' keeps the cell
# re-runnable once the labels are already gone.
train_data = train_data.drop(index=[8757, 42334, 5382], errors='ignore')

# Multiply ap_hi values below 50 by 10 in both splits.
train_low_hi = train_data['ap_hi'] < 50
train_data.loc[train_low_hi, 'ap_hi'] = train_data.loc[train_low_hi, 'ap_hi'] * 10
test_low_hi = test_data['ap_hi'] < 50
test_data.loc[test_low_hi, 'ap_hi'] = test_data.loc[test_low_hi, 'ap_hi'] * 10

# Keep only training rows with ap_hi strictly above 40.
# .copy() makes the result an independent frame so the column assignment
# below does not trigger SettingWithCopyWarning.
train_data = train_data.query('ap_hi > 40').copy()

# Store ap_hi as integers.
train_data['ap_hi'] = train_data['ap_hi'].astype('int64')
test_data['ap_hi'] = test_data['ap_hi'].astype('int64')

In [None]:
# Keep only rows with ap_hi below 210 (rows with ap_hi >= 210 are dropped).
train_data = train_data.query('ap_hi < 210')

In [None]:
# Box plot of `ap_hi` after the cleaning steps.
boxplot_col('ap_hi');

In [None]:
# Sorted descending and then tail(15): the 15 rows with the smallest ap_hi.
train_data.sort_values(by='ap_hi', ascending=False).tail(15)

**Ремарка**: здесь мы видим, что некоторые значения в ap_hi меньше, чем в ap_lo. Предположим, что они перепутаны местами, поэтому вернёмся к ним позже - после работы с колонкой ap_lo.

### Версия 2 ###

In [None]:
# Summary statistics of ap_hi.
train_data['ap_hi'].describe()

In [None]:
# Version 2 filter: keep rows with 40 <= ap_hi < 250, then redraw the box plot.
train_data = train_data.query('ap_hi < 250 & ap_hi >= 40')
boxplot_col('ap_hi');

## ap_lo ##

In [None]:
# Summary statistics of ap_lo.
train_data['ap_lo'].describe()

In [None]:
# Show the rows with ap_lo of 2000 or more, then draw the ap_lo box plot.
# The table goes through display() because a bare expression that is not the
# last statement of a notebook cell is evaluated and silently discarded.
display(train_data.query('ap_lo >= 2000'))
boxplot_col('ap_lo');

In [None]:
# Divide ap_lo readings above 500 by 100 in both splits.
train_lo_too_big = train_data['ap_lo'] > 500
train_data.loc[train_lo_too_big, 'ap_lo'] = train_data['ap_lo'] / 100

test_lo_too_big = test_data['ap_lo'] > 500
test_data.loc[test_lo_too_big, 'ap_lo'] = test_data['ap_lo'] / 100

# Check whether any training reading is still above 500 after the rescale.
train_data.query('ap_lo > 500')

In [None]:
# Box plot of `ap_lo` after the rescale of large readings.
boxplot_col('ap_lo')

In [None]:
# Inspect rows with a negative ap_lo.
train_data.query('ap_lo < 0')

In [None]:
# Replace ap_lo with its absolute value in both splits, turning negative readings positive.
for frame in (train_data, test_data):
    frame['ap_lo'] = frame['ap_lo'].abs()

In [None]:
# Inspect rows where ap_lo is exactly zero.
train_data.query('ap_lo == 0').sort_values(by='ap_lo', ascending=False)

In [None]:
# Keep only rows with a strictly positive ap_lo.
train_data = train_data.query('ap_lo > 0')

In [None]:
# Rows with ap_lo of 15 or less, largest first.
train_data.query('ap_lo <= 15').sort_values(by='ap_lo', ascending=False)

In [None]:
# Multiply ap_lo readings of 21 or less by 10 in both splits.
train_lo_too_small = train_data['ap_lo'] <= 21
train_data.loc[train_lo_too_small, 'ap_lo'] = train_data['ap_lo'] * 10

test_lo_too_small = test_data['ap_lo'] <= 21
test_data.loc[test_lo_too_small, 'ap_lo'] = test_data['ap_lo'] * 10

# Re-run the "15 or less" view to see what is left after the rescale.
train_data.query('ap_lo <= 15').sort_values('ap_lo', ascending=False)

In [None]:
# Rows with ap_lo of 50 or less, largest first.
train_data.query('ap_lo <= 50').sort_values(by='ap_lo', ascending=False)

In [None]:
# Keep only rows with ap_lo of at least 40.
train_data = train_data.query('ap_lo >= 40')

# What remains in the 40..50 band, largest first.
train_data.query('ap_lo <= 50').sort_values(by = 'ap_lo', ascending=False)

In [None]:
# Remove rows where ap_lo is 50 or less while ap_hi is above 100.
# The rows are filtered out directly: assigning an empty Series to the
# selection would only overwrite their ap_lo with NaN and leave the rows in
# the frame. .copy() gives an independent frame for later column assignments.
low_lo_high_hi = (train_data['ap_lo'] <= 50) & (train_data['ap_hi'] > 100)
train_data = train_data.loc[~low_lo_high_hi].copy()

In [None]:
# Re-check the rows with ap_lo of 50 or less.
train_data.query('ap_lo <= 50').sort_values(by = 'ap_lo', ascending=False)

**Теперь вернёмся к нашей гипотезе, при которой в некоторых случаях значения ap_hi и ap_lo перепутаны.**

In [None]:
# Rows where ap_hi is lower than ap_lo - the candidates for swapped values.
train_data.query('ap_hi < ap_lo')

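**TODO:** исправить строки, в которых ap_hi < ap_lo (например, поменять значения местами); пока что в варианте 1 ниже такие строки просто удаляются.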

In [None]:
# Option 1: keep only rows where ap_hi is strictly greater than ap_lo
# (rows with ap_lo > ap_hi and rows where the two are equal are dropped).
train_data = train_data.query('ap_hi > ap_lo')

In [None]:
# Box plot of `ap_lo` after removing the inconsistent rows.
boxplot_col('ap_lo')

In [None]:
# Keep only rows with ap_lo of 140 or less.
train_data = train_data.query('ap_lo <= 140')

In [None]:
# Box plots of all training columns after the blood-pressure filters.
train_data.boxplot(figsize=(10,8));

### Версия 2 ###

In [None]:
# Summary statistics of ap_lo after cleaning.
train_data['ap_lo'].describe()

In [None]:
# Version 2 filter: keep rows with 20 <= ap_lo <= 200.
train_data = train_data.query('ap_lo >= 20 & ap_lo <= 200')

# Redraw the ap_lo box plot on the filtered frame.
boxplot_col('ap_lo')

## height ##

In [None]:
# Box plot of `height` before any height filter.
boxplot_col('height')

In [None]:
# Rows with height above 215 or below 100, sorted by height.
train_data.query('height > 215 or height < 100').sort_values(by='height')

In [None]:
# Keep only rows with 120 < height < 225 (both bounds exclusive).
train_data = train_data.query('height > 120 & height < 225')

In [None]:
# Box plot of `height` after the filter.
boxplot_col('height')

In [None]:
# Cast ap_lo to int64 in the training frame (the earlier division by 100 can leave it as float).
train_data['ap_lo'] = train_data['ap_lo'].astype('int64')
train_data.head()

### Версия 2 ###

In [None]:
# Version 2 filter: keep rows with 120 <= height <= 215, then redraw the box plot.
train_data = train_data.query('height >= 120 & height <= 215')
boxplot_col('height')

## weight ##

In [None]:
# Row counts and dtypes before working on weight, then the weight box plot.
train_data.info()
boxplot_col('weight')

In [None]:
# Rows with weight of 150 or more and cardio == 0.
train_data.query('weight >= 150 & cardio == 0')

In [None]:
# Row counts and dtypes before the weight filter.
train_data.info()

In [None]:
# Remove rows with weight of 150 or more and cardio == 0.
# The rows are filtered out directly: assigning an empty DataFrame to the
# selection would overwrite every column of those rows with NaN, keep the
# rows in the frame and upcast the integer columns to float.
heavy_without_cardio = (train_data['weight'] >= 150) & (train_data['cardio'] == 0)
train_data = train_data.loc[~heavy_without_cardio].copy()

In [None]:
# Row counts and dtypes after the weight filter.
train_data.info()

In [None]:
# Re-run the same selection to check whether any such row is left.
train_data.query('weight >= 150 & cardio == 0')

In [None]:
# Box plot of `weight` after the filter.
boxplot_col('weight')

In [None]:
# Inspect rows with weight below 35.
train_data.query('weight < 35')

In [None]:
# Remove rows with height of 160 or more and weight below 45.
# The rows are filtered out directly: assigning an empty DataFrame to the
# selection would overwrite every column of those rows with NaN, keep the
# rows in the frame and upcast the integer columns to float.
light_for_height = (train_data['height'] >= 160) & (train_data['weight'] < 45)
train_data = train_data.loc[~light_for_height].copy()
train_data

In [None]:
# Row counts, non-null counts and dtypes after the height/weight filter.
train_data.info()

In [None]:
# Drop every row that still contains a missing value.
train_data = train_data.dropna()

In [None]:
# Columns stored as int64 in the training frame.
# `height` and `ap_lo` are intentionally not in this list (the original cell
# had them commented out), so their dtype is left as it is.
int_columns = [
    'age', 'gender',
    'weight', 'ap_hi',
    'cholesterol', 'gluc', 'smoke',
    'alco', 'active', 'cardio',
]
train_data[int_columns] = train_data[int_columns].astype('int64')

In [None]:
# Confirm the dtypes after the integer cast.
train_data.info()

In [None]:
# First rows of the cleaned training frame.
train_data.head()

In [None]:
# Box plots of all training columns after the weight step.
train_data.boxplot(figsize=(10,8))

In [None]:
# First rows of the test frame, for comparison with the training frame.
test_data.head()

### Версия 2 ###

In [None]:
# Version 2 filter: keep rows with 50 <= weight <= 105, then redraw the box plot.
train_data = train_data.query('weight >= 50 & weight <= 105')
boxplot_col('weight')

## total_eda ##

In [None]:
# Final look at both frames: the first training rows and the whole test frame.
display(train_data.head())
display(test_data)

In [None]:
# Box plots of all training columns at the end of the EDA.
train_data.boxplot(figsize=(10,8))

# Modeling #

In [None]:
# Separate the `cardio` target from the remaining feature columns.
target_train = train_data['cardio']
features_train = train_data.drop(columns='cardio')

# NOTE(review): presumably the random seed for later cells; it is not used in any visible cell.
RANDOM_SCORE = 69

In [None]:
# Fit the scaler on the training features, then transform those same features with it.
scaler = StandardScaler().fit(features_train)
features_train_scaled = scaler.transform(features_train)

In [None]:
# Serialize the `grid` object to disk with pickle.
# NOTE(review): `grid` is not defined in any visible cell - it has to exist before this cell runs.
with open("D:/workplace/data_science/pycharm/mvp_workshop/xgb_grid_clf.pcl", 'wb') as fid:
    dump(grid, fid)

In [None]:
# Load the pickled object back and show it.
# The result is bound to `loaded_grid` rather than `open`: rebinding `open`
# shadows the builtin and makes every later `open(...)` call in the notebook
# fail. The `with` block also closes the file handle.
# Only unpickle files you created yourself - pickle can run arbitrary code on load.
with open(r'D:\workplace\data_science\pycharm\mvp_workshop\xgb_grid_clf.pcl', 'rb') as fid:
    loaded_grid = load(fid)
loaded_grid

In [None]:
# Read the pickle in binary *read* mode and keep the result.
# Opening the file with 'wb' truncates it (destroying the saved object), and
# load() cannot read from a write-only handle.
# Only unpickle files you created yourself - pickle can run arbitrary code on load.
with open(r'D:\workplace\data_science\pycharm\mvp_workshop\xgb_grid_clf.pcl', 'rb') as fid:
    loaded_grid = load(fid)

In [None]:
# Read the pickle in binary *read* mode and keep the result.
# Opening the file with 'wb' truncates it (destroying the saved object), and
# load() cannot read from a write-only handle.
# Only unpickle files you created yourself - pickle can run arbitrary code on load.
with open(r'D:\workplace\data_science\pycharm\mvp_workshop\xgb_grid_clf.pcl', 'rb') as fid:
    loaded_grid = load(fid)