In [2]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default plot styling to every figure created below.
sns.set()

## Задаем пути

In [3]:
# Project root. Defaults to the original author's local checkout; set the
# EDA_CARDIO_PROJECT_FOLDER environment variable to run the notebook on
# another machine without editing this cell.
PROJECT_FOLDER: str = os.environ.get(
    'EDA_CARDIO_PROJECT_FOLDER',
    'P:\\Python Projects\\EDA_cardio',
)
DATASET_NAME: str = 'cardio_train.csv'
# Raw dataset location: <PROJECT_FOLDER>/src/raw/<DATASET_NAME>.
DATA_PATH: str = os.path.join(PROJECT_FOLDER, 'src', 'raw', DATASET_NAME)

# Output folder: <PROJECT_FOLDER>/src/visualizations
# (presumably where generated figures are saved -- not used in the visible cells).
RESULT_PATH: str = os.path.join(PROJECT_FOLDER, 'src', 'visualizations')

## Читаем данные

In [4]:
# Load the raw dataset; the file uses ';' as the field separator.
data = pd.read_csv(DATA_PATH, sep=';')

In [5]:
# Summary statistics, transposed so that each feature is shown as a row.
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,70000.0,49972.4199,28851.302323,0.0,25006.75,50001.5,74889.25,99999.0
age,70000.0,19468.865814,2467.251667,10798.0,17664.0,19703.0,21327.0,23713.0
gender,70000.0,1.349571,0.476838,1.0,1.0,1.0,2.0,2.0
height,70000.0,164.359229,8.210126,55.0,159.0,165.0,170.0,250.0
weight,70000.0,74.20569,14.395757,10.0,65.0,72.0,82.0,200.0
ap_hi,70000.0,128.817286,154.011419,-150.0,120.0,120.0,140.0,16020.0
ap_lo,70000.0,96.630414,188.47253,-70.0,80.0,80.0,90.0,11000.0
cholesterol,70000.0,1.366871,0.68025,1.0,1.0,1.0,2.0,3.0
gluc,70000.0,1.226457,0.57227,1.0,1.0,1.0,1.0,3.0
smoke,70000.0,0.088129,0.283484,0.0,0.0,0.0,0.0,1.0


# Фильтрация датафрейма. Удаление строк и столбцов

In [10]:
# Drop the target feature `cardio`; `data` itself is left unchanged.
dummy_df = data.drop(columns='cardio')
dummy_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,0,18393,2,168,62.0,110,80,1,1,0,0,1
1,1,20228,1,156,85.0,140,90,3,1,0,0,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0
3,3,17623,2,169,82.0,150,100,1,1,0,0,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0


In [7]:
# Drop the first 100 patients, i.e. the rows with index labels 0..99;
# `data` itself is left unchanged.
dummy_df = data.drop(index=np.arange(100))
dummy_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
100,135,14684,2,171,61.0,110,70,1,1,0,0,1,0
101,136,18718,1,167,80.0,190,90,2,1,0,1,0,0
102,137,17015,1,168,77.0,100,70,1,1,0,0,1,0
103,138,18128,2,183,95.0,125,80,1,3,1,0,1,0
104,139,21903,1,170,72.0,120,80,1,1,0,0,0,1


In [9]:
# Drop every patient shorter than 125 cm or taller than 200 cm
# (heights of exactly 125 and 200 are kept).
dummy_df = data.drop(
    index=data.index[data['height'].lt(125) | data['height'].gt(200)]
)
# Fraction of the original rows that survive the filter.
len(dummy_df) / len(data)

0.9987285714285714

# Добавление новых признаков

In [11]:
# Add a rescaled height column (height divided by 100).
# NOTE(review): `height` is in centimetres, so the values stored here are in
# metres even though the column is called `height_cm`; the name is kept as is
# because other cells may refer to it.
data['height_cm'] = data['height'].div(100)
data.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,height_cm
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,1.68
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,1.56
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,1.65
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,1.69
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,1.56


# Перекодировка значений признаков

In [12]:
# Lookup table (a plain Python dict) from the numeric cholesterol codes
# to text labels.
new_values = {
    1: 'low',
    2: 'normal',
    3: 'high',
}
# Recode via the lookup table; a code missing from the dict would become NaN.
data['dummy_cholesterol'] = data['cholesterol'].map(new_values)
data.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,height_cm,dummy_cholesterol
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,1.68,low
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,1.56,high
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,1.65,high
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,1.69,low
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,1.56,low


In [13]:
# Convert the target column to a boolean dtype (shown as False/True below).
data['cardio'] = data['cardio'].astype('bool')
data.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,height_cm,dummy_cholesterol
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,False,1.68,low
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,True,1.56,high
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,True,1.65,high
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,True,1.69,low
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,False,1.56,low
