In [None]:
# Maintenance helper, intentionally left disabled: uncomment to write the
# packages of the current environment to requirements.txt.
#!pip freeze > requirements.txt

In [None]:
!pip install -r requirements.txt

# Preparazione del dataset

## Questo notebook ha il seguente obiettivo:  

a partire dal file di dati input ***heart_failure_clinical_records_dataset.train.csv***  

creare e scrivere su disco due file CSV: ***train.csv*** e ***test.csv***, che dovranno contenere i rispettivi dati di train e test che verranno utilizzati nel notebook di training

### Librerie utilizzate

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

Carichiamo il dataset

In [2]:
# Read the clinical records into a DataFrame; the path is relative to the
# notebook's working directory.
clinical_data = pd.read_csv(
    filepath_or_buffer='heart_failure_clinical_records_dataset.train.csv',
)

### Analisi preliminare del dataset

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
clinical_data.describe()

Unnamed: 0,DEATH_EVENT,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time
count,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0
mean,0.297071,61.059975,0.435146,577.083682,0.405858,38.263598,0.351464,262427.450209,1.387364,136.41841,0.644351,0.313808,132.297071
std,0.457927,11.999013,0.496817,946.937475,0.492088,11.651161,0.47843,94272.495618,0.994975,4.440152,0.479714,0.465013,78.350857
min,0.0,40.0,0.0,23.0,0.0,15.0,0.0,47000.0,0.5,113.0,0.0,0.0,4.0
25%,0.0,52.0,0.0,111.5,0.0,30.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.5
50%,0.0,60.0,0.0,245.0,0.0,38.0,0.0,263000.0,1.1,137.0,1.0,0.0,119.0
75%,1.0,69.5,1.0,582.0,1.0,45.0,1.0,303500.0,1.4,139.0,1.0,1.0,205.5
max,1.0,95.0,1.0,7861.0,1.0,80.0,1.0,742000.0,9.0,146.0,1.0,1.0,285.0


In [4]:
# Peek at the first five rows to sanity-check how the columns were parsed.
clinical_data.head(n=5)

Unnamed: 0,DEATH_EVENT,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time
0,1,75.0,1,246,0,15,0,127000.0,1.2,137,1,0,10
1,1,75.0,0,99,0,38,1,224000.0,2.5,134,1,0,162
2,1,60.667,1,104,1,30,0,389000.0,1.5,136,1,0,171
3,0,52.0,0,132,0,30,0,218000.0,0.7,136,1,1,112
4,1,94.0,0,582,1,38,1,263358.03,1.83,134,1,0,27


In [5]:
# Peek at the last five rows as well, to confirm the file was read to the end.
clinical_data.tail(n=5)

Unnamed: 0,DEATH_EVENT,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time
234,0,60.667,1,151,1,40,1,201000.0,1.0,136,0,0,172
235,0,58.0,0,582,1,35,0,122000.0,0.9,139,1,1,71
236,0,55.0,0,748,0,45,0,263000.0,1.3,137,1,0,88
237,0,44.0,0,582,1,30,1,263358.03,1.6,130,1,1,244
238,0,80.0,0,898,0,25,0,149000.0,1.1,144,1,1,87


In [6]:
# (number of rows, number of columns) of the loaded frame.
clinical_data.shape

(239, 13)

#### Controlliamo se ci sono valori nulli

In [7]:
# Single boolean: True if at least one cell anywhere in the frame is missing.
# First .any() reduces per column, second .any() reduces across columns.
clinical_data.isna().any().any()

False

In [8]:
# Class balance of the target: number of rows for each DEATH_EVENT value.
clinical_data['DEATH_EVENT'].value_counts()

0    168
1     71
Name: DEATH_EVENT, dtype: int64

In [9]:
# Per-column check: True for every column that contains at least one missing value.
clinical_data.isna().any()

DEATH_EVENT                 False
age                         False
anaemia                     False
creatinine_phosphokinase    False
diabetes                    False
ejection_fraction           False
high_blood_pressure         False
platelets                   False
serum_creatinine            False
serum_sodium                False
sex                         False
smoking                     False
time                        False
dtype: bool

#### Controlliamo quanti valori distinti ci sono per ciascuna feature

In [10]:
# Number of distinct values in each column; a count of 2 marks a binary column.
clinical_data.nunique()

DEATH_EVENT                   2
age                          47
anaemia                       2
creatinine_phosphokinase    175
diabetes                      2
ejection_fraction            16
high_blood_pressure           2
platelets                   153
serum_creatinine             38
serum_sodium                 26
sex                           2
smoking                       2
time                        132
dtype: int64

In [11]:
# Storage dtype pandas inferred for each column.
clinical_data.dtypes

DEATH_EVENT                   int64
age                         float64
anaemia                       int64
creatinine_phosphokinase      int64
diabetes                      int64
ejection_fraction             int64
high_blood_pressure           int64
platelets                   float64
serum_creatinine            float64
serum_sodium                  int64
sex                           int64
smoking                       int64
time                          int64
dtype: object

### Split del Dataset

#### Individuiamo target e feature

In [12]:
# Predictors: every column except the label.
X = clinical_data.drop(columns='DEATH_EVENT')
# Label: the DEATH_EVENT column on its own.
y = clinical_data['DEATH_EVENT']

#### Split dei dati

In [13]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
# stratify=y keeps the DEATH_EVENT class ratio (imbalanced: 168 vs 71 rows) the
# same in both partitions, so the small test set is not skewed by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

#### Riconcateniamo tutti i dati con target e feature

In [14]:
# Re-attach the label column to the features of each partition so that every
# output file carries both predictors and target.
training_set, test_set = (
    pd.concat(pair, axis=1)
    for pair in ((X_train, y_train), (X_test, y_test))
)

#### Procediamo ora con la creazione e la scrittura su disco dei file train.csv e test.csv

In [15]:
# Persist the training partition; index=False keeps the pandas row index out of the file.
training_set.to_csv(path_or_buf='train.csv', index=False)

In [16]:
# Persist the test partition; index=False keeps the pandas row index out of the file.
test_set.to_csv(path_or_buf='test.csv', index=False)