# Dataset cleansing

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## 1. Load the (standardized) Data

In [2]:
patients = pd.read_csv("../../Data/standardized_patients.csv",index_col=0, header=0)
patients.dropna(inplace=True)
patients.head()

Unnamed: 0,sex,age,weight,height,HIPX,menopause,HRT,smoking,ReumatoidArthritis,SecondaryOsteoporsis,Alcohol,VitaminD,calcium,dose_walk,dose_moderate,dose_vigorous,Class
0,-1.029482,-1.95799,-0.197364,-2.200243,-0.037838,-0.695973,-0.423252,0.760768,-0.08895,-0.134216,-0.788571,-0.743995,1.49688,-0.726827,-0.611945,-0.168681,0
1,0.971362,1.145866,0.382817,0.302016,-0.037838,-0.695973,-0.423252,0.760768,-0.08895,-0.134216,1.643058,0.971448,-0.784237,-0.058927,-0.42517,-0.538945,0
2,-1.029482,0.276786,-1.102701,-1.221098,-0.037838,1.436838,2.362657,-0.80864,-0.08895,-0.134216,0.245512,-0.66058,-1.477415,0.608973,0.010639,-0.538945,0
3,-1.029482,0.897557,0.30631,-0.459541,-0.037838,1.436838,-0.423252,-0.80864,-0.08895,-0.134216,1.668693,0.358531,-0.54529,0.519919,2.525877,2.052902,0
4,-1.029482,0.40094,-1.274843,-0.459541,-0.037838,1.436838,-0.423252,-0.80864,-0.08895,-0.134216,-0.605467,-0.290654,-0.178603,-0.726827,-0.611945,-0.538945,0


In [3]:
patients = patients.sample(frac=1).reset_index(drop=True)

## 2. Create the X and y

In [4]:
X = patients.copy()
X.drop("Class", axis=1, inplace=True)
print(X.shape)

y = patients[['Class']].copy()
print(y.shape)

(153884, 16)
(153884, 1)


## 3. Split the data into Train, Test and Validation

The ratio in the splitting, in percentage, is 60/20/20

In [5]:
X_train_and_val, X_test, y_train_and_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_and_val, y_train_and_val, test_size=0.25, random_state=42)

In [6]:
y_val[y_val['Class']==1].shape

(56, 1)

In [7]:
y_train[y_train['Class']==1].shape

(161, 1)

In [8]:
y_test[y_test['Class']==1].shape

(61, 1)

##Â 4. Save to Files

In [9]:
patients.to_csv("../../Data/all_patients/standardized_patients.csv")

X_train.to_csv("../../Data/all_patients/X_train_total.csv")
y_train.to_csv("../../Data/all_patients/y_train_total.csv")

X_test.to_csv("../../Data/all_patients/X_test.csv")
y_test.to_csv("../../Data/all_patients/y_test.csv")

X_val.to_csv("../../Data/all_patients/X_val.csv")
y_val.to_csv("../../Data/all_patients/y_val.csv")