# Dataset cleansing

In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## 1. Load the (standardized) Data

In [23]:
patients = pd.read_csv("../Data/standardized_patients.csv",index_col=0, header=0)
patients.dropna(inplace=True)
patients.head()

Unnamed: 0,sex,age,weight,height,HIPX,menopause,HRT,smoking,ReumatoidArthritis,SecondaryOsteoporsis,Alcohol,VitaminD,calcium,dose_walk,dose_moderate,dose_vigorous,Class
0,0.971362,0.276786,-0.012471,-0.785922,-0.037838,-0.695973,-0.423252,-0.80864,-0.08895,-0.134216,1.891623,-0.816529,-0.17696,-0.326087,2.376457,-0.538945,0
1,-1.029482,-0.219831,0.210675,-1.112304,-0.037838,1.436838,-0.423252,-0.80864,-0.08895,-0.134216,-0.091858,-0.812902,-1.312412,0.208233,-0.088975,-0.353813,0
2,-1.029482,0.40094,-0.796672,-0.350747,-0.037838,1.436838,2.362657,-0.80864,-0.08895,-0.134216,-0.788571,0.757471,-0.696793,-0.125717,-0.462525,-0.477234,0
3,0.971362,0.40094,0.210675,-0.024365,-0.037838,-0.695973,-0.423252,-0.80864,-0.08895,-0.134216,1.302484,-0.580792,-1.217106,-0.058927,0.284575,-0.538945,0
4,0.971362,-0.592293,-0.165486,-0.133159,-0.037838,-0.695973,-0.423252,-0.80864,-0.08895,-0.134216,-0.465849,-0.022276,-0.878177,0.675763,-0.088975,-0.292102,0


In [24]:
patients = patients.sample(frac=1).reset_index(drop=True)

## 2. Create the X and y

In [25]:
X = patients.copy()
X.drop("Class", axis=1, inplace=True)
print(X.shape)

y = patients[['Class']].copy()
print(y.shape)

(153884, 16)
(153884, 1)


## 3. Split the data into Train, Test and Validation

The ratio in the splitting, in percentage, is 60/20/20

In [26]:
X_train_and_val, X_test, y_train_and_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_and_val, y_train_and_val, test_size=0.25, random_state=42)

In [27]:
y_val[y_val['Class']==1].shape

(53, 1)

In [28]:
y_train[y_train['Class']==1].shape

(172, 1)

In [29]:
y_test[y_test['Class']==1].shape

(53, 1)

## 4. Save to Files

In [30]:
patients.to_csv("../Data/standardized_patients.csv")

X_train.to_csv("../Data/X_train_total.csv")
y_train.to_csv("../Data/y_train_total.csv")

X_test.to_csv("../Data/X_test.csv")
y_test.to_csv("../Data/y_test.csv")

X_val.to_csv("../Data/X_val.csv")
y_val.to_csv("../Data/y_val.csv")