# Dataset cleansing

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## 1. Load the (standardized) Data

In [2]:
patients = pd.read_csv("../../Data/standardized_patients.csv",index_col=0, header=0)
patients.dropna(inplace=True)
patients.head()

Unnamed: 0,sex,age,weight,height,HIPX,menopause,HRT,smoking,ReumatoidArthritis,SecondaryOsteoporsis,Alcohol,VitaminD,calcium,dose_walk,dose_moderate,dose_vigorous,Class
0,0.971362,1.642483,1.09051,0.628398,-0.037838,-0.695973,-0.423252,0.760768,-0.08895,-0.134216,-0.788571,-0.762128,-1.685343,-0.415141,-0.611945,-0.538945,0
1,-1.029482,-1.833836,-0.873179,0.084429,-0.037838,-0.695973,-0.423252,-0.80864,-0.08895,-0.134216,-0.788571,0.1373,-1.133682,0.275023,0.17251,-0.168681,0
2,0.971362,1.518328,0.408319,0.302016,-0.037838,-0.695973,-0.423252,-0.80864,-0.08895,-0.134216,2.536151,-0.265266,0.679982,-0.337219,-0.300653,-0.261247,0
3,-1.029482,-1.08891,1.581432,0.519604,-0.037838,-0.695973,-0.423252,2.330176,-0.08895,-0.134216,0.605313,2.730411,-0.870239,-0.058927,-0.611945,-0.538945,0
4,0.971362,0.028478,0.440197,1.063574,-0.037838,-0.695973,-0.423252,-0.80864,-0.08895,-0.134216,0.733944,0.159061,-0.610133,0.208233,1.479936,-0.168681,0


In [3]:
patients = patients.sample(frac=1).reset_index(drop=True)

## 2. Create the two dataset from the t-SNE analysis

### 2.1 Women in menopause condition

In [4]:
women_menopause = patients[patients['menopause']>1]

In [5]:
women_menopause.drop(columns=['sex','menopause'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [6]:
women_menopause.head()

Unnamed: 0,age,weight,height,HIPX,HRT,smoking,ReumatoidArthritis,SecondaryOsteoporsis,Alcohol,VitaminD,calcium,dose_walk,dose_moderate,dose_vigorous,Class
2,0.276786,-1.102701,-1.221098,-0.037838,2.362657,-0.80864,-0.08895,-0.134216,0.245512,-0.66058,-1.477415,0.608973,0.010639,-0.538945,0
3,0.897557,0.30631,-0.459541,-0.037838,-0.423252,-0.80864,-0.08895,-0.134216,1.668693,0.358531,-0.54529,0.519919,2.525877,2.052902,0
4,0.40094,-1.274843,-0.459541,-0.037838,-0.423252,-0.80864,-0.08895,-0.134216,-0.605467,-0.290654,-0.178603,-0.726827,-0.611945,-0.538945,0
6,-0.095676,-1.096326,-1.112304,-0.037838,-0.423252,0.760768,-0.08895,-0.134216,-0.376586,-0.602552,-0.288495,-0.259297,-0.462525,-0.538945,0
9,1.145866,-0.650033,-1.645394,-0.037838,-0.423252,-0.80864,-0.08895,-0.134216,-0.430144,-0.48287,-0.420179,-0.259297,-0.313105,-0.168681,0


#### Create the X and y

In [7]:
women_X = women_menopause.copy()
women_X.drop("Class", axis=1, inplace=True)
print(women_X.shape)

women_y = women_menopause[['Class']].copy()
print(women_y.shape)

(50215, 14)
(50215, 1)


#### Split the data into Train, Test and Validation

The ratio in the splitting, in percentage, is 60/20/20

In [8]:
women_X_train_and_val, women_X_test, women_y_train_and_val, women_y_test = train_test_split(women_X, women_y, test_size=0.2, random_state=42)
women_X_train, women_X_val, women_y_train, women_y_val = train_test_split(women_X_train_and_val, women_y_train_and_val, test_size=0.25, random_state=42)

In [9]:
women_y_val[women_y_val['Class']==1].shape

(31, 1)

In [10]:
women_y_train[women_y_train['Class']==1].shape

(66, 1)

In [11]:
women_y_test[women_y_test['Class']==1].shape

(31, 1)

#### Save to Files

In [12]:
women_X_train.to_csv("../../Data/women_menopause/X_train_total.csv")
women_y_train.to_csv("../../Data/women_menopause/y_train_total.csv")

women_X_test.to_csv("../../Data/women_menopause/X_test.csv")
women_y_test.to_csv("../../Data/women_menopause/y_test.csv")

women_X_val.to_csv("../../Data/women_menopause/X_val.csv")
women_y_val.to_csv("../../Data/women_menopause/y_val.csv")

### 2.2 Other patients

In [13]:
other_patients = patients[patients['menopause']<1]

In [14]:
other_patients.drop(columns=['menopause','HRT'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [15]:
other_patients.head()

Unnamed: 0,sex,age,weight,height,HIPX,smoking,ReumatoidArthritis,SecondaryOsteoporsis,Alcohol,VitaminD,calcium,dose_walk,dose_moderate,dose_vigorous,Class
0,-1.029482,-1.95799,-0.197364,-2.200243,-0.037838,0.760768,-0.08895,-0.134216,-0.788571,-0.743995,1.49688,-0.726827,-0.611945,-0.168681,0
1,0.971362,1.145866,0.382817,0.302016,-0.037838,0.760768,-0.08895,-0.134216,1.643058,0.971448,-0.784237,-0.058927,-0.42517,-0.538945,0
5,-1.029482,-1.08891,-2.039916,-1.166701,-0.037838,2.330176,-0.08895,-0.134216,-0.788571,-0.986985,-0.803449,0.208233,-0.462525,-0.353813,0
7,0.971362,0.152632,0.236178,-0.568335,-0.037838,-0.80864,-0.08895,-0.134216,-0.788571,-0.631566,-1.084209,-0.103454,-0.462525,-0.538945,1
8,0.971362,-1.95799,-0.414135,0.737192,-0.037838,-0.80864,-0.08895,-0.134216,-0.788571,-0.747621,0.59987,-0.715696,-0.512332,-0.292102,0


#### Create the X and y

In [16]:
other_patients_X = other_patients.copy()
other_patients_X.drop("Class", axis=1, inplace=True)
print(other_patients.shape)

other_patients_y = other_patients[['Class']].copy()
print(other_patients_y.shape)

(103669, 15)
(103669, 1)


#### Split the data into Train, Test and Validation

The ratio in the splitting, in percentage, is 60/20/20

In [17]:
other_patients_X_train_and_val, other_patients_X_test, other_patients_y_train_and_val, other_patients_y_test = train_test_split(other_patients_X, other_patients_y, test_size=0.2, random_state=42)
other_patients_X_train, other_patients_X_val, other_patients_y_train, other_patients_y_val = train_test_split(other_patients_X_train_and_val, other_patients_y_train_and_val, test_size=0.25, random_state=42)

In [18]:
other_patients_y_val[other_patients_y_val['Class']==1].shape

(22, 1)

In [19]:
other_patients_y_train[other_patients_y_train['Class']==1].shape

(100, 1)

In [20]:
other_patients_y_test[other_patients_y_test['Class']==1].shape

(28, 1)

#### Save to Files

In [21]:
patients.to_csv("../../Data/standardized_patients.csv")

other_patients_X_train.to_csv("../../Data/other_patients/X_train_total.csv")
other_patients_y_train.to_csv("../../Data/other_patients/y_train_total.csv")

other_patients_X_test.to_csv("../../Data/other_patients/X_test.csv")
other_patients_y_test.to_csv("../../Data/other_patients/y_test.csv")

other_patients_X_val.to_csv("../../Data/other_patients/X_val.csv")
other_patients_y_val.to_csv("../../Data/other_patients/y_val.csv")