# Dataset cleansing

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## 1. Load the (standardized) Data

In [2]:
patients = pd.read_csv("../../Data/standardized_patients.csv",index_col=0, header=0)
patients.dropna(inplace=True)
patients.head()

Unnamed: 0,sex,age,weight,height,HIPX,menopause,HRT,smoking,ReumatoidArthritis,SecondaryOsteoporsis,Alcohol,VitaminD,calcium,dose_walk,dose_moderate,dose_vigorous,Class
0,-1.029482,-0.840602,-0.656408,-0.024365,-0.037838,-0.695973,-0.423252,-0.80864,-0.08895,-0.134216,-0.275878,-0.428469,1.339588,0.675763,0.284575,0.94211,0
1,0.971362,-0.592293,0.408319,1.063574,-0.037838,-0.695973,-0.423252,0.760768,-0.08895,-0.134216,4.052715,2.958895,0.483531,0.675763,-0.088975,0.016451,0
2,-1.029482,1.021712,-0.229242,-1.438686,-0.037838,1.436838,-0.423252,-0.80864,-0.08895,-0.134216,-0.430144,-0.979732,1.280054,-0.448536,-0.462525,0.571847,0
3,-1.029482,0.649249,0.472076,-1.819464,-0.037838,1.436838,2.362657,0.760768,-0.08895,-0.134216,-0.788571,-1.001492,-0.190283,-0.726827,-0.437622,-0.168681,0
4,0.971362,-0.716448,1.122388,0.519604,-0.037838,-0.695973,-0.423252,0.760768,-0.08895,-0.134216,0.801693,-0.109317,-0.351898,-0.548721,-0.512332,0.078161,0


In [3]:
patients = patients.sample(frac=1).reset_index(drop=True)

## 2. Create the two dataset from the t-SNE analysis

### 2.1 Women in menopause condition

In [4]:
women_menopause = patients[patients['menopause']>1]

In [5]:
women_menopause.drop(columns=['sex','menopause'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [6]:
women_menopause.head()

Unnamed: 0,age,weight,height,HIPX,HRT,smoking,ReumatoidArthritis,SecondaryOsteoporsis,Alcohol,VitaminD,calcium,dose_walk,dose_moderate,dose_vigorous,Class
5,0.40094,1.562305,-0.459541,-0.037838,-0.423252,-0.80864,-0.08895,-0.134216,-0.401763,0.558001,-0.743991,-0.459667,-0.537235,-0.538945,0
8,1.518328,-1.957033,-1.547479,-0.037838,2.362657,-0.80864,-0.08895,-0.134216,0.96374,0.619655,1.558791,3.948473,0.135155,-0.168681,0
11,-0.343985,-1.109077,-1.00351,-0.037838,2.362657,2.330176,-0.08895,-0.134216,-0.788571,0.043005,-0.274591,-0.570984,-0.313105,0.695268,0
12,1.145866,-0.146359,-1.221098,-0.037838,-0.423252,-0.80864,-0.08895,-0.134216,-0.440215,-0.145584,-0.332457,-0.415141,-0.088975,-0.538945,0
14,1.394174,-1.153706,0.41081,-0.037838,2.362657,0.760768,-0.08895,-0.134216,-0.259857,-0.667833,-0.300554,0.208233,3.571818,-0.538945,0


#### Create the X and y

In [7]:
women_X = women_menopause.copy()
women_X.drop("Class", axis=1, inplace=True)
print(women_X.shape)

women_y = women_menopause[['Class']].copy()
print(women_y.shape)

(50215, 14)
(50215, 1)


#### Split the data into Train, Test and Validation

The ratio in the splitting, in percentage, is 60/20/20

In [8]:
women_X_train_and_val, women_X_test, women_y_train_and_val, women_y_test = train_test_split(women_X, women_y, test_size=0.2, random_state=42)
women_X_train, women_X_val, women_y_train, women_y_val = train_test_split(women_X_train_and_val, women_y_train_and_val, test_size=0.25, random_state=42)

In [9]:
women_y_val[women_y_val['Class']==1].shape

(25, 1)

In [10]:
women_y_train[women_y_train['Class']==1].shape

(64, 1)

In [11]:
women_y_test[women_y_test['Class']==1].shape

(39, 1)

#### Save to Files

In [12]:
patients.to_csv("../../Data/women_menopause/standardized_patients.csv")

women_X_train.to_csv("../../Data/women_menopause/X_train_total.csv")
women_y_train.to_csv("../../Data/women_menopause/y_train_total.csv")

women_X_test.to_csv("../../Data/women_menopause/X_test.csv")
women_y_test.to_csv("../../Data/women_menopause/y_test.csv")

women_X_val.to_csv("../../Data/women_menopause/X_val.csv")
women_y_val.to_csv("../../Data/women_menopause/y_val.csv")

### 2.2 Other patients

In [13]:
other_patients = patients[patients['menopause']<1]

In [14]:
other_patients.drop(columns=['menopause','HRT'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [15]:
other_patients.head()

Unnamed: 0,sex,age,weight,height,HIPX,smoking,ReumatoidArthritis,SecondaryOsteoporsis,Alcohol,VitaminD,calcium,dose_walk,dose_moderate,dose_vigorous,Class
0,-1.029482,-0.840602,2.136111,-0.350747,-0.037838,-0.80864,-0.08895,-0.134216,-0.484617,0.960568,-0.537706,-0.593247,-0.537235,-0.538945,0
1,0.971362,0.40094,0.720724,0.302016,-0.037838,2.330176,-0.08895,-0.134216,0.452878,-0.254386,0.105317,0.208233,-0.014265,0.849545,0
2,0.971362,-0.716448,-0.267496,-0.133159,-0.037838,-0.80864,-0.08895,-0.134216,-0.239257,0.873526,0.342266,-0.526457,-0.462525,-0.477234,0
3,0.971362,-0.468139,0.561334,2.260306,-0.037838,-0.80864,-0.08895,-0.134216,-0.788571,-0.468363,-0.94345,-0.392877,0.433996,1.003821,0
4,0.971362,-1.337219,-0.548023,0.519604,-0.037838,-0.80864,-0.08895,-0.134216,-0.341796,-0.504631,1.16872,-0.459667,-0.57459,-0.261247,0


#### Create the X and y

In [16]:
other_patients_X = other_patients.copy()
other_patients_X.drop("Class", axis=1, inplace=True)
print(other_patients.shape)

other_patients_y = other_patients[['Class']].copy()
print(other_patients_y.shape)

(103669, 15)
(103669, 1)


#### Split the data into Train, Test and Validation

The ratio in the splitting, in percentage, is 60/20/20

In [17]:
other_patients_X_train_and_val, other_patients_X_test, other_patients_y_train_and_val, other_patients_y_test = train_test_split(other_patients_X, other_patients_y, test_size=0.2, random_state=42)
other_patients_X_train, other_patients_X_val, other_patients_y_train, other_patients_y_val = train_test_split(other_patients_X_train_and_val, other_patients_y_train_and_val, test_size=0.25, random_state=42)

In [18]:
other_patients_y_val[other_patients_y_val['Class']==1].shape

(21, 1)

In [19]:
other_patients_y_train[other_patients_y_train['Class']==1].shape

(94, 1)

In [20]:
other_patients_y_test[other_patients_y_test['Class']==1].shape

(35, 1)

#### Save to Files

In [21]:
patients.to_csv("../../Data/other_patients/standardized_patients.csv")

other_patients_X_train.to_csv("../../Data/other_patients/X_train_total.csv")
other_patients_y_train.to_csv("../../Data/other_patients/y_train_total.csv")

other_patients_X_test.to_csv("../../Data/other_patients/X_test.csv")
other_patients_y_test.to_csv("../../Data/other_patients/y_test.csv")

other_patients_X_val.to_csv("../../Data/other_patients/X_val.csv")
other_patients_y_val.to_csv("../../Data/other_patients/y_val.csv")