# Dataset cleansing

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## 1. Load the (standardized) Data

In [2]:
patients = pd.read_csv("../../Data/standardized_patients.csv",index_col=0, header=0)
patients.dropna(inplace=True)
patients.head()

Unnamed: 0,sex,age,weight,height,HIPX,menopause,HRT,smoking,ReumatoidArthritis,SecondaryOsteoporsis,Alcohol,VitaminD,calcium,dose_walk,dose_moderate,dose_vigorous,Class
0,-1.029482,-0.840602,-0.656408,-0.024365,-0.037838,-0.695973,-0.423252,-0.80864,-0.08895,-0.134216,-0.275878,-0.428469,1.339588,0.675763,0.284575,0.94211,0
1,0.971362,-0.592293,0.408319,1.063574,-0.037838,-0.695973,-0.423252,0.760768,-0.08895,-0.134216,4.052715,2.958895,0.483531,0.675763,-0.088975,0.016451,0
2,-1.029482,1.021712,-0.229242,-1.438686,-0.037838,1.436838,-0.423252,-0.80864,-0.08895,-0.134216,-0.430144,-0.979732,1.280054,-0.448536,-0.462525,0.571847,0
3,-1.029482,0.649249,0.472076,-1.819464,-0.037838,1.436838,2.362657,0.760768,-0.08895,-0.134216,-0.788571,-1.001492,-0.190283,-0.726827,-0.437622,-0.168681,0
4,0.971362,-0.716448,1.122388,0.519604,-0.037838,-0.695973,-0.423252,0.760768,-0.08895,-0.134216,0.801693,-0.109317,-0.351898,-0.548721,-0.512332,0.078161,0


In [3]:
patients = patients.sample(frac=1).reset_index(drop=True)

## 2. Create the Three dataset possible: Men, Women not in Menopause condition, Women in Menopause condition

### 2.1 Men

In [4]:
male_patients = patients[patients['sex']>0]

In [5]:
male_patients.drop(columns=['menopause','HRT','sex'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [6]:
male_patients.head()

Unnamed: 0,age,weight,height,HIPX,smoking,ReumatoidArthritis,SecondaryOsteoporsis,Alcohol,VitaminD,calcium,dose_walk,dose_moderate,dose_vigorous,Class
1,0.897557,-0.076227,0.269378,-0.037838,-0.80864,-0.08895,-0.134216,1.444848,1.160038,0.452512,1.143293,0.508706,1.312374,0
2,0.897557,1.128764,1.281161,-0.037838,-0.80864,-0.08895,-0.134216,1.256708,-0.769382,-0.837679,-0.259297,-0.014265,-0.538945,0
5,0.028478,1.03313,1.607543,-0.037838,0.760768,-0.08895,-0.134216,-0.019532,-0.729488,-0.380591,-0.726827,-0.462525,-0.168681,0
10,1.394174,0.746227,1.172367,-0.037838,0.760768,-0.08895,-0.134216,-0.108795,-0.189105,0.215285,-0.593247,-0.611945,-0.538945,0
11,1.394174,2.971316,1.498749,-0.037838,0.760768,-0.08895,-0.134216,-0.788571,0.431065,1.194076,-0.493062,-0.611945,-0.538945,0


#### Create the X and y

In [7]:
male_patients_X = male_patients.copy()
male_patients_X.drop("Class", axis=1, inplace=True)
print(male_patients.shape)

male_patients_y = male_patients[['Class']].copy()
print(male_patients_y.shape)

(79177, 14)
(79177, 1)


#### Split the data into Train, Test and Validation

The ratio in the splitting, in percentage, is 60/20/20

In [8]:
male_patients_X_train_and_val, male_patients_X_test, male_patients_y_train_and_val, male_patients_y_test = train_test_split(male_patients_X, male_patients_y, test_size=0.2, random_state=42)
male_patients_X_train, male_patients_X_val, male_patients_y_train, male_patients_y_val = train_test_split(male_patients_X_train_and_val, male_patients_y_train_and_val, test_size=0.25, random_state=42)

In [9]:
male_patients_y_val[male_patients_y_val['Class']==1].shape

(27, 1)

In [10]:
male_patients_y_train[male_patients_y_train['Class']==1].shape

(79, 1)

In [11]:
male_patients_y_test[male_patients_y_test['Class']==1].shape

(18, 1)

#### Save to Files

In [12]:
patients.to_csv("../../Data/male_patients/standardized_patients.csv")

male_patients_X_train.to_csv("../../Data/male_patients/X_train_total.csv")
male_patients_y_train.to_csv("../../Data/male_patients/y_train_total.csv")

male_patients_X_test.to_csv("../../Data/male_patients/X_test.csv")
male_patients_y_test.to_csv("../../Data/male_patients/y_test.csv")

male_patients_X_val.to_csv("../../Data/male_patients/X_val.csv")
male_patients_y_val.to_csv("../../Data/male_patients/y_val.csv")

### 2.2 Female patients not in Menopause condition

In [13]:
female_patients_no_menopause = patients[patients['menopause']<1]
female_patients_no_menopause = female_patients_no_menopause[female_patients_no_menopause['sex']<0]

In [14]:
female_patients_no_menopause.drop(columns=['menopause','HRT','sex'], axis=1, inplace=True)

In [15]:
female_patients_no_menopause.head()

Unnamed: 0,age,weight,height,HIPX,smoking,ReumatoidArthritis,SecondaryOsteoporsis,Alcohol,VitaminD,calcium,dose_walk,dose_moderate,dose_vigorous,Class
3,-1.585527,-0.82855,0.628398,-0.037838,0.760768,-0.08895,-0.134216,0.645138,-0.631566,1.492406,-0.671169,-0.611945,-0.538945,0
7,-0.592293,-0.943311,-0.785922,-0.037838,-0.80864,-0.08895,-0.134216,-0.788571,-0.210866,-0.465506,-0.526457,-0.42517,-0.538945,0
8,-1.213064,-0.924184,-0.133159,-0.037838,0.760768,-0.08895,-0.134216,-0.651243,-0.015022,-0.119296,-0.326087,-0.35046,-0.538945,0
28,-1.461373,0.982125,-0.785922,-0.037838,-0.80864,-0.08895,-0.134216,-0.788571,0.140927,-0.556868,0.876133,-0.611945,-0.168681,0
31,-1.585527,-0.924184,-2.309037,-0.037838,-0.80864,-0.08895,-0.134216,-0.788571,-0.646073,-0.94709,-0.504194,-0.412718,0.571847,0


#### Create the X and y

In [16]:
female_patients_no_menopause_X = female_patients_no_menopause.copy()
female_patients_no_menopause_X.drop("Class", axis=1, inplace=True)
print(female_patients_no_menopause.shape)

female_patients_no_menopause_y = female_patients_no_menopause[['Class']].copy()
print(female_patients_no_menopause_y.shape)

(24492, 14)
(24492, 1)


#### Split the data into Train, Test and Validation

The ratio in the splitting, in percentage, is 60/20/20

In [46]:
female_patients_no_menopause_X_train_and_val, female_patients_no_menopause_X_test, female_patients_no_menopause_y_train_and_val, female_patients_no_menopause_y_test = train_test_split(female_patients_no_menopause_X, female_patients_no_menopause_y, test_size=0.2, random_state=16)
female_patients_no_menopause_X_train, female_patients_no_menopause_X_val, female_patients_no_menopause_y_train, female_patients_no_menopause_y_val = train_test_split(female_patients_no_menopause_X_train_and_val, female_patients_no_menopause_y_train_and_val, test_size=0.25, random_state=16)

In [47]:
female_patients_no_menopause_y_val[female_patients_no_menopause_y_val['Class']==1].shape

(4, 1)

In [48]:
female_patients_no_menopause_y_train[female_patients_no_menopause_y_train['Class']==1].shape

(16, 1)

In [49]:
female_patients_no_menopause_y_test[female_patients_no_menopause_y_test['Class']==1].shape

(6, 1)

#### Save to Files

In [50]:
patients.to_csv("../../Data/female_patients_no_menopause/standardized_patients.csv")

female_patients_no_menopause_X_train.to_csv("../../Data/female_patients_no_menopause/X_train_total.csv")
female_patients_no_menopause_y_train.to_csv("../../Data/female_patients_no_menopause/y_train_total.csv")

female_patients_no_menopause_X_test.to_csv("../../Data/female_patients_no_menopause/X_test.csv")
female_patients_no_menopause_y_test.to_csv("../../Data/female_patients_no_menopause/y_test.csv")

female_patients_no_menopause_X_val.to_csv("../../Data/female_patients_no_menopause/X_val.csv")
female_patients_no_menopause_y_val.to_csv("../../Data/female_patients_no_menopause/y_val.csv")

### 2.3 Women in menopause condition

I'll use the same from the splitting through t-SNE