# Data Standardization

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

## 1. Load the Data

In [13]:
patients = pd.read_csv("../Data/rounded_patients.csv", index_col=0)

In [14]:
patients.head()

Unnamed: 0,sex,age,weight,height,HIPX,menopause,HRT,smoking,ReumatoidArthritis,SecondaryOsteoporsis,Alcohol,VitaminD,calcium,dose_walk,dose_moderate,dose_vigorous,Class
0,1,51,92.4,174.0,0,0,0,1,0,0,62.57,3.04,1337.31,600,60,240,0
1,0,50,57.8,165.0,0,0,0,1,0,0,6.71,0.9,634.81,630,240,0,0
2,1,50,80.3,169.0,0,0,0,0,0,0,51.03,1.33,698.31,100,100,120,0
3,0,64,69.8,164.0,0,1,0,1,0,0,0.0,0.5,841.06,1400,75,20,0
4,1,57,98.2,182.0,0,0,0,0,0,0,16.42,1.52,914.41,360,180,180,0


## 2. Standardize the patients

### 2.1 Remove the categorical variable from the Data

In [15]:
patients.columns

Index(['sex', 'age', 'weight', 'height', 'HIPX', 'menopause', 'HRT', 'smoking',
       'ReumatoidArthritis', 'SecondaryOsteoporsis', 'Alcohol', 'VitaminD',
       'calcium', 'dose_walk', 'dose_moderate', 'dose_vigorous', 'Class'],
      dtype='object')

In [5]:
categorical_features = ['sex','HIPX','menopause','HRT','smoking', 'ReumatoidArthritis', 
                        'SecondaryOsteoporsis', 'Alcohol24', 'Class']

In [6]:
std_patients = patients.drop(labels=categorical_features, axis=1)
std_patients.shape

KeyError: "labels ['Alcohol24'] not contained in axis"

In [16]:
std_patients = patients

### 2.2 Standardize the features

In [17]:
std_patients = pd.DataFrame(StandardScaler().fit_transform(std_patients.iloc[:,0:std_patients.shape[1]-1]), columns=std_patients.columns.values[:std_patients.shape[1]-1])

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


## 3. Append the classification column and all the categorical features

In [11]:
for feature in categorical_features:
    std_patients[feature] = patients[feature]

KeyError: 'Alcohol24'

In [18]:
std_patients['Class'] = patients['Class']

In [19]:
std_patients

Unnamed: 0,sex,age,weight,height,HIPX,menopause,HRT,smoking,ReumatoidArthritis,SecondaryOsteoporsis,Alcohol,VitaminD,calcium,dose_walk,dose_moderate,dose_vigorous,Class
0,0.971362,-0.592293,0.918368,0.410810,-0.037838,-0.695973,-0.423252,0.760768,-0.088950,-0.134216,2.075643,0.050259,0.877823,0.608973,-0.462525,0.942110,0
1,-1.029482,-0.716448,-1.287594,-0.568335,-0.037838,-0.695973,-0.423252,0.760768,-0.088950,-0.134216,-0.481413,-0.725861,-0.898098,0.675763,-0.014265,-0.538945,0
2,0.971362,-0.716448,0.146919,-0.133159,-0.037838,-0.695973,-0.423252,-0.808640,-0.088950,-0.134216,1.547386,-0.569912,-0.737570,-0.504194,-0.362912,0.201583,0
3,-1.029482,1.021712,-0.522520,-0.677128,-0.037838,1.436838,-0.423252,0.760768,-0.088950,-0.134216,-0.788571,-0.870930,-0.376697,2.390039,-0.425170,-0.415524,0
4,0.971362,0.152632,1.288154,1.281161,-0.037838,-0.695973,-0.423252,-0.808640,-0.088950,-0.134216,-0.036927,-0.501004,-0.191269,0.074653,-0.163685,0.571847,0
5,0.971362,-0.964756,3.156209,1.226764,-0.037838,-0.695973,-0.423252,-0.808640,-0.088950,-0.134216,1.356042,-0.029529,-0.053492,2.479093,1.181096,0.201583,0
6,-1.029482,-1.833836,-1.230213,-1.112304,-0.037838,-0.695973,-0.423252,-0.808640,-0.088950,-0.134216,-0.788571,-0.243506,-0.443588,3.681313,-0.163685,0.571847,0
7,-1.029482,0.773403,-0.235618,-0.350747,-0.037838,1.436838,-0.423252,0.760768,-0.088950,-0.134216,0.188292,0.195328,-0.739314,-0.058927,-0.462525,-0.415524,0
8,0.971362,-1.709681,1.434793,0.628398,-0.037838,-0.695973,-0.423252,-0.808640,-0.088950,-0.134216,-0.788571,-1.034133,-2.042474,-0.637774,-0.587042,-0.477234,0
9,0.971362,-1.833836,1.001251,1.607543,-0.037838,-0.695973,-0.423252,2.330176,-0.088950,-0.134216,0.998072,-0.580792,-1.344265,0.208233,-0.611945,-0.538945,0


## 4. Save to File

In [20]:
std_patients.to_csv("../Data/standardized_patients.csv")