# Data Rounding and Cleansing

In [1]:
import pandas as pd
import numpy as np

## 1. Load the sampled data

In [3]:
# Read the complete (zero-missings) dataset and discard the "Unnamed: 0"
# column (presumably a row index saved by an earlier export — confirm).
patients = (
    pd.read_csv('../../Data/complete_dataset_zero_missings.csv', sep=",")
    .drop(columns=["Unnamed: 0"])
)
patients.head()

Unnamed: 0,sex,age,ethnic,weight,height,BMI,waist,BMI_category,Waist_category,HIPX,...,calcium,dose_walk,dose_moderate,dose_vigorous,dose_pleasure,dose_sport,dose_execise,dose_lightDIY,dose_heavyDIY,Class
0,0,50,4,86.9,161.0,33.524941,88.0,3,8,0,...,384.586667,70,100,240,8,20,20,0,0,0
1,0,63,1,63.1,152.0,27.311288,86.0,2,7,0,...,1232.6425,360,120,0,8,0,6,0,0,0
2,0,56,1,54.0,163.0,20.324438,68.0,1,6,0,...,580.74,210,420,0,1,0,0,8,0,0
3,0,69,1,61.3,160.0,23.945312,86.0,1,7,0,...,502.64,120,60,20,8,0,8,0,0,0
4,0,64,1,86.4,158.0,34.609838,99.0,3,8,0,...,1388.29,840,1260,180,6,0,0,0,8,0


In [4]:
#patients = pd.read_csv("../Data/total_patients_balanced.csv",index_col=0, header=0)
#patients.dropna(inplace=True)
#patients.head()

## 2. Removing the unnecessary features

In [5]:
# Columns that are removed from the dataset before rounding.
unnecessary_features = [
    'ethnic',
    'BMI',
    'BMI_category',
    'Waist_category',
    'waist',
    'dose_pleasure',
    'dose_sport',
    'dose_execise',
    'dose_lightDIY',
    'dose_heavyDIY',
    'Alcohol24',
]

In [6]:
# Remove the unwanted columns from `patients` (modifies the frame in place).
patients.drop(columns=unnecessary_features, inplace=True)

In [7]:
# Show the frame's dimensions together with the names of the remaining columns.
(patients.shape, patients.columns.values)

((153884, 17),
 array(['sex', 'age', 'weight', 'height', 'HIPX', 'menopause', 'HRT',
        'smoking', 'ReumatoidArthritis', 'SecondaryOsteoporsis', 'Alcohol',
        'VitaminD', 'calcium', 'dose_walk', 'dose_moderate',
        'dose_vigorous', 'Class'], dtype=object))

In [8]:
# Renumber the rows from 0; the previous index is kept as an 'index' column,
# which a later cell removes again.
patients = patients.reset_index()

In [9]:
# Display the row labels of `patients` now that the index has been reset.
patients.index.values

array([     0,      1,      2, ..., 153881, 153882, 153883])

In [10]:
# Columns rounded to whole numbers.
integer_columns = ['Class','sex','age','HIPX','menopause', 'HRT','smoking', 'ReumatoidArthritis', 'SecondaryOsteoporsis']
# Columns rounded to two decimal places.
float_columns = ['weight', 'height', 'Alcohol','VitaminD', 'calcium', 'dose_walk', 'dose_moderate','dose_vigorous']

# Rounding is element-wise, so selecting the columns first gives the same
# values as rounding the whole frame and selecting afterwards.
patients_integer = patients[integer_columns].round(0)
patients_float = patients[float_columns].round(2)
# Not used: pd.concat([patients_integer, patients_float]) — with its default
# axis it would stack the two frames row-wise instead of side by side.

Put the two dataframes together, reorder the columns to match the original dataset, and then check that every value in the rounded dataframe equals the corresponding value in the original dataframe (rounded to two decimals), to ensure that no data has been changed.

In [11]:
# Put the two rounded halves side by side, matching rows on the index
# (how="left" is the default join type, spelled out here).
rounded_patients = patients_integer.join(patients_float, how="left")

In [12]:
# Remove the helper 'index' column again so `patients` has the same columns
# as `rounded_patients`.
patients.drop(columns='index', inplace=True)

In [13]:
# Sanity check: both frames must have the same number of rows and columns.
patients.shape == rounded_patients.shape

True

In [14]:
# Put the rounded frame's columns in the same order as in `patients`.
rounded_patients = rounded_patients.loc[:, list(patients.columns)]

In [15]:
# Sanity check: column names and their order must now match exactly.
list(patients.columns) == list(rounded_patients.columns)

True

In [16]:
# Report every cell in which the rounded frame differs from the original frame
# rounded to two decimals. Columns that belong to `patients_integer` are not
# reported, exactly as in the previous row-by-row version.
#
# Vectorised: a single array comparison replaces one .iloc lookup, one Series
# comparison and one dict conversion per row, which was very slow on a frame
# of this size.
integer_column_names = set(patients_integer.columns)
checked_columns = [
    column for column in rounded_patients.columns
    if column not in integer_column_names
]

# Compare positionally (plain NumPy arrays) so that row i of one frame is
# matched with row i of the other, as the .iloc-based loop did.
mismatch_mask = (
    rounded_patients[checked_columns].to_numpy()
    != patients[checked_columns].round(2).to_numpy()
)

# np.argwhere lists the True cells in row-major order (row by row, then column
# by column), so the messages appear in the same order as before.
for row_position, column_position in np.argwhere(mismatch_mask):
    print("For the patients with index: ",int(row_position)," there is a difference in: ", checked_columns[column_position])

## Randomize data order

In [17]:
# Shuffle the rows into a random order and renumber them from 0.
# A fixed seed makes the shuffled order — and therefore the exported CSV —
# identical on every Restart & Run All; change SHUFFLE_SEED for another order.
SHUFFLE_SEED = 42

print("Dataframe shape before sorting: ",rounded_patients.shape)
rounded_patients = rounded_patients.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
print("Dataframe shape after sorting: ",rounded_patients.shape)
rounded_patients.head()

Dataframe shape before sorting:  (153884, 17)
Dataframe shape after sorting:  (153884, 17)


Unnamed: 0,sex,age,weight,height,HIPX,menopause,HRT,smoking,ReumatoidArthritis,SecondaryOsteoporsis,Alcohol,VitaminD,calcium,dose_walk,dose_moderate,dose_vigorous,Class
0,0,52,81.7,174.0,0,1,0,0,0,0,36.48,0.91,776.77,280,0,0,0
1,1,66,82.6,175.0,0,0,0,0,0,0,17.64,1.74,784.64,240,720,240,0
2,0,51,75.1,160.0,0,0,0,1,0,0,9.76,10.69,920.13,1260,1680,180,0
3,0,41,74.7,157.0,0,0,0,0,0,0,15.66,0.36,611.05,900,0,0,0
4,0,51,69.4,175.0,0,1,0,1,0,1,31.32,1.04,1007.37,105,300,105,0


In [19]:
# Write the shuffled, rounded dataset to disk. index=True is the pandas
# default, spelled out here: the row index is written as the first column.
rounded_patients.to_csv("../../Data/rounded_patients.csv", index=True)