In [53]:
import pandas as pd
import numpy as npe
import seaborn as sns 
import matplotlib as plt

In [54]:
# Read the raw heart-disease CSV from the local Downloads folder and preview
# the first rows. NOTE(review): the absolute Windows path ties this notebook to
# one machine; consider a configurable data directory.
df = pd.read_csv(
    filepath_or_buffer="C:\\Users\\hksid\\Downloads\\Python 3\\Files\\Heart_disease.csv"
)
df.head(n=5)

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [55]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

male                 int64
age                  int64
education          float64
currentSmoker        int64
cigsPerDay         float64
BPMeds             float64
prevalentStroke      int64
prevalentHyp         int64
diabetes             int64
totChol            float64
sysBP              float64
diaBP              float64
BMI                float64
heartRate          float64
glucose            float64
TenYearCHD           int64
dtype: object

Renaming the `male` column to `gender`: 0 means female, 1 means male.

In [56]:
# Give the sex indicator a clearer column name; the 0/1 values are untouched.
df = df.rename(columns={'male': 'gender'})
df.head(n=5)

Unnamed: 0,gender,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


Now finding the missing values.

We have 105 missing values in education. We will replace the missing ones with a new category, 5, meaning "unspecified".

In [57]:
# Count the missing entries in every column.
df.isna().sum()

gender               0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [58]:
# Missing education levels get the extra code 5.
df['education'] = df['education'].fillna(value=5)

Now missing cigsPerDay values will be replaced by the column mean. The reasoning is that people are often reluctant to disclose unhealthy habits. The same is done for BMI. For BPMeds, missing values are replaced with 0, which means "No BPMeds".

In [59]:
def fillmissing_numeric(table, col_name):
    """Fill the missing values of a numeric column with that column's mean.

    The column is cast to ``float64`` and every NaN is then replaced by the
    mean of the non-missing values. ``table`` is modified in place.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame holding the column to impute; it is mutated.
    col_name : str
        Label of the numeric column to impute.

    Returns
    -------
    None
    """
    column = table[col_name].astype('float64')
    # Assign the filled Series back rather than calling
    # ``table[col_name].fillna(..., inplace=True)``: that chained in-place call
    # operates on a temporary Series, which raises a FutureWarning on
    # pandas 2.x and leaves ``table`` unchanged under Copy-on-Write.
    table[col_name] = column.fillna(column.mean())
# Mean-impute cigsPerDay and BMI, then treat unknown BPMeds as 0.
fillmissing_numeric(table=df, col_name='cigsPerDay')
fillmissing_numeric(table=df, col_name='BMI')
df['BPMeds'] = df['BPMeds'].fillna(value=0)

# Re-check how many gaps remain per column.
df.isna().sum()

gender               0
age                  0
education            0
currentSmoker        0
cigsPerDay           0
BPMeds               0
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                  0
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

Missing totChol and heartRate values will be replaced with typical values for a healthy person (170 for total cholesterol and 72 for heart rate).

In [60]:
# Fill both columns in one pass using a per-column mapping of default values;
# columns not named in the mapping are left as they are.
df = df.fillna(value={'totChol': 170, 'heartRate': 72})
df.isna().sum()

gender               0
age                  0
education            0
currentSmoker        0
cigsPerDay           0
BPMeds               0
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol              0
sysBP                0
diaBP                0
BMI                  0
heartRate            0
glucose            388
TenYearCHD           0
dtype: int64

For glucose, missing values are filled with 70, taken as the minimum of the normal range.

In [61]:
# Replace missing glucose readings with 70.
df['glucose'] = df['glucose'].fillna(value=70)

In [62]:
# Final check: every column should now report zero missing values.
df.isna().sum()

gender             0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

Now we have no missing values.

Now saving the unscaled data with no missing values. Other notebooks will load this file as the "unscaled" dataset.

In [63]:
# Persist the imputed, unscaled frame without writing the row index.
df.to_csv(
    path_or_buf=r'C:\Users\hksid\Downloads\Python 3\Files\HD_nomissing.csv',
    index=False,
)

Now separating the numeric columns from the boolean/categorical ones. This is needed in the visualization part.

In [64]:
# Indicator/categorical columns together with the TenYearCHD target.
df_bool_tg = df.loc[
    :,
    [
        'gender',
        'education',
        'currentSmoker',
        'BPMeds',
        'prevalentStroke',
        'prevalentHyp',
        'diabetes',
        'TenYearCHD',
    ],
].copy()

# The same indicator/categorical columns, without the target.
df_bool = df_bool_tg.drop(columns='TenYearCHD')

# Numeric measurement columns.
df_numerics = df.loc[
    :,
    [
        'age',
        'cigsPerDay',
        'totChol',
        'sysBP',
        'diaBP',
        'BMI',
        'heartRate',
        'glucose',
    ],
].copy()


Creating a backup dataframe. 

In [12]:
# Independent copy of the cleaned data: indicator/categorical columns first,
# then the numeric columns, then the target.
df_back = df.loc[
    :,
    [
        'gender',
        'education',
        'currentSmoker',
        'BPMeds',
        'prevalentStroke',
        'prevalentHyp',
        'diabetes',
        'age',
        'cigsPerDay',
        'totChol',
        'sysBP',
        'diaBP',
        'BMI',
        'heartRate',
        'glucose',
        'TenYearCHD',
    ],
].copy()



Scaling the numeric data using scikit-learn's StandardScaler.

In [73]:
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    StandardScaler,
)

# Fit the scaler on the numeric columns, then transform those same columns.
# The result is a plain array, so column labels are not carried over.
scale = StandardScaler()
scaled_fitted = scale.fit(df_numerics).transform(df_numerics)


Building the scaled numeric dataframe.

In [66]:
# Wrap the scaled array back into a DataFrame with the original column labels.
# The original row index is carried over as well, so a later column-wise
# pd.concat with frames derived from `df` aligns row-for-row even if `df` does
# not have a default 0..n-1 index (e.g. after rows were filtered or dropped).
df_scaled_numerics = pd.DataFrame(
    scaled_fitted,
    columns=df_numerics.columns,
    index=df_numerics.index,
)

In [None]:
# Now combining the two dataframes (scaled numerics + boolean/target columns):

In [67]:
# Place the scaled numeric columns and the indicator/target columns side by
# side; rows are matched on their index labels.
new_df = pd.concat(objs=[df_scaled_numerics, df_bool_tg], axis='columns')

In [68]:
# Preview the first five rows of the combined frame.
new_df.head(n=5)

Unnamed: 0,age,cigsPerDay,totChol,sysBP,diaBP,BMI,heartRate,glucose,gender,education,currentSmoker,BPMeds,prevalentStroke,prevalentHyp,diabetes,TenYearCHD
0,-1.234951,-0.757974,-0.91162,-1.195907,-1.082625,0.286943,0.342816,-0.167632,1,4.0,0,0.0,0,0,0,0
1,-0.418257,-0.757974,0.313245,-0.515187,-0.158988,0.719325,1.590331,-0.210935,0,2.0,0,0.0,0,0,0,0
2,-0.184916,0.925835,0.201893,-0.220209,-0.242955,-0.113502,-0.073022,-0.470751,1,1.0,1,0.0,0,0,0,0
3,1.3318,1.76774,-0.243512,0.800871,1.016549,0.682474,-0.904698,0.958239,0,3.0,1,0.0,0,1,0,1
4,-0.418257,1.178407,1.092704,-0.106755,0.092912,-0.663807,0.758655,0.17879,0,3.0,1,0.0,0,0,0,0


In [69]:
# Persist the scaled frame without writing the row index.
# The path is a raw string, so backslashes must NOT be doubled: the previous
# r'C:\\Users\\...' form produced literal double backslashes in the path and
# was inconsistent with how the unscaled file is saved.
new_df.to_csv(
    r'C:\Users\hksid\Downloads\Python 3\Files\df_scaled.csv',
    index=False,
)