In [89]:
import pandas as pd

In [90]:
df = pd.read_csv('heart_attack_prediction_dataset.csv')
df

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8758,MSV9918,60,Male,121,94/76,61,1,1,1,0,...,10.806373,235420,19.655895,67,7,7,Thailand,Asia,Northern Hemisphere,0
8759,QSV6764,28,Female,120,157/102,73,1,0,0,1,...,3.833038,217881,23.993866,617,4,9,Canada,North America,Northern Hemisphere,0
8760,XKA5925,47,Male,250,161/75,105,0,1,1,1,...,2.375214,36998,35.406146,527,4,4,Brazil,South America,Southern Hemisphere,1
8761,EPE6801,36,Male,178,119/67,60,1,0,1,0,...,0.029104,209943,27.294020,114,2,8,Brazil,South America,Southern Hemisphere,0


### Data Cleaning

In [91]:
df = df.drop(['Patient ID'], axis = 1)
df

Unnamed: 0,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,67,Male,208,158/88,72,0,0,1,0,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,21,Male,389,165/93,98,1,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,21,Female,324,174/99,72,1,0,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,84,Male,383,163/100,73,1,1,1,0,1,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,66,Male,318,91/88,93,1,1,1,1,0,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8758,60,Male,121,94/76,61,1,1,1,0,1,...,10.806373,235420,19.655895,67,7,7,Thailand,Asia,Northern Hemisphere,0
8759,28,Female,120,157/102,73,1,0,0,1,0,...,3.833038,217881,23.993866,617,4,9,Canada,North America,Northern Hemisphere,0
8760,47,Male,250,161/75,105,0,1,1,1,1,...,2.375214,36998,35.406146,527,4,4,Brazil,South America,Southern Hemisphere,1
8761,36,Male,178,119/67,60,1,0,1,0,0,...,0.029104,209943,27.294020,114,2,8,Brazil,South America,Southern Hemisphere,0


In [92]:
df.isna().sum()

Age                                0
Sex                                0
Cholesterol                        0
Blood Pressure                     0
Heart Rate                         0
Diabetes                           0
Family History                     0
Smoking                            0
Obesity                            0
Alcohol Consumption                0
Exercise Hours Per Week            0
Diet                               0
Previous Heart Problems            0
Medication Use                     0
Stress Level                       0
Sedentary Hours Per Day            0
Income                             0
BMI                                0
Triglycerides                      0
Physical Activity Days Per Week    0
Sleep Hours Per Day                0
Country                            0
Continent                          0
Hemisphere                         0
Heart Attack Risk                  0
dtype: int64

In [93]:
df.duplicated().sum()

0

### Transformation

##### Normalizing the columns - Age, Cholesterol, Heart Rate, Sedentary Hours Per Day, Income, Triglycerides (Z Score normalization)

In [94]:
columns = ['Age', 'Cholesterol', 'Heart Rate', 'Sedentary Hours Per Day', 'Income', 'Triglycerides']
for column in columns:
    i = df.columns.get_loc(column)
    mean = df[column].values.mean()
    stddev = df[column].values.std()
    for row in range(0, df[column].size):
        df.iloc[row, i] = (df.iloc[row, i] - mean) / stddev

  df.iloc[row, i] = (df.iloc[row, i] - mean) / stddev
  df.iloc[row, i] = (df.iloc[row, i] - mean) / stddev
  df.iloc[row, i] = (df.iloc[row, i] - mean) / stddev
  df.iloc[row, i] = (df.iloc[row, i] - mean) / stddev
  df.iloc[row, i] = (df.iloc[row, i] - mean) / stddev


#### Binary Encoding of Sex and Hemisphere


In [95]:
columns = ['Sex', 'Hemisphere']
for column in columns:
    i = df.columns.get_loc(column)
    for row in range(0, df[column].size):
        value = df.iloc[row, i]
        if value == "Male" or value == "Northern Hemisphere":
            df.iloc[row, i] = 1
        else:
            df.iloc[row, i] = 0

#### Encoding Diet, Country, Continent

In [96]:
columns = ['Diet', 'Country', 'Continent']
dicts = []
for column in columns:
    vocabulary = {}
    words = set(df[column])
    for i, word in enumerate(words):
        vocabulary[word] = i
    dicts.append(vocabulary)

for i, column in enumerate(columns):
    vocabulary = dicts[i]
    col_ind = df.columns.get_loc(column)
    for row in range(0, df[column].size):
        df.iloc[row, col_ind] = vocabulary[df.iloc[row, col_ind]]

#### Splitting Blodd pressure into two columns Higher and Lower

In [97]:
df[['Upper Blood Pressure', 'Lower Blood Pressure']] = df['Blood Pressure'].str.split('/', expand=True)
df['Upper Blood Pressure'] = pd.to_numeric(df['Upper Blood Pressure'])
df['Lower Blood Pressure'] = pd.to_numeric(df['Lower Blood Pressure'])
df = df.drop(columns=['Blood Pressure'])
df.insert(4, 'Upper Blood Pressure', df.pop('Upper Blood Pressure'))
df.insert(5, 'Lower Blood Pressure', df.pop('Lower Blood Pressure'))

In [99]:
columns = ['Upper Blood Pressure', 'Lower Blood Pressure']
for column in columns:
    i = df.columns.get_loc(column)
    mean = df[column].values.mean()
    stddev = df[column].values.std()
    for row in range(0, df[column].size):
        df.iloc[row, i] = (df.iloc[row, i] - mean) / stddev

  df.iloc[row, i] = (df.iloc[row, i] - mean) / stddev
  df.iloc[row, i] = (df.iloc[row, i] - mean) / stddev


### Splitting and Saving

In [100]:
m, n = df.shape
split = int(0.7 * m)
train = df[:split]
test = df[split:]
train.to_csv('train.csv', index = False)
test.to_csv('test.csv', index = False)