In [60]:
# Package imports
import pandas as pd 
import numpy as np

In [61]:
# Read the train and test CSV files (paths are relative to the notebook's working directory)
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [62]:
# Number of columns in the training frame
train.shape[1]

26

In [63]:
# Count missing values per column
train.isna().sum()

Patient ID                         0
Age                                0
Sex                                0
Cholesterol                        0
Blood Pressure                     0
Heart Rate                         0
Diabetes                           0
Family History                     0
Smoking                            0
Obesity                            0
Alcohol Consumption                0
Exercise Hours Per Week            0
Diet                               0
Previous Heart Problems            0
Medication Use                     0
Stress Level                       0
Sedentary Hours Per Day            0
Income                             0
BMI                                0
Triglycerides                      0
Physical Activity Days Per Week    0
Sleep Hours Per Day                0
Country                            0
Continent                          0
Hemisphere                         0
Heart Attack Risk                  0
dtype: int64

In [64]:
# Look at the raw Diet labels before encoding them
train['Diet']

0       Unhealthy
1         Average
2         Average
3         Healthy
4         Healthy
          ...    
7005    Unhealthy
7006      Healthy
7007      Average
7008    Unhealthy
7009      Average
Name: Diet, Length: 7010, dtype: object

In [65]:
# Show each column's dtype; the `object` columns are the ones that still hold text
train.dtypes

Patient ID                          object
Age                                  int64
Sex                                 object
Cholesterol                          int64
Blood Pressure                      object
Heart Rate                           int64
Diabetes                             int64
Family History                       int64
Smoking                              int64
Obesity                              int64
Alcohol Consumption                  int64
Exercise Hours Per Week            float64
Diet                                object
Previous Heart Problems              int64
Medication Use                       int64
Stress Level                         int64
Sedentary Hours Per Day            float64
Income                               int64
BMI                                float64
Triglycerides                        int64
Physical Activity Days Per Week      int64
Sleep Hours Per Day                  int64
Country                             object
Continent  

In [66]:
# Encode the text columns of `train` as integer codes.
# Series.map returns NaN for any value that is not a key of the mapping, so mapping a
# column that has already been encoded would wipe it out. Each column is therefore
# mapped only while it is still non-numeric, which makes this cell safe to re-run.
train_encodings = {
    # Continent codes are arbitrary labels, not a meaningful order
    'Continent': {'Asia': 0, 'Europe': 1, 'Africa': 2, 'South America': 3,
                  'Oceania': 4, 'North America': 5, 'Other': 6},
    'Hemisphere': {'Northern': 0, 'Southern': 1},
    'Sex': {'Male': 0, 'Female': 1},
    'Diet': {'Unhealthy': 0, 'Average': 1, 'Healthy': 2},
}
for column, codes in train_encodings.items():
    if not pd.api.types.is_numeric_dtype(train[column]):
        train[column] = train[column].map(codes)

# These columns are floats; convert them to integers.
# astype(int) truncates, i.e. the fractional part is dropped rather than rounded.
for column in ['Exercise Hours Per Week', 'Sedentary Hours Per Day', 'BMI']:
    train[column] = train[column].astype(int)

In [67]:
# Apply the same encoding to `test` that was applied to `train`.
# Series.map returns NaN for any value that is not a key of the mapping, so mapping a
# column that has already been encoded would wipe it out. Each column is therefore
# mapped only while it is still non-numeric, which makes this cell safe to re-run.
test_encodings = {
    # Continent codes are arbitrary labels, not a meaningful order
    'Continent': {'Asia': 0, 'Europe': 1, 'Africa': 2, 'South America': 3,
                  'Oceania': 4, 'North America': 5, 'Other': 6},
    'Hemisphere': {'Northern': 0, 'Southern': 1},
    'Sex': {'Male': 0, 'Female': 1},
    'Diet': {'Unhealthy': 0, 'Average': 1, 'Healthy': 2},
}
for column, codes in test_encodings.items():
    if not pd.api.types.is_numeric_dtype(test[column]):
        test[column] = test[column].map(codes)

# Convert the float columns to integers (astype(int) drops the fractional part).
for column in ['Exercise Hours Per Week', 'Sedentary Hours Per Day', 'BMI']:
    test[column] = test[column].astype(int)

In [68]:
# Blood Pressure is a text column; split it on '/' into two parts and store them as
# the numeric columns Systolic and Diastolic so they can be used as model features.
train[['Systolic', 'Diastolic']] = (
    train['Blood Pressure']
    .str.split('/', expand=True)
    .apply(pd.to_numeric)
)

In [69]:
# Same Blood Pressure split for the test data: break the text on '/' and store the
# two parts as the numeric columns Systolic and Diastolic.
test[['Systolic', 'Diastolic']] = (
    test['Blood Pressure']
    .str.split('/', expand=True)
    .apply(pd.to_numeric)
)

In [70]:
# Re-check the dtypes now that the encoding and blood-pressure cells have run
train.dtypes

Patient ID                          object
Age                                  int64
Sex                                  int64
Cholesterol                          int64
Blood Pressure                      object
Heart Rate                           int64
Diabetes                             int64
Family History                       int64
Smoking                              int64
Obesity                              int64
Alcohol Consumption                  int64
Exercise Hours Per Week              int64
Diet                                 int64
Previous Heart Problems              int64
Medication Use                       int64
Stress Level                         int64
Sedentary Hours Per Day              int64
Income                               int64
BMI                                  int64
Triglycerides                        int64
Physical Activity Days Per Week      int64
Sleep Hours Per Day                  int64
Country                             object
Continent  

In [71]:
# Columns fed to the model, in the order the classifier will see them.
# Blood Pressure is represented by the derived Systolic/Diastolic columns; Patient ID,
# Country, Continent, Income, BMI and Alcohol Consumption are not used.
features = [
    'Age',
    'Sex',
    'Cholesterol',
    'Heart Rate',
    'Family History',
    'Smoking',
    'Obesity',
    'Exercise Hours Per Week',
    'Diet',
    'Previous Heart Problems',
    'Medication Use',
    'Stress Level',
    'Sedentary Hours Per Day',
    'Triglycerides',
    'Physical Activity Days Per Week',
    'Sleep Hours Per Day',
    'Systolic',
    'Diastolic',
    'Diabetes',
    'Hemisphere',
]

# Feature matrix and target vector taken from the training frame
train_y = train['Heart Attack Risk']
train_X = train[features]

# Display how many features were selected
len(features)

20

In [72]:
# Continent and Hemisphere in `test` were already converted to integer codes by the
# earlier test-preprocessing cell. Mapping them again here would look up those integer
# codes in the string-keyed dictionaries, and Series.map returns NaN for every missing
# key. Both columns would become all-NaN, including Hemisphere, which is one of the
# model features. So the columns are NOT re-mapped in this cell.
# TODO: the continent codes are arbitrary; number them in an intentional order
# (or one-hot encode them) before using Continent as a feature.

# Select the same feature columns from the test data that were used for training
test_X = test[features]

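The cell below is my evaluation setup: it holds out part of the training data, fits the model on the rest, and reports accuracy on the held-out part.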

In [73]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.2, random_state=1)

# warm_start=True was removed: the estimator is built from scratch every time this cell
# runs, so there is no previous fit to reuse, and leaving it on would make any later
# c.fit(...) call keep the existing trees instead of refitting. The first fit, and
# therefore the predictions below, are unchanged.
c = RandomForestClassifier(
    random_state=5,
    n_jobs=-1,
    max_depth=8,
    criterion='gini',
    min_samples_split=10,
    verbose=1,
    # NOTE(review): an integer max_samples is a row count, so each tree is fit on only
    # 24 sampled rows. Confirm this is intended rather than a fraction such as 0.24.
    max_samples=24,
)
c.fit(X_train, y_train)

# Score the model on the held-out rows
predictions = c.predict(X_test)

acc = accuracy_score(y_test, predictions)
print(f"Accuracy: {acc}")
# Best hold-out accuracy seen so far: 0.6283880171184023
# NOTE(review): the hyperparameters were chosen by looking at this same hold-out split,
# so the figure is optimistic; compare it with the majority-class rate of y_test.

Accuracy: 0.6283880171184023


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


In [74]:
# Disabled submission cell. When uncommented it fits a forest on the full training
# features, predicts on test_X and writes Patient ID / prediction pairs to submission.csv.
# NOTE(review): it builds RandomForestClassifier(random_state = 1) with otherwise default
# settings, not the configuration scored on the hold-out split. Confirm that is intended.
# NOTE(review): `id` shadows the Python builtin of the same name; consider renaming it.
# from sklearn.ensemble import RandomForestClassifier
# c = RandomForestClassifier(random_state = 1)
# c.fit(train_X, train_y)

# predictions = c.predict(test_X)
# id = test['Patient ID']

# predictions = np.array(predictions)
# id = np.array(id)
# pdf = pd.DataFrame({"Patient ID": id, "Heart Attack Risk": predictions})
# pdf.to_csv("submission.csv", index=False)
