In [429]:
# Package imports
import pandas as pd 

In [430]:
# Loading the data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [431]:
train.columns


Index(['Patient ID', 'Age', 'Sex', 'Cholesterol', 'Blood Pressure',
       'Heart Rate', 'Diabetes', 'Family History', 'Smoking', 'Obesity',
       'Alcohol Consumption', 'Exercise Hours Per Week', 'Diet',
       'Previous Heart Problems', 'Medication Use', 'Stress Level',
       'Sedentary Hours Per Day', 'Income', 'BMI', 'Triglycerides',
       'Physical Activity Days Per Week', 'Sleep Hours Per Day', 'Country',
       'Continent', 'Hemisphere', 'Heart Attack Risk'],
      dtype='object')

In [432]:
train.isnull().sum()

Patient ID                         0
Age                                0
Sex                                0
Cholesterol                        0
Blood Pressure                     0
Heart Rate                         0
Diabetes                           0
Family History                     0
Smoking                            0
Obesity                            0
Alcohol Consumption                0
Exercise Hours Per Week            0
Diet                               0
Previous Heart Problems            0
Medication Use                     0
Stress Level                       0
Sedentary Hours Per Day            0
Income                             0
BMI                                0
Triglycerides                      0
Physical Activity Days Per Week    0
Sleep Hours Per Day                0
Country                            0
Continent                          0
Hemisphere                         0
Heart Attack Risk                  0
dtype: int64

In [433]:
train.Diet

0       Unhealthy
1         Average
2         Average
3         Healthy
4         Healthy
          ...    
7005    Unhealthy
7006      Healthy
7007      Average
7008    Unhealthy
7009      Average
Name: Diet, Length: 7010, dtype: object

In [434]:
train.dtypes

Patient ID                          object
Age                                  int64
Sex                                 object
Cholesterol                          int64
Blood Pressure                      object
Heart Rate                           int64
Diabetes                             int64
Family History                       int64
Smoking                              int64
Obesity                              int64
Alcohol Consumption                  int64
Exercise Hours Per Week            float64
Diet                                object
Previous Heart Problems              int64
Medication Use                       int64
Stress Level                         int64
Sedentary Hours Per Day            float64
Income                               int64
BMI                                float64
Triglycerides                        int64
Physical Activity Days Per Week      int64
Sleep Hours Per Day                  int64
Country                             object
Continent  

In [435]:
# I want to to make the continents an integer so I assigned each continent a number
train['Continent'] = train['Continent'].map({'Asia': 0, 'Europe': 1, 'Africa': 2, 'South America': 3, 'Oceania': 4, 'North America': 5, 'Other': 6})
# The same thing has to be done with Hemisphere
train['Hemisphere'] = train['Hemisphere'].map({'Northern': 0, 'Southern': 1,})
# Men you're a zero #feminism. 
train["Sex"] = train["Sex"].map({'Male': 0, 'Female': 1})
# Exercise hours per week is an float but I want it to be an integer
train["Exercise Hours Per Week"] = train["Exercise Hours Per Week"].astype(int)
# Diet is object but I want to map it to an integer
train.Diet = train.Diet.map({'Unhealthy': 0, 'Average': 1, 'Healthy': 2})
# Sedentary hours per day is an float but I want it to be an integer. 
train["Sedentary Hours Per Day"] = train["Sedentary Hours Per Day"].astype(int)
# The same is true for BMI
train["BMI"] = train["BMI"].astype(int)

In [436]:
# We need to do the same for the test data
test['Continent'] = test['Continent'].map({'Asia': 0, 'Europe': 1, 'Africa': 2, 'South America': 3, 'Oceania': 4, 'North America': 5, 'Other': 6})
test['Hemisphere'] = test['Hemisphere'].map({'Northern': 0, 'Southern': 1,})
test["Sex"] = test["Sex"].map({'Male': 0, 'Female': 1})
test["Exercise Hours Per Week"] = test["Exercise Hours Per Week"].astype(int)
test.Diet = test.Diet.map({'Unhealthy': 0, 'Average': 1, 'Healthy': 2})
test["Sedentary Hours Per Day"] = test["Sedentary Hours Per Day"].astype(int)
test["BMI"] = test["BMI"].astype(int)

In [437]:
train[['Systolic', 'Diastolic']] = train['Blood Pressure'].str.split('/', expand=True)

# Convert the new columns to numeric type (optional)
train.Systolic = pd.to_numeric(train.Systolic)
train.Diastolic = pd.to_numeric(train.Diastolic)

In [438]:
test[['Systolic', 'Diastolic']] = test['Blood Pressure'].str.split('/', expand=True)

# Convert the new columns to numeric type (optional)
test.Systolic = pd.to_numeric(test.Systolic)
test.Diastolic = pd.to_numeric(test.Diastolic)

In [439]:
train.dtypes

Patient ID                          object
Age                                  int64
Sex                                  int64
Cholesterol                          int64
Blood Pressure                      object
Heart Rate                           int64
Diabetes                             int64
Family History                       int64
Smoking                              int64
Obesity                              int64
Alcohol Consumption                  int64
Exercise Hours Per Week              int64
Diet                                 int64
Previous Heart Problems              int64
Medication Use                       int64
Stress Level                         int64
Sedentary Hours Per Day              int64
Income                               int64
BMI                                  int64
Triglycerides                        int64
Physical Activity Days Per Week      int64
Sleep Hours Per Day                  int64
Country                             object
Continent  

In [440]:
features = ['Age', 'Sex', 'Cholesterol',
       'Heart Rate', 'Diabetes', 'Family History', 'Smoking', 'Obesity',
       'Alcohol Consumption', 'Exercise Hours Per Week', 'Diet',
       'Previous Heart Problems', 'Medication Use', 'Stress Level',
       'Sedentary Hours Per Day', 'Income', 'BMI', 'Triglycerides',
       'Physical Activity Days Per Week', 'Sleep Hours Per Day', "Systolic", "Diastolic"]
train_X = train[features]
train_y = train['Heart Attack Risk']

In [441]:
# I want to to make the continents an integer so I assigned each continent a number
test['Continent'] = test['Continent'].map({'Asia': 0, 'Europe': 1, 'Africa': 2, 'South America': 3, 'Oceania': 4, 'North America': 5, 'Other': 6})
# The same thing has to be done with Hemisphere
test['Hemisphere'] = test['Hemisphere'].map({'Northern': 0, 'Southern': 1,})

test_X = test[features]

In [442]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.2)

from sklearn.ensemble import RandomForestClassifier
c = RandomForestClassifier(random_state = 1)
c.fit(X_train, y_train)

predictions = c.predict(X_test)

In [443]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

conf_matrix = confusion_matrix(y_test, predictions)
acc = accuracy_score(y_test, predictions)
prec = precision_score(y_test, predictions)
rec = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
print(f"Confusion Matrix: {conf_matrix}")
print(f"Accuracy: {acc}")
print(f"Precision: {prec}")
print(f"Recall: {rec}")
print(f"F1 Score: {f1}")

# 0.76
# 0.80
# 0.81

Confusion Matrix: [[876  14]
 [502  10]]
Accuracy: 0.6319543509272468
Precision: 0.4166666666666667
Recall: 0.01953125
F1 Score: 0.03731343283582089


In [444]:
# from sklearn.ensemble import RandomForestClassifier
# c = RandomForestClassifier(random_state = 1)
# c.fit(train_X, train_y)

# predictions = c.predict(test_X)