In [16]:
import pandas as pd

df = pd.read_csv('dataset/lifestyle.csv')
df.drop(df.columns[0], axis=1, inplace=True)
print(df.head())


   ID  Age  Gender  Height_cm  Weight_kg    BMI  Daily_Steps  Calories_Intake  \
0   1   56    Male        164         81  30.72         5134             1796   
1   2   69    Male        156         82  20.86        12803             1650   
2   3   46  Female        158         65  30.93        16408             1756   
3   4   32    Male        197         87  31.19        18420             2359   
4   5   60    Male        157         63  29.37        17351             2556   

   Hours_of_Sleep  Heart_Rate Blood_Pressure  Exercise_Hours_per_Week Smoker  \
0             8.6         102         137/72                      8.1     No   
1             4.5         103         129/65                      3.7     No   
2             4.3          74         127/68                      3.2    Yes   
3             4.1         116         125/86                      8.5     No   
4             5.1         111         100/64                      8.5    Yes   

   Alcohol_Consumption_per_Week 

In [17]:
import pandas as pd

df = pd.read_csv('dataset/lifestyle.csv')

# Drop the first column if it's unnecessary
df.drop(df.columns[0], axis=1, inplace=True)

# Split Blood_Pressure into Systolic and Diastolic columns
bp_split = df['Blood_Pressure'].str.split('/', expand=True)
df['Systolic_BP'] = pd.to_numeric(bp_split[0])
df['Diastolic_BP'] = pd.to_numeric(bp_split[1])

# Drop the original Blood_Pressure column
df.drop('Blood_Pressure', axis=1, inplace=True)

# Encode categorical columns
df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1})
df['Smoker'] = df['Smoker'].map({'No': 0, 'Yes': 1})
df['Diabetic'] = df['Diabetic'].map({'No': 0, 'Yes': 1})
df['Heart_Disease'] = df['Heart_Disease'].map({'No': 0, 'Yes': 1})
df['Risk_Level'] = df['Risk_Level'].map({'Low Risk': 0, 'Moderate Risk': 1, 'High Risk': 2})

# Print the first few rows of the modified dataframe
print(df.head())


   ID  Age  Gender  Height_cm  Weight_kg    BMI  Daily_Steps  Calories_Intake  \
0   1   56       0        164         81  30.72         5134             1796   
1   2   69       0        156         82  20.86        12803             1650   
2   3   46       1        158         65  30.93        16408             1756   
3   4   32       0        197         87  31.19        18420             2359   
4   5   60       0        157         63  29.37        17351             2556   

   Hours_of_Sleep  Heart_Rate  Exercise_Hours_per_Week  Smoker  \
0             8.6         102                      8.1       0   
1             4.5         103                      3.7       0   
2             4.3          74                      3.2       1   
3             4.1         116                      8.5       0   
4             5.1         111                      8.5       1   

   Alcohol_Consumption_per_Week  Diabetic  Heart_Disease  Risk_Level  \
0                             7         0   

In [18]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['Risk_Level'])
Y = df['Risk_Level']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

y_train_pred_rf = rf_model.predict(X_train)
y_test_pred_rf = rf_model.predict(X_test)


accuracy_train = accuracy_score(y_train, y_train_pred_rf)
accuracy_test = accuracy_score(y_test, y_test_pred_rf)
print(f"Accuracy: {accuracy_train*100:.2f}%") # 100%
print(f"Accuracy: {accuracy_test*100:.2f}%") # 85%


Accuracy: 100.00%
Accuracy: 85.00%


In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],       # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],       # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],       # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],         # Minimum number of samples required to be at a leaf node
}

rf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

best_rf_model = grid_search.best_estimator_

y_train_pred_best_rf = best_rf_model.predict(X_train)
y_test_pred_best_rf = best_rf_model.predict(X_test)

accuracy_train_best = accuracy_score(y_train, y_train_pred_best_rf)
accuracy_test_best = accuracy_score(y_test, y_test_pred_best_rf)

print(f"Best Hyperparameters: {best_params}")
print(f"Training Accuracy: {accuracy_train_best*100:.2f}%")
print(f"Testing Accuracy: {accuracy_test_best*100:.2f}%")


Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_de