In [None]:
import numpy as np
import pandas as pd
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from google.colab import files
uploaded = files.upload()
#Importing the dataset
dataset = pd.read_csv('Dataset1.csv')
print(dataset.head())

Saving Dataset1.csv to Dataset1.csv
   HeartDiseaseorAttack  HighBP  HighChol  CholCheck  BMI  Smoker  Stroke  \
0                     0       1         1          1   40       1       0   
1                     0       0         0          0   25       1       0   
2                     0       1         1          1   28       0       0   
3                     0       1         0          1   27       0       0   
4                     0       1         1          1   24       0       0   

   Diabetes  PhysActivity  Fruits  ...  AnyHealthcare  NoDocbcCost  GenHlth  \
0         0             0       0  ...              1            0        5   
1         0             1       0  ...              0            1        3   
2         0             0       1  ...              1            1        5   
3         0             1       1  ...              1            0        2   
4         0             1       1  ...              1            0        2   

   MentHlth  PhysHlth  Dif

In [None]:
# Replace 2 with 1 in the Diabetes column
dataset['Diabetes'] = dataset['Diabetes'].replace(2, 1)

In [None]:
# Preprocessing
numerical_indices = ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income']
scaler = MinMaxScaler()
dataset[numerical_indices] = scaler.fit_transform(dataset[numerical_indices])

In [None]:
# Define features and targets for multi-classification
selected_features = ['GenHlth', 'BMI', 'Age', 'HighBP', 'PhysHlth', 'Income', 'HighChol', 'Education', 'DiffWalk', 'MentHlth', 'Sex']
X = dataset[selected_features]
y = dataset[['HeartDiseaseorAttack', 'Diabetes', 'Stroke']]

In [None]:
# Chicking the imbalance classes
print(y['HeartDiseaseorAttack'].value_counts())
print(y['Diabetes'].value_counts())
print(y['Stroke'].value_counts())

HeartDiseaseorAttack
0    229787
1     23893
Name: count, dtype: int64
Diabetes
0    213703
1     39977
Name: count, dtype: int64
Stroke
0    243388
1     10292
Name: count, dtype: int64


In [None]:
from imblearn.over_sampling import SMOTE
import pandas as pd

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Resample for HeartDiseaseorAttack
X_resampled_heart, y_resampled_heart = smote.fit_resample(X, y['HeartDiseaseorAttack'])

# Resample for Diabetes (use original X for each resampling)
X_resampled_diabetes, y_resampled_diabetes = smote.fit_resample(X, y['Diabetes'])

# Resample for Stroke (use original X for each resampling)
X_resampled_stroke, y_resampled_stroke = smote.fit_resample(X, y['Stroke'])

# Combine resampled targets into a DataFrame
y_resampled = pd.DataFrame({
    'HeartDiseaseorAttack': y_resampled_heart,
    'Diabetes': y_resampled_diabetes,
    'Stroke': y_resampled_stroke})



In [None]:
print("HeartDiseaseorAttack distribution after SMOTE:")
print(y_resampled['HeartDiseaseorAttack'].value_counts())

print("Diabetes distribution after SMOTE:")
print(y_resampled['Diabetes'].value_counts())

print("Stroke distribution after SMOTE:")
print(y_resampled['Stroke'].value_counts())

HeartDiseaseorAttack distribution after SMOTE:
HeartDiseaseorAttack
0.0    229787
1.0    229787
Name: count, dtype: int64
Diabetes distribution after SMOTE:
Diabetes
0.0    213703
1.0    213703
Name: count, dtype: int64
Stroke distribution after SMOTE:
Stroke
0    243388
1    243388
Name: count, dtype: int64


In [None]:
# Combine resampled targets into a DataFrame
y_resampled = pd.DataFrame({
    'HeartDiseaseorAttack': y_resampled_heart,
    'Diabetes': y_resampled_diabetes,
    'Stroke': y_resampled_stroke})

In [None]:
X_resampled = X_resampled_stroke  # Use features from the largest resampling

In [None]:
# Verify alignment
print("Features shape:", X_resampled.shape)
print("Targets shape:", y_resampled.shape)

Features shape: (486776, 11)
Targets shape: (486776, 3)


In [None]:
# Convert targets to numeric
print("Before cleaning:", y_resampled.dtypes)
y_resampled = y_resampled.apply(pd.to_numeric, errors='coerce')  # Ensure numeric
y_resampled = y_resampled.fillna(0)  # Replace NaN values with 0
y_resampled = y_resampled.astype(int)  # Convert to integers
print("After cleaning:", y_resampled.dtypes)

Before cleaning: HeartDiseaseorAttack    float64
Diabetes                float64
Stroke                    int64
dtype: object
After cleaning: HeartDiseaseorAttack    int64
Diabetes                int64
Stroke                  int64
dtype: object


In [None]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42)

In [None]:
# Train the multi-output classifier based on decision tree
base_model = DecisionTreeClassifier(max_depth=10, random_state=42)
multi_target_model = MultiOutputClassifier(base_model)
multi_target_model.fit(X_train, y_train)

In [None]:
# Making a prediction
y_pred = multi_target_model.predict(X_test)

In [None]:
# Visualizing the classification report for each target
from sklearn.metrics import classification_report
print("HeartDiseaseorAttack Report:")
print(classification_report(y_test['HeartDiseaseorAttack'], y_pred[:, 0]))

print("Diabetes Report:")
print(classification_report(y_test['Diabetes'], y_pred[:, 1]))

print("Stroke Report:")
print(classification_report(y_test['Stroke'], y_pred[:, 2]))

HeartDiseaseorAttack Report:
              precision    recall  f1-score   support

           0       0.82      0.81      0.82     51472
           1       0.79      0.81      0.80     45884

    accuracy                           0.81     97356
   macro avg       0.81      0.81      0.81     97356
weighted avg       0.81      0.81      0.81     97356

Diabetes Report:
              precision    recall  f1-score   support

           0       0.79      0.71      0.75     54661
           1       0.68      0.76      0.72     42695

    accuracy                           0.74     97356
   macro avg       0.73      0.74      0.73     97356
weighted avg       0.74      0.74      0.74     97356

Stroke Report:
              precision    recall  f1-score   support

           0       0.83      0.87      0.85     48689
           1       0.86      0.82      0.84     48667

    accuracy                           0.85     97356
   macro avg       0.85      0.85      0.85     97356
weighted avg 

In [None]:
# User input for prediction
def get_user_input():
    print("Provide the following details for prediction:")
    user_input = {
        "HighBP": int(input("Enter High Blood Pressure (0 for No, 1 for Yes): ")),
        "HighChol": int(input("Enter High Cholesterol (0 for No, 1 for Yes): ")),
        "BMI": float(input("Enter BMI: ")),
        "DiffWalk": int(input("Enter Difficulty Walking (0 for No, 1 for Yes): ")),
        "Sex": int(input("Enter Sex (0 for Female, 1 for Male): ")),
        "Age": int(input("Enter Age (in years): ")),
        "Education": int(input("Enter Education Level (1-6, 1 being never attended school): ")),
        "Income": int(input("Enter Income Level (1-8, 1 being lowest income): ")),
        "GenHlth": int(input("Enter General Health (1-5, 1 being excellent): ")),
        "MentHlth": int(input("Enter Mental Health (0-30, 0 being no mental health issues): ")),
        "PhysHlth": int(input("Enter Physical Health (0-30, 0 being no physical health issues): ")),}
    return user_input

# Convert user input into a DataFrame
user_input_data = get_user_input()
user_input_df = pd.DataFrame([user_input_data])

# Scale numerical inputs
numerical_columns = ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income']
user_input_df[numerical_columns] = scaler.transform(user_input_df[numerical_columns])

# Predict the outcomes
predictions = multi_target_model.predict(user_input_df[selected_features])

# Map predictions to disease outcomes
outcomes = {
    "HeartDiseaseorAttack": "Heart Disease or Attack",
    "Diabetes": "Diabetes",
    "Stroke": "Stroke"}

print("\nPredictions:")
for idx, disease in enumerate(outcomes.keys()):
    result = "High Risk" if predictions[0][idx] == 1 else "Low Risk"
    print(f"{outcomes[disease]}: {result}")

Provide the following details for prediction:
Enter High Blood Pressure (0 for No, 1 for Yes): 1
Enter High Cholesterol (0 for No, 1 for Yes): 1
Enter BMI: 25
Enter Difficulty Walking (0 for No, 1 for Yes): 0
Enter Sex (0 for Female, 1 for Male): 0
Enter Age (in years): 9
Enter Education Level (1-6, 1 being never attended school): 5
Enter Income Level (1-8, 1 being lowest income): 3
Enter General Health (1-5, 1 being excellent): 2
Enter Mental Health (0-30, 0 being no mental health issues): 15
Enter Physical Health (0-30, 0 being no physical health issues): 5

Predictions:
Heart Disease or Attack: Low Risk
Diabetes: Low Risk
Stroke: Low Risk
