In [61]:
# Import necessary libraries
import sqlite3  # For database interactions
import pandas as pd  # For handling and analyzing data

# Import machine learning tools
from sklearn.preprocessing import StandardScaler  # For feature scaling
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  # For model evaluation

# Import machine learning models
from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets
from sklearn.linear_model import LogisticRegression  # Logistic Regression model
from sklearn.ensemble import RandomForestClassifier  # Random Forest classifier
from sklearn.neural_network import MLPClassifier  # Classification Neural Network

# Import TensorFlow / Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.optimizers import Adam


In [62]:
# GET DF FROM DB

# Connect to .db file (or create it if it doesn't exist)
conn = sqlite3.connect("../heart.db")

query = "SELECT * FROM heart_attack_risk"

# Load the query result into a DataFrame
df = pd.read_sql_query(query, conn)

conn.close()

df.head()

Unnamed: 0,Age,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,Previous Heart Problems,...,Physical Activity Days Per Week,Sleep Hours Per Day,Heart Attack Risk,Systolic Pressure,Diastolic Pressure,Sex_Female,Sex_Male,Diet_Average,Diet_Healthy,Diet_Unhealthy
0,67,208,72,0,0,1,0,0,4,0,...,0,6,0,158,88,0,1,1,0,0
1,21,389,98,1,1,1,1,1,1,1,...,1,7,0,165,93,0,1,0,0,1
2,21,324,72,1,0,0,0,0,2,1,...,4,4,0,174,99,1,0,0,1,0
3,84,383,73,1,1,1,0,1,9,1,...,3,4,0,163,100,0,1,1,0,0
4,66,318,93,1,1,1,1,0,5,1,...,1,5,0,91,88,0,1,0,0,1


In [None]:
# IS target column balanced?
df['Heart Attack Risk'].value_counts()

Heart Attack Risk
0    5624
1    3139
Name: count, dtype: int64

In [None]:
def createModel(model_type, feature_importance=0):
    """
    Trains and evaluates a classification model for predicting heart attack risk.

    Parameters:
    -----------
    model_type : str
        The type of model to train. Accepted values:
        - 'LogisticRegression' : Logistic Regression model.
        - 'RandomForestClassifier' : Random Forest classifier.
        - 'MLPClassifier' : Multi-Layer Perceptron (Neural Network).
        - 'TensorFlowNN' : A basic TensorFlow (Keras) Neural Network.

    feature_importance : int, optional (default=0)
        If set to 1 and `model_type` is 'RandomForestClassifier', returns feature importance 
        as a Pandas Series.

    Process:
    --------
    1. Splits the dataset into features (X) and target variable (y).
    2. Scales the features using `StandardScaler`.
    3. Splits the data into training and testing sets (70% train, 30% test).
    4. Initializes and trains the specified model with predefined hyperparameters:
        - `LogisticRegression`: Uses `max_iter=1000`.
        - `RandomForestClassifier`: Uses `n_estimators=500`, `class_weight='balanced'`, `random_state=42`.
        - `MLPClassifier`: Uses `hidden_layer_sizes=(256, 128, 64, 32)`, `activation='relu'`, `solver='adam'`, and `max_iter=1000`.
        - `TensorFlowNN`: A simple Keras Sequential model with a few Dense layers.
    5. Evaluates the model using accuracy, classification report, and confusion matrix.
    6. If `RandomForestClassifier` is used and `feature_importance=1`, returns a Pandas Series 
       containing feature importance values sorted in descending order.

    Returns:
    --------
    float
        The accuracy score of the trained model.

    pd.Series (optional)
        If `model_type` is 'RandomForestClassifier' and `feature_importance=1`, returns a 
        Pandas Series with feature names as the index and their importance scores.

    Raises:
    -------
    ValueError
        If an invalid `model_type` is provided.

    Dependencies:
    -------------
    - Requires a pre-defined pandas DataFrame `df` containing the dataset.
    - Requires the following imports:
        ```python
        from sklearn.model_selection import train_test_split
        from sklearn.preprocessing import StandardScaler
        from sklearn.linear_model import LogisticRegression
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.neural_network import MLPClassifier
        from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
        import pandas as pd

        # For TensorFlow model
        import tensorflow as tf
        from tensorflow.keras.models import Sequential
        from tensorflow.keras.layers import Dense
        ```

    Example Usage:
    --------------
    >>> createModel('LogisticRegression')
    >>> createModel('RandomForestClassifier', feature_importance=1)
    >>> createModel('MLPClassifier')
    >>> createModel('TensorFlowNN')
    """

    # Separate into X and y
    X = df.drop('Heart Attack Risk', axis=1)
    y = df['Heart Attack Risk']

    # scale
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    #Train Test Split
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.3, random_state=1, stratify=y)

    model = None

    # MODEL SELECTION
    if model_type == 'LogisticRegression':
        model = LogisticRegression(max_iter=2000)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    elif model_type == 'RandomForestClassifier':
        model = RandomForestClassifier(n_estimators=1000, class_weight='balanced', random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    elif model_type == 'MLPClassifier':
        model = MLPClassifier(hidden_layer_sizes=(512, 256, 128, 64, 32), activation='relu', solver='adam', max_iter=2000)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    elif model_type == 'TensorFlowNN':
        # Build a simple Dense model
        

        # Build model
        model = Sequential([
            Dense(256, input_dim=X_train.shape[1]),
            LeakyReLU(alpha=0.1),
            BatchNormalization(),
            Dropout(0.3),

            Dense(128),
            LeakyReLU(alpha=0.1),
            BatchNormalization(),
            Dropout(0.3),

            Dense(64),
            LeakyReLU(alpha=0.1),
            BatchNormalization(),
            Dropout(0.2),

            Dense(32, activation='relu'),
            Dense(1, activation='sigmoid')  # Binary classification
        ])

        # Compile model with custom learning rate
        optimizer = Adam(learning_rate=0.0005)
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])


        model.fit(X_train, y_train, epochs=200, batch_size=64, validation_split=0.2, verbose=0)


        # Predictions
        y_pred_prob = model.predict(X_test)
        y_pred = (y_pred_prob > 0.5).astype("int32").flatten()

    else:
        raise ValueError("Invalid model type")

    # Evaluate the model
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))
    print("Confusion Matrix:")
    print('True Negative    False Positive')
    print(confusion_matrix(y_test, y_pred))
    print('False Negative   True Positive')

    # Return feature importance if needed
    if model_type == 'RandomForestClassifier' and feature_importance == 1:
        importance_series = pd.Series(
            model.feature_importances_, 
            index=X.columns
        ).sort_values(ascending=False)
        return importance_series



In [65]:
# Logistic Regression Model
createModel('LogisticRegression')

Accuracy: 0.64
Classification Report:
              precision    recall  f1-score   support

           0       0.64      1.00      0.78      1687
           1       0.00      0.00      0.00       942

    accuracy                           0.64      2629
   macro avg       0.32      0.50      0.39      2629
weighted avg       0.41      0.64      0.50      2629

Confusion Matrix:
True Negative    False Positive
[[1687    0]
 [ 942    0]]
False Negative   True Positive


In [66]:
# Random Forest Model
importance_series = createModel('RandomForestClassifier',feature_importance=1)

Accuracy: 0.64
Classification Report:
              precision    recall  f1-score   support

           0       0.64      1.00      0.78      1687
           1       0.33      0.00      0.00       942

    accuracy                           0.64      2629
   macro avg       0.49      0.50      0.39      2629
weighted avg       0.53      0.64      0.50      2629

Confusion Matrix:
True Negative    False Positive
[[1685    2]
 [ 941    1]]
False Negative   True Positive


In [67]:
# Remove low importance columns

# Define a threshold for low importance (e.g., below 0.02)
low_importance_threshold = 0.02

# Select features with importance below the threshold
drop_columns = importance_series[importance_series < low_importance_threshold].index.tolist()

print(drop_columns)

df.drop(columns=drop_columns, inplace=True, axis=1)
df.head()

['Previous Heart Problems', 'Family History', 'Alcohol Consumption', 'Medication Use', 'Diabetes', 'Obesity', 'Diet_Unhealthy', 'Diet_Average', 'Diet_Healthy', 'Sex_Female', 'Sex_Male', 'Smoking']


Unnamed: 0,Age,Cholesterol,Heart Rate,Exercise Hours Per Week,Stress Level,Sedentary Hours Per Day,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Heart Attack Risk,Systolic Pressure,Diastolic Pressure
0,67,208,72,4,9,6,31,286,0,6,0,158,88
1,21,389,98,1,1,4,27,235,1,7,0,165,93
2,21,324,72,2,9,9,28,587,4,4,0,174,99
3,84,383,73,9,9,7,36,378,3,4,0,163,100
4,66,318,93,5,6,1,21,231,1,5,0,91,88


In [68]:
#Try Random Forest again
createModel('RandomForestClassifier')

Accuracy: 0.64
Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.99      0.78      1687
           1       0.46      0.01      0.02       942

    accuracy                           0.64      2629
   macro avg       0.55      0.50      0.40      2629
weighted avg       0.58      0.64      0.51      2629

Confusion Matrix:
True Negative    False Positive
[[1674   13]
 [ 931   11]]
False Negative   True Positive


In [69]:
# Try Classification Neural Network
createModel('MLPClassifier')

Accuracy: 0.54
Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.68      0.66      1687
           1       0.35      0.30      0.32       942

    accuracy                           0.54      2629
   macro avg       0.49      0.49      0.49      2629
weighted avg       0.53      0.54      0.54      2629

Confusion Matrix:
True Negative    False Positive
[[1144  543]
 [ 655  287]]
False Negative   True Positive


In [70]:
# Finally, call the new TensorFlow NN model
createModel('TensorFlowNN')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Accuracy: 0.60
Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.86      0.73      1687
           1       0.35      0.14      0.20       942

    accuracy                           0.60      2629
   macro avg       0.50      0.50      0.47      2629
weighted avg       0.54      0.60      0.54      2629

Confusion Matrix:
True Negative    False Positive
[[1443  244]
 [ 809  133]]
False Negative   True Positive
