In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder

# Load dataset
df = pd.read_csv('df_cleaned.csv')

# One-hot encode the 'Race' variable
encoder = OneHotEncoder()
# Ensure the encoder returns a dense array by setting sparse=False
race_encoded = encoder.fit_transform(df[['Race']]).toarray()

# Create a DataFrame from the encoded data with the correct indices
race_encoded_df = pd.DataFrame(race_encoded,
                               columns=encoder.get_feature_names_out(['Race']),
                               index=df.index)

# Combine the new DataFrame with the original DataFrame, drop the original 'Race' column    
df = pd.concat([df.drop('Race', axis=1), race_encoded_df], axis=1)

# Define features X and target y
X = df.drop('HeartDisease', axis=1)  # assuming 'HeartDisease' is your target variable
y = df['HeartDisease']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a logistic regression model
model = LogisticRegression(max_iter=1000)  # Increase max_iter if the model doesn't converge

# Fit the model
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Print the classification report and confusion matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Print the model coefficients for the 'Race' features
race_features = [col for col in X.columns if 'Race_' in col]
coefficients = pd.DataFrame(model.coef_.flatten(), index=X.columns, columns=['Coefficient'])
print(coefficients.loc[race_features])


              precision    recall  f1-score   support

           0       0.92      0.99      0.95     54108
           1       0.53      0.10      0.17      5327

    accuracy                           0.91     59435
   macro avg       0.73      0.55      0.56     59435
weighted avg       0.88      0.91      0.88     59435

[[53632   476]
 [ 4784   543]]
        Coefficient
Race_0    -0.411845
Race_1    -0.670684
Race_2    -0.894141
Race_3    -0.392075
Race_4    -0.621122
Race_5    -0.447537


In [None]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
X = df.drop('HeartDisease', axis=1)  # All other columns are features
y = df['HeartDisease']  # 'HeartDisease' is the target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Create the Random Forest classifier
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model on the training data
random_forest.fit(X_train, y_train)

# Predict on the test data
y_pred = random_forest.predict(X_test)

# Evaluate the model's performance
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


In [None]:
import numpy as np
# Get the feature importances
importances = random_forest.feature_importances_

# Map these importances to the corresponding feature names
feature_names = np.array(df.drop('HeartDisease', axis=1).columns)
importance_dict = dict(zip(feature_names, importances))

# Convert to a pandas DataFrame for easy viewing
importance_df = pd.DataFrame.from_dict(importance_dict, orient='index', columns=['Importance'])

# Sort the DataFrame to show the most important features at the top
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Print the feature importances
print(importance_df)