# Parameters Analysis

## Random Forest

In [99]:
# Rank the model inputs by the importance score the fitted forest assigns to each column of X.
features = X.columns
importance = rf.feature_importances_

# One row per feature, ordered from the highest importance to the lowest.
df_importance = (
    pd.DataFrame({'features': features, 'importance': importance})
    .sort_values(by='importance', ascending=False)
)

In [100]:
# Horizontal bar chart (Plotly Express) of the 20 highest-ranked features.
# The top-20 slice is re-sorted ascending before it is handed to the chart.
top_features = df_importance.head(20).sort_values(by='importance', ascending=True)

# The colour scale is anchored at 0 and capped at the largest plotted value.
# The previous fixed range of [-1, 1] left the negative half of the scale unused
# (importance scores are assumed non-negative, as for sklearn-style
# `feature_importances_` -- confirm for other model types), which made the bars
# almost indistinguishable in colour.
importance_max = top_features['importance'].max()

fig = px.bar(
    top_features,
    x='importance',
    y='features',
    orientation='h',
    color='importance',
    color_continuous_scale='RdYlGn',      # Red to Green color scale
    range_color=[0, importance_max],      # Spread the scale over the plotted values
    text='importance'                     # Display importance values as text
)
fig.update_layout(
    height=600,
    width=800,
    title='Top 20 Feature Importances',
    xaxis_title='Importance',
    yaxis_title='Features',
    # Ticks are left to Plotly so they follow the data-driven colour range.
    coloraxis_colorbar=dict(title="Importance")
)

# Customize text for better readability
fig.update_traces(
    texttemplate='%{text:.2f}',  # Format the text to two decimal places
    textposition='inside'        # Place the text inside the bars
)
fig.show()

In [101]:
# Score every row of X with the fitted forest and compare the predicted risk
# between the two outcome groups.
df['HadHeartAttack'] = y

# Column 1 of predict_proba is stored as the risk score
# (presumably the probability of the 'Yes' class -- confirm against rf.classes_).
risk_scores = rf.predict_proba(X)[:, 1]
df['Risk'] = risk_scores

# Histogram of the risk score, coloured by outcome, with a marginal box plot.
fig = px.histogram(
    df,
    x='Risk',
    color='HadHeartAttack',
    marginal='box',
    title='Risk Distribution',
)
fig.update_layout(height=600, width=800)
fig.show()

In [102]:
# Distribution of the target variable before and after balancing.
# One horizontal stacked bar per dataset ('Original', 'SMOTE', 'NearMiss'),
# with the 'No' segment drawn in green and the 'Yes' segment in red.
class_labels = ['No', 'Yes']

# fill_value=0 keeps a class that is absent from a dataset in the table as an
# explicit zero count, without the NaN -> float round trip of reindex().fillna(0).
before_counts = df['HadHeartAttack'].value_counts().reindex(class_labels, fill_value=0)
after_counts_sm = y_sm.value_counts().reindex(class_labels, fill_value=0)
after_counts_nm = y_nm.value_counts().reindex(class_labels, fill_value=0)

# Wide table: one row per dataset, one column per class.
data = pd.DataFrame({
    'Status': ['NearMiss (y_nm)', 'SMOTE (y_sm)', 'Original (df)'],
    'No': [after_counts_nm['No'], after_counts_sm['No'], before_counts['No']],
    'Yes': [after_counts_nm['Yes'], after_counts_sm['Yes'], before_counts['Yes']],
})

# Long format: one row per (dataset, class) pair, as expected by px.bar's `color` argument.
data_melted = data.melt(id_vars='Status', var_name='HadHeartAttack', value_name='Count')

fig = px.bar(
    data_melted,
    y='Status',
    x='Count',
    color='HadHeartAttack',
    # Explicit colours implement the intended 'No' = green / 'Yes' = red encoding;
    # without this mapping Plotly falls back to its default colour sequence.
    color_discrete_map={'No': 'green', 'Yes': 'red'},
    orientation='h',
    title='Distribution of Target Variable Before and After Balancing',
)
fig.update_layout(
    barmode='stack',
    xaxis_title='Count',
    yaxis_title='Dataset Status'
)
fig.show()

## SHAP Values

In [None]:
import fasttreeshap
import shap

# Build a FastTreeSHAP explainer for the fitted forest using the 'v2' algorithm.
explainer = fasttreeshap.TreeExplainer(rf, algorithm='v2')

# check_additivity is an argument of shap_values(), not of the explainer
# constructor; passing it here is what actually skips the additivity check.
shap_values = explainer.shap_values(X, check_additivity=False)

# Bar-type summary plot of the SHAP values over the columns of X.
shap.summary_plot(shap_values, X, plot_type='bar')

In [None]:
from pdpbox import pdp, info_plots

# Isolated partial-dependence object for the 'BMI' feature, built on a copy of
# the training frame with 50 grid points.
# NOTE(review): n_classes=0 is passed although rf is used through predict_proba
# elsewhere in this notebook -- confirm this is the intended pdpbox setting.
pdp_dist = pdp.PDPIsolate(
    model=rf,
    df=X_train.copy(),
    model_features=X_train.columns,
    feature='BMI',
    feature_name='BMI',
    n_classes=0,
    num_grid_points=50,
)

In [None]:
# Draw the BMI partial-dependence result with the matplotlib engine, with
# plot_lines enabled; the returned figure and axes are kept for later use.
fig, axes = pdp_dist.plot(
    engine='matplotlib',
    plot_lines=True,
)

![BMI](BMI%20Risk%20Factoring.png)