**APPENDIX**

**IMPORTING LIBRARIES**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

**DATA COLLECTION AND PROCESSING**

In [None]:
# Load the raw dataset from the Colab working directory.
csv_path = '/content/Heart_attack_prediction_.csv'
data = pd.read_csv(csv_path)

In [None]:
# Show the freshly loaded DataFrame as the cell output.
data

In [None]:
# Print column names, dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics (by default, of the numeric columns).
data.describe()

In [None]:
# Preview the first rows.
data.head()

In [None]:
# Count missing values per column before any imputation.
data.isna().sum()

**DATA VISUALISATION**

In [None]:
##histogram

In [None]:
def plot_age_histogram(data):
    """Draw a 30-bin histogram with a KDE overlay of the non-null ``age`` values.

    Args:
        data: Object indexable by ``'age'`` whose result supports ``.dropna()``
            (a pandas DataFrame at the call site in this notebook).

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    ages = data['age'].dropna()
    sns.histplot(ages, bins=30, kde=True, color='skyblue', ax=ax)
    ax.set_title('Age Distribution of Patients')
    ax.set_xlabel('Age')
    ax.set_ylabel('Number of Patients')
    ax.grid(True, linestyle='--', alpha=0.5)
    fig.tight_layout()
    plt.show()

In [None]:
# Render the age histogram for the full dataset.
plot_age_histogram(data)

In [None]:
def plot_cholesterol_level_histogram(data):
    """Draw a 30-bin histogram with a KDE overlay of the non-null ``cholesterol_level`` values.

    Args:
        data: Object indexable by ``'cholesterol_level'`` whose result supports
            ``.dropna()`` (a pandas DataFrame at the call site in this notebook).

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    cholesterol = data['cholesterol_level'].dropna()
    sns.histplot(cholesterol, bins=30, kde=True, color='skyblue', ax=ax)
    ax.set_title('cholesterol_level Distribution of Patients')
    ax.set_xlabel('cholesterol_level')
    ax.set_ylabel('Frequency')
    ax.grid(True, linestyle='--', alpha=0.5)
    fig.tight_layout()
    plt.show()

In [None]:
# Render the cholesterol-level histogram for the full dataset.
plot_cholesterol_level_histogram(data)

In [None]:
def plot_fasting_blood_sugar_histogram(data):
    """Draw a 30-bin histogram with a KDE overlay of the non-null ``fasting_blood_sugar`` values.

    Args:
        data: Object indexable by ``'fasting_blood_sugar'`` whose result supports
            ``.dropna()`` (a pandas DataFrame at the call site in this notebook).

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    blood_sugar = data['fasting_blood_sugar'].dropna()
    sns.histplot(blood_sugar, bins=30, kde=True, color='skyblue', ax=ax)
    ax.set_title('fasting_blood_sugar Distribution of Patients')
    ax.set_xlabel('fasting_blood_sugar')
    ax.set_ylabel('Frequency')
    ax.grid(True, linestyle='--', alpha=0.5)
    fig.tight_layout()
    plt.show()

In [None]:
# Render the fasting-blood-sugar histogram for the full dataset.
plot_fasting_blood_sugar_histogram(data)

In [None]:
#pie chart

In [None]:
# Gender split among patients who had a heart attack.
# Relies on `heart_attack` still holding the raw 'Yes'/'No' strings.
had_attack = data['heart_attack'] == 'Yes'
heart_attack_gender = data.loc[had_attack, 'gender'].value_counts()

fig, ax = plt.subplots(figsize=(5, 5))
heart_attack_gender.plot.pie(
    ax=ax,
    labels=heart_attack_gender.index,
    colors=['#87CEEB', '#FFB6C1'],
    autopct='%1.1f%%',
    startangle=90,
)
ax.set_title('Gender Distribution Among Heart Attack Patients')
ax.set_ylabel('')  # hide the default series-name label
fig.tight_layout()
plt.show()

In [None]:
# Keep only the patients who had a heart attack
# (relies on `heart_attack` still holding the raw 'Yes'/'No' strings).
heart_attack_df = data.loc[data['heart_attack'] == 'Yes']

# Count how often each physical-activity level occurs among those patients.
physical_activity_counts = heart_attack_df['physical_activity'].value_counts()

# Pie chart of the physical-activity levels.
fig, ax = plt.subplots(figsize=(6, 6))
physical_activity_counts.plot.pie(
    ax=ax,
    labels=physical_activity_counts.index,
    colors=sns.color_palette('Pastel1'),
    wedgeprops={'edgecolor': 'black'},
    autopct='%1.1f%%',
    startangle=90,
)
ax.set_title('physical_activity_Level Distribution Among Heart Attack Patients')
ax.set_ylabel('')  # hide the default series-name label
fig.tight_layout()
plt.show()

In [None]:
#boxplot

In [None]:
# Compare the age spread of patients with and without a heart attack.
ax = sns.boxplot(x='heart_attack', y='age', data=data, palette='pastel')
ax.set_title('Age Distribution by Heart Attack Status')
ax.set_xlabel('Heart Attack')
ax.set_ylabel('Age')
plt.show()

In [None]:
# Wide-form box plot over the whole frame, with column names rotated for legibility.
# NOTE(review): the columns are unscaled at this point, so boxes share one axis
# despite presumably different units — confirm this is the intended view.
sns.boxplot(data=data)
plt.xticks(rotation=90)
plt.show()

In [None]:
#countplot

In [None]:
# Heart-attack counts split by hypertension status.
ax = sns.countplot(x='hypertension', hue='heart_attack', data=data, palette='viridis')
ax.set_title('Hypertension vs Heart Attack')
ax.set_xlabel('Hypertension')
ax.set_ylabel('Count')
ax.grid(linestyle=':', color='red')
plt.tight_layout()
plt.show()

In [None]:
# Heart-attack counts split by previous heart disease.
ax = sns.countplot(x='previous_heart_disease', hue='heart_attack', data=data, palette='viridis')
ax.set_title('previous_heart_disease vs heart_attack')
ax.set_xlabel('previous_heart_disease')
ax.set_ylabel('Count')
ax.grid(linestyle=':', color='red')
plt.tight_layout()
plt.show()

In [None]:
#donut chart

In [None]:
# Restrict to diabetic patients and tally their heart-attack outcomes.
diabetic_df = data.loc[data['diabetes'] == 'Yes']
diabetic_heart_attack_counts = diabetic_df['heart_attack'].value_counts()

# A wedge width below 1 leaves the centre empty, which gives the donut shape.
donut_wedges = {'width': 0.4}
fig, ax = plt.subplots(figsize=(6, 6))
wedges, texts, autotexts = ax.pie(
    diabetic_heart_attack_counts,
    labels=diabetic_heart_attack_counts.index,
    wedgeprops=donut_wedges,
    autopct='%1.1f%%',
    startangle=90,
)
ax.set_title('Heart Attack Distribution Among Diabetic Individuals')
plt.show()

**PREPROCESSING**

**HANDLING MISSING VALUES (using normal levels)**

In [None]:
## Handle missing values (fill with normal levels)

In [None]:
# Impute missing lab values with fixed reference levels, one per column.
# NOTE(review): 90 and 100 are presumably "normal" clinical values — confirm source and units.
normal_values = dict(fasting_blood_sugar=90, cholesterol_ldl=100)
data.fillna(value=normal_values, inplace=True)

In [None]:
# Re-count missing values per column after imputation.
# `isna` is a method: it must be called before `.sum()` can be chained on its result.
data.isna().sum()

**DROPPING UNIMPORTANT FEATURES**

In [None]:
# Remove the columns this notebook treats as unimportant for modelling.
dropped_features = [
    "region",
    "income_level",
    "waist_circumference",
    "family_history",
    "participated_in_free_screening",
]
data = data.drop(columns=dropped_features)

In [None]:
# Show the frame after the column drop.
data

**ENCODING**

**LABEL ENCODING (BINARY COLUMNS)**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns encoded to integer codes with a label encoder.
binary_cols = [
    'diabetes',
    'hypertension',
    'obesity',
    'dietary_habits',
    'EKG_results',
    'previous_heart_disease',
    'medication_usage',
]

# One encoder object is refitted for every column, so after the loop `le`
# only remembers the classes of the last column.
le = LabelEncoder()
for col in binary_cols:
    encoded = le.fit_transform(data[col])
    data[col] = encoded

In [None]:
# Show the frame after label encoding.
data

In [None]:
# Encode the target as 0/1. Any value other than 'No'/'Yes' maps to NaN,
# so re-running this cell on already-encoded data would blank the column.
heart_attack_codes = {'No': 0, 'Yes': 1}
data['heart_attack'] = data['heart_attack'].map(heart_attack_codes)

In [None]:
# Encode gender as 0/1. Any value other than 'Male'/'Female' maps to NaN,
# so re-running this cell on already-encoded data would blank the column.
gender_codes = {'Male': 0, 'Female': 1}
data['gender'] = data['gender'].map(gender_codes)

**ORDINAL ENCODING (FOR ORDERED COLUMNS)**

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Category order for each ordinal feature; a category's position in the list is its code.
ordinal_cols = {
    "smoking_status": ["Never", "Past", "Current"],
    "alcohol_consumption": ["No", "Moderate", "High"],
    "physical_activity": ["Low", "Moderate", "High"],
    "air_pollution_exposure": ["Low", "Moderate", "High"],
    "stress_level": ["Low", "Moderate", "High"],
}

# Each column gets its own encoder built from that column's explicit ordering.
for col in ordinal_cols:
    oe = OrdinalEncoder(categories=[ordinal_cols[col]])
    column_frame = data[[col]]  # the encoder is given a one-column frame
    data[col] = oe.fit_transform(column_frame)

In [None]:
# Show the frame after ordinal encoding.
data

**SCALING**

In [None]:
from sklearn.preprocessing import RobustScaler

# Continuous features that get robust scaling.
num_cols = [
    'sleep_hours',
    'age',
    'cholesterol_level',
    'blood_pressure_diastolic',
    'fasting_blood_sugar',
    'blood_pressure_systolic',
    'cholesterol_hdl',
    'cholesterol_ldl',
    'triglycerides',
]

# NOTE(review): the scaler is fitted on the whole frame here, before the
# train/test split further down, so test rows influence the scaling
# statistics — consider fitting on the training rows only.
data = data.copy()
scaler = RobustScaler()
scaled_values = scaler.fit_transform(data[num_cols])
data[num_cols] = scaled_values

# Show results
print(data.head())

**BALANCED OR NOT?**

In [None]:
# Share of each target class, as a percentage of all rows.
total_samples = len(data)
class_share = data['heart_attack'].value_counts(normalize=True)
class_percentages = class_share * 100
print(class_percentages)

In [None]:
##Fairly balanced

In [None]:
# Separate the target (y) from the predictors (x).
target_col = "heart_attack"
y = data[target_col]
x = data.drop(columns=[target_col])

**SPLITTING THE DATASET**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

**MODELLING**

**LOGISTIC REGRESSION**

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a default-configured logistic regression, then predict the held-out rows.
lr = LogisticRegression().fit(x_train, y_train)
y_pred = lr.predict(x_test)

In [None]:
# Show the logistic-regression predictions for the test rows.
y_pred

In [None]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    ConfusionMatrixDisplay,
)

# Logistic regression: confusion matrix and accuracy on the test split.
lr_cm = confusion_matrix(y_test, y_pred)
lr_acc = accuracy_score(y_test, y_pred)
print('confusion_matrix=', lr_cm)
print('Accuracy=', lr_acc)

In [None]:
# Plot the logistic-regression confusion matrix.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(cm)
disp.plot()
plt.show()

**DECISION TREE CLASSIFIER**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth- and leaf-size-limited entropy tree with balanced class weights and a fixed seed.
tree_params = dict(
    criterion='entropy',
    max_depth=10,
    min_samples_split=20,
    min_samples_leaf=20,
    class_weight='balanced',
    random_state=42,
)
model1 = DecisionTreeClassifier(**tree_params)
model1.fit(x_train, y_train)

In [None]:
# Decision-tree predictions for the test rows; the bare name displays them.
y_pred_1  = model1.predict(x_test)
y_pred_1

In [None]:
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay

# Decision tree: print the confusion matrix; the accuracy is shown as the cell's output.
tree_cm = confusion_matrix(y_test, y_pred_1)
print('confusion_matrix=', tree_cm)
accuracy_score(y_test, y_pred_1)

In [None]:
# Plot the decision-tree confusion matrix.
cm = confusion_matrix(y_test, y_pred_1)
disp = ConfusionMatrixDisplay(cm)
disp.plot()
plt.show()

**RANDOM FOREST CLASSIFIER**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with otherwise default hyper-parameters.
# The seed makes the bootstrap samples and feature sub-sampling repeatable, so the
# reported metrics do not change between runs; 42 matches the seed already used
# for the train/test split and the decision tree.
model2 = RandomForestClassifier(random_state=42)
model2.fit(x_train, y_train)

In [None]:
# Candidate random-forest hyper-parameters.
# NOTE(review): no cell visible in this notebook passes `param_grid` to a search
# (e.g. a grid search) — it appears to be unused; confirm and either wire it up or drop it.
param_grid = {
    'n_estimators': [100, 200, 300,400,500],
    'max_depth': [None,5,10,15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

In [None]:
# Random-forest predictions for the test rows; the bare name displays them.
y_pred2=model2.predict(x_test)
y_pred2

In [None]:
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay

# Random forest: confusion matrix and accuracy on the test split.
forest_cm = confusion_matrix(y_test, y_pred2)
forest_acc = accuracy_score(y_test, y_pred2)
print('confusion_matrix=', forest_cm)
print("Accuracy Score:", forest_acc)


In [None]:
# Plot the random-forest confusion matrix.
cm = confusion_matrix(y_test, y_pred2)
disp = ConfusionMatrixDisplay(cm)
disp.plot()
plt.show()

**XGBOOST**

In [None]:
import xgboost as xgb

# Gradient-boosted trees: 500 boosting rounds, depth-10 trees, learning rate 0.05.
xgb_params = {
    'n_estimators': 500,
    'learning_rate': 0.05,
    'max_depth': 10,
}
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(x_train, y_train)

In [None]:
# XGBoost predictions for the test rows; the bare name displays them.
y_pred3 = xgb_model.predict(x_test)
y_pred3

In [None]:
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay

# XGBoost: confusion matrix and accuracy on the test split.
xgb_cm = confusion_matrix(y_test, y_pred3)
xgb_acc = accuracy_score(y_test, y_pred3)
print('confusion_matrix=', xgb_cm)
print("Accuracy:", xgb_acc)

In [None]:
# Plot the XGBoost confusion matrix.
cm = confusion_matrix(y_test, y_pred3)
disp = ConfusionMatrixDisplay(cm)
disp.plot()
plt.show()

**CATBOOST**

In [None]:
!pip install catboost

In [None]:
from catboost import CatBoostClassifier

In [None]:
# CatBoost classifier: 500 iterations of depth-6 trees, learning rate 0.1, log-loss objective.
# verbose=100 presumably limits training log lines to every 100th iteration — confirm.
model4 = CatBoostClassifier(
    iterations=500,
    depth=6,
    learning_rate=0.1,
    loss_function='Logloss',
    verbose=100,
)

In [None]:
# Train the CatBoost model on the training split.
model4.fit(x_train,y_train)

In [None]:
# CatBoost predictions for the test rows; the bare name displays them.
y_pred4 =model4.predict(x_test)
y_pred4

In [None]:
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay

# CatBoost: confusion matrix and accuracy on the test split.
cat_cm = confusion_matrix(y_test, y_pred4)
cat_acc = accuracy_score(y_test, y_pred4)
print('confusion_matrix=', cat_cm)
print("Accuracy:", cat_acc)

In [None]:
# Plot the CatBoost confusion matrix.
cm = confusion_matrix(y_test, y_pred4)
disp = ConfusionMatrixDisplay(cm)
disp.plot()
plt.show()

**COMPARING ACCURACY OF DIFFERENT MODELS**

In [None]:
# Compare the test-set accuracy of all five models in one table.
# The accuracies are computed here from the predictions made in the model
# sections, because no earlier cell stores them under a name.
alg = ['LogisticRegression', 'DecisionTree', 'RandomForest', 'XGBoost', 'CatBoost']
model_predictions = [y_pred, y_pred_1, y_pred2, y_pred3, y_pred4]  # same order as `alg`
acc = [accuracy_score(y_test, preds) for preds in model_predictions]

# Build the table before scaling it: accuracy is a fraction, shown here as a percentage.
Accuracy_Scores = pd.DataFrame({'Algorithm': alg, 'Accuracy': acc})
Accuracy_Scores['Accuracy'] = Accuracy_Scores['Accuracy'] * 100
Accuracy_Scores

**PREDICTION**

In [None]:
import pickle

# Persist the trained CatBoost model to the working directory, then display it.
with open("model4.pkl", mode="wb") as model_file:
    pickle.dump(model4, model_file)
model4

In [None]:
# Predict one row chosen by position from the full feature frame `x`.
# `x` holds every row (it is what the train/test split was drawn from), so this
# row may have been seen during training.
row_position = 158353
sample_input = x.iloc[[row_position]]  # double brackets keep a one-row DataFrame
prediction = model4.predict(sample_input)
print(prediction)

In [None]:
# Predict the row at position 1 of the full feature frame `x`
# (`x` includes training rows, so this row may have been seen during training).
row_position = 1
sample_input = x.iloc[[row_position]]  # double brackets keep a one-row DataFrame
prediction = model4.predict(sample_input)
print(prediction)