In [None]:
# importing pandas
import pandas as pd

# importing seaborn
import seaborn as sns

# importing matplotlib
import matplotlib.pyplot as plt

# importing required packages from sklearn
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
import warnings
warnings.filterwarnings("ignore")

# Loading data

In [None]:
# Location of the heart-attack CSV file
DATA_PATH = "/content/heart_attacks.csv"

# Read the CSV into the working DataFrame `df`
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five records of the loaded frame
df.head(n=5)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple
df.shape

In [None]:
# Data type of every column; shows which columns still need numeric recoding
df.dtypes

In [None]:
# Column labels of the frame
df.columns

# changing string types to numerical

In [None]:
# Recode the 'Sex' labels as integers: female -> 0, male -> 1
sex_codes = {'female': 0, 'male': 1}
df['Sex'] = df['Sex'].replace(sex_codes)


In [None]:
# Recode the 'Chest_Pain_Type' labels as the integer codes 1-4
chest_pain_codes = {
    'typical angina': 1,
    'atypical angina': 2,
    'non-anginal pain': 3,
    'asymptomatic': 4,
}
df['Chest_Pain_Type'] = df['Chest_Pain_Type'].replace(chest_pain_codes)

In [None]:
# Recode the 'Resting_ECG_Results' labels as the integer codes 0-2
resting_ecg_codes = {
    'normal': 0,
    'ST-T wave abnormality': 1,
    'left ventricular hypertrophy': 2,
}
df['Resting_ECG_Results'] = df['Resting_ECG_Results'].replace(resting_ecg_codes)

In [None]:
# Recode the 'Exercise_Induced_Angina' flag as integers: no -> 0, yes -> 1
angina_codes = {'no': 0, 'yes': 1}
df['Exercise_Induced_Angina'] = df['Exercise_Induced_Angina'].replace(angina_codes)


In [None]:
# Recode the 'Slope' labels as the integer codes 1-3
slope_codes = {
    'upsloping': 1,
    'flat': 2,
    'downsloping': 3,
}
df['Slope'] = df['Slope'].replace(slope_codes)


In [None]:
# Recode the 'Thalassemia' labels as the integer codes 1-3
thalassemia_codes = {
    'normal': 1,
    'fixed defect': 2,
    'reversible defect': 3,
}
df['Thalassemia'] = df['Thalassemia'].replace(thalassemia_codes)

In [None]:
# Recode the target labels as a binary class: less chance -> 0, more chance -> 1
target_codes = {
    'less chance of heart attack': 0,
    'more chance of heart attack': 1,
}
df['Heart_Attack_Prediction'] = df['Heart_Attack_Prediction'].replace(target_codes)

In [None]:
# Preview the first five records again to inspect the recoded columns
df.head(n=5)

In [None]:
# Per-column medians, used as the fill value for missing entries
column_medians = df.median()

# Fill every missing value with the median of its own column
df = df.fillna(value=column_medians)


# checking null values

In [None]:
# Number of missing values remaining in each column
df.isna().sum()

In [None]:
# Summary statistics for the columns of the frame
df.describe()

# data visualization

### using all columns

In [None]:
# Distribution of the 'Max_Heart_Rate_Achieved' column
hist_ax = sns.histplot(data=df, x='Max_Heart_Rate_Achieved')
hist_ax.set_title('Histogram of Max Heart Rate Achieved')
hist_ax.set_xlabel('Max Heart Rate Achieved')
hist_ax.set_ylabel('Count')
plt.show()

In [None]:
# Cholesterol (x) against Age (y), coloured by the target class
scatter_ax = sns.scatterplot(
    data=df,
    x='Cholesterol',
    y='Age',
    hue='Heart_Attack_Prediction',
    palette=['blue', 'red'],
)
scatter_ax.set_title('Scatter Plot of Cholesterol vs Age')
scatter_ax.set_xlabel('Cholesterol')
scatter_ax.set_ylabel('Age')
plt.show()

In [None]:
# Take every column of the frame into the correlation analysis
column_names = df.columns.tolist()
df_subset = df.loc[:, column_names]

# Pairwise correlation between all selected columns
corr_matrix = df_subset.corr()

# Render the matrix as an annotated heatmap
heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Correlation Matrix')
plt.show()

#### Checking the relationship between each pair of columns

In [None]:
# Grid of pairwise plots over the frame's columns, coloured by the target class
sns.pairplot(
    data=df,
    hue='Heart_Attack_Prediction',
    palette='plasma',
)
plt.show()

# Categorical columns and numerical columns from data

In [None]:
# Columns treated as categorical features
cat_cols = [
    'Sex',
    'Chest_Pain_Type',
    'Fasting_Blood_Sugar',
    'Resting_ECG_Results',
    'Exercise_Induced_Angina',
    'Slope',
    'Thalassemia',
]

# One-hot encode them; drop_first=True omits the first level of each column
df_encoded = pd.get_dummies(data=df, columns=cat_cols, drop_first=True)


In [None]:
# Columns treated as continuous numerical features
num_cols = [
    'Age',
    'Resting_Blood_Pressure',
    'Cholesterol',
    'Max_Heart_Rate_Achieved',
    'Old_Peak',
    'Num_Major_Vessels',
]

# Standardise the numerical columns and write them back into the encoded frame.
# NOTE(review): the scaler is fitted on the whole frame before any train/test
# split, so held-out rows contribute to the scaling statistics - consider
# fitting on the training split only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_encoded[num_cols])
df_encoded[num_cols] = scaled_values


# Splitting data

In [None]:
# Everything except the target column is a feature; the target is the label
target_col = 'Heart_Attack_Prediction'
X = df_encoded.drop(columns=target_col)
y = df_encoded[target_col]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts


# Logistic Regression

In [None]:
# Fit a default logistic regression on the training split
# (fit() returns the fitted estimator, so construction and fitting are chained)
log_reg = LogisticRegression().fit(X_train, y_train)

# Class predictions for the held-out rows
y_pred = log_reg.predict(X_test)


In [None]:
# Per-class report and overall accuracy of the logistic regression predictions
log_reg_report = classification_report(y_test, y_pred)
accuracy_logistic_regression = accuracy_score(y_test, y_pred)

print(log_reg_report)
print(f"Accuracy: {accuracy_logistic_regression}")

In [None]:
# Plot the confusion matrix of the current predictions as an annotated heatmap.
# fmt="d" prints the cell counts as plain integers; the default annotation
# format ('.2g') switches to scientific notation for counts of 100 or more.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()

# Support vector machine

In [None]:
# Build the SVM inputs from the same one-hot-encoded, standardised frame
# (df_encoded) that the other models use, and select the target by name rather
# than by position. Training the SVM on the raw `df` would put its hold-out
# accuracy on a different feature set from the other models, and from the
# SVM's own cross-validation run, making the comparison misleading.
X = df_encoded.drop('Heart_Attack_Prediction', axis=1)
y = df_encoded['Heart_Attack_Prediction']

# Split the data into training and testing sets (same ratio and seed as the
# other models so all classifiers are evaluated on identical rows)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:

# Linear-kernel support vector classifier fitted on the training split
# (fit() returns the fitted estimator, so construction and fitting are chained)
svm = SVC(kernel='linear', random_state=42).fit(X_train, y_train)

# Class predictions for the held-out rows
y_pred = svm.predict(X_test)

In [None]:
# Per-class precision / recall / F1 summary of the current predictions
svm_report = classification_report(y_test, y_pred)
print(svm_report)


In [None]:
# Overall accuracy of the current predictions on the held-out rows
accuracy_SVM = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy_SVM }")

In [None]:
# Plot the confusion matrix of the current predictions as an annotated heatmap.
# fmt="d" prints the cell counts as plain integers; the default annotation
# format ('.2g') switches to scientific notation for counts of 100 or more.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()

# Random Forest 

In [None]:
# Features are every encoded column except the target; the target is the label
target_col = 'Heart_Attack_Prediction'
X = df_encoded.drop(columns=target_col)
y = df_encoded[target_col]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Random forest with 100 trees and a fixed seed, fitted on the training split
rf_params = {'n_estimators': 100, 'random_state': 42}
rfc = RandomForestClassifier(**rf_params)
rfc.fit(X_train, y_train)


In [None]:
# Predict the held-out rows with the random forest
y_pred = rfc.predict(X_test)

# Per-class report and overall accuracy of those predictions
rf_report = classification_report(y_test, y_pred)
accuracy_Randomforest = accuracy_score(y_test, y_pred)

print(rf_report)
print(f"Accuracy: {accuracy_Randomforest}")

In [None]:
# Plot the confusion matrix of the current predictions as an annotated heatmap.
# fmt="d" prints the cell counts as plain integers; the default annotation
# format ('.2g') switches to scientific notation for counts of 100 or more.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()

# cross validation

In [None]:
# Perform 10-fold cross-validation for logistic regression on the current X, y
log_reg_scores = cross_val_score(log_reg, X, y, cv=10)
print("Logistic Regression Mean Accuracy in Cross validation :", round(log_reg_scores.mean(),3))


# Plot the score of each fold. The x positions are derived from the number of
# scores returned, so the plot stays correct if `cv` is changed, and the axis
# is labelled as the fold index (it is not a count of folds).
fold_ids = range(1, len(log_reg_scores) + 1)
plt.plot(fold_ids, log_reg_scores, marker='o')
plt.xticks(fold_ids)
plt.xlabel('Fold')
plt.ylabel('Accuracy')
plt.title('Cross-Validation Scores for logistic regression')
plt.show()

In [None]:
# Perform 10-fold cross-validation for the SVM on the current X, y
svm_scores = cross_val_score(svm, X, y, cv=10)
print("SVM Mean Accuracy in Cross validation :", round(svm_scores.mean(),3))

# Plot the score of each fold. The x positions are derived from the number of
# scores returned, so the plot stays correct if `cv` is changed, and the axis
# is labelled as the fold index (it is not a count of folds).
fold_ids = range(1, len(svm_scores) + 1)
plt.plot(fold_ids, svm_scores, marker='o')
plt.xticks(fold_ids)
plt.xlabel('Fold')
plt.ylabel('Accuracy')
plt.title('Cross-Validation Scores for SVM')
plt.show()

In [None]:
# Perform 10-fold cross-validation for the random forest on the current X, y
rfc_scores = cross_val_score(rfc, X, y, cv=10)
print("Random Forest Mean Accuracy in Cross validation : " ,round(rfc_scores.mean(),3))


# Plot the score of each fold. The x positions are derived from the number of
# scores returned, so the plot stays correct if `cv` is changed, and the axis
# is labelled as the fold index (it is not a count of folds).
fold_ids = range(1, len(rfc_scores) + 1)
plt.plot(fold_ids, rfc_scores, marker='o')
plt.xticks(fold_ids)
plt.xlabel('Fold')
plt.ylabel('Accuracy')
plt.title('Cross-Validation Scores for Random Forest')
plt.show()

# Accuracy Scores

In [None]:

# Model names paired (by position) with their hold-out accuracy
models = ['Random Forest', 'SVM', 'Logistic Regression']
accuracy_scores = [accuracy_Randomforest, accuracy_SVM, accuracy_logistic_regression]

# Bar chart comparing the accuracy of the three models
bar_ax = plt.gca()
bar_ax.bar(models, accuracy_scores)
bar_ax.set_title('Accuracy Scores of ML Models')
bar_ax.set_xlabel('Models')
bar_ax.set_ylabel('Accuracy')

# Write each rounded score at half the height of its bar
for position, score in zip(range(len(accuracy_scores)), accuracy_scores):
    bar_ax.text(position, score/2, str(round(score, 3)), ha='center', va='bottom')

plt.show()

### Based on the accuracy scores, the best-performing model for heart attack prediction is Logistic Regression, with an accuracy of 0.902.

# cross validation score

In [None]:
# Model names paired (by position) with their per-fold cross-validation scores
models = ['Random Forest', 'SVM', 'Logistic Regression']
cr_v_scores = [rfc_scores, svm_scores, log_reg_scores]

# One box per model, summarising the spread of its fold scores
plt.boxplot(cr_v_scores)

# Boxes sit at positions 1..N; label each with its model name
tick_positions = list(range(1, len(models) + 1))
plt.xticks(tick_positions, models)

plt.title('Cross validation Scores of ML Models')
plt.xlabel('Models')
plt.ylabel('Cross Validation Scores')
plt.show()
