<a href="https://colab.research.google.com/github/Pali29/Heart_Disease_ML_model/blob/main/Heart_Disease.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Handling**

## **Imports**

In [112]:
from google.colab import files
import pandas as pd
import seaborn as sns
import numpy as np

# Location of the heart-disease CSV in the ML-datasets GitHub repository.
HEART_CSV_URL = "https://raw.githubusercontent.com/Pali29/ML-datasets/main/heart.csv"

# Raw, untouched frame; later cells derive cleaned copies from it.
df = pd.read_csv(filepath_or_buffer=HEART_CSV_URL)

## **Data Cleaning**

### **Missing Values and Duplicates**

In [113]:
# Order the rows by Age. The later interpolate() call fills gaps from
# neighbouring rows, so this ordering decides which rows act as neighbours.
new_df = df.sort_values(by='Age')

# Zeros in RestingBP and Cholesterol are turned into NaN so they can be
# filled in afterwards. `np.nan` is used because the `np.NaN` alias was
# removed in NumPy 2.0 and raises AttributeError there.
new_df = new_df.replace({'RestingBP': 0, 'Cholesterol': 0}, np.nan)

In [114]:
# Fill the NaN gaps by linear interpolation between neighbouring rows.
# The original call passed `nethod="Age"`, a misspelling of `method`; it was
# swallowed by **kwargs and ignored, so the call always performed the default
# linear interpolation. The method is now spelled out explicitly.
# Only numeric columns are interpolated: interpolating object-dtype columns
# is deprecated in recent pandas releases.
numeric_columns = new_df.select_dtypes(include="number").columns
filled_df = new_df.copy()
filled_df[numeric_columns] = new_df[numeric_columns].interpolate(method="linear")

# Remove rows that are exact duplicates of an earlier row.
new_df = filled_df.drop_duplicates()
#sns.boxplot(new_df['Cholesterol'])


### **Outliers**

In [115]:
# Retain only the rows whose Cholesterol lies in the closed range [110, 400].
# Rows where Cholesterol is still NaN fail the check and are dropped as well.
cholesterol_in_range = new_df['Cholesterol'].between(110, 400)
processed_df = new_df[cholesterol_in_range]
# Optional visual check of the RestingBP spread (left disabled):
#sns.boxplot(new_df['RestingBP'])

In [116]:
# Optional visual check of the MaxHR spread (left disabled):
#sns.boxplot(processed_df['MaxHR'])

# Keep rows with RestingBP inside [90, 170] and MaxHR of at least 70.
resting_bp_in_range = processed_df['RestingBP'].between(90, 170)
max_hr_high_enough = processed_df['MaxHR'].ge(70)
processed_df = processed_df[resting_bp_in_range & max_hr_high_enough]

# **Model Design**

## **Dividing Into Features and Target**

In [117]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# The HeartDisease column is the target; every remaining column is a feature.
X = processed_df.drop(columns="HeartDisease")
Y = processed_df["HeartDisease"]

# Columns named here are one-hot encoded; all other feature columns are
# forwarded unchanged (remainder="passthrough").
categoricalfeatures = ["ChestPainType", "Sex", "RestingECG", "ExerciseAngina", "ST_Slope"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    transformers=[("One_Hot", one_hot, categoricalfeatures)],
    remainder="passthrough",
)
transformed_X = transformer.fit_transform(X)

## **Splitting Into Test and Training**

In [118]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    transformed_X,
    Y,
    random_state=65,
    test_size=0.2,
)

## **Regression**

### **Linear Regression**

In [119]:
from sklearn.linear_model import LinearRegression

# Linear model with default settings, fitted on the training split only.
lr = LinearRegression()
lr.fit(X=X_train, y=Y_train)

In [120]:
# Linear-model predictions for the training split, then the test split.
Y_lr_train_pred, Y_lr_test_pred = (
    lr.predict(features) for features in (X_train, X_test)
)

### **Random Forest**

In [121]:
from sklearn.ensemble import RandomForestRegressor

# Forest of trees capped at depth 3. A random forest is a randomised
# estimator, so without a seed its fit (and every metric computed from it)
# changes from run to run. The seed 65 matches the one used for the
# train/test split so the whole notebook is reproducible.
rf = RandomForestRegressor(max_depth=3, random_state=65)
rf.fit(X_train, Y_train)

In [122]:
# Random-forest predictions for the training split, then the test split.
Y_rf_train_pred, Y_rf_test_pred = (
    rf.predict(features) for features in (X_train, X_test)
)

## **Evaluate Performance**

In [123]:
from sklearn.metrics import mean_squared_error, r2_score

# Mean squared error and R^2 of the linear model on the training split.
lr_train_mse, lr_train_r2 = (
    mean_squared_error(y_true=Y_train, y_pred=Y_lr_train_pred),
    r2_score(y_true=Y_train, y_pred=Y_lr_train_pred),
)

# The same two metrics on the held-out test split.
lr_test_mse, lr_test_r2 = (
    mean_squared_error(y_true=Y_test, y_pred=Y_lr_test_pred),
    r2_score(y_true=Y_test, y_pred=Y_lr_test_pred),
)

In [124]:
# Mean squared error and R^2 of the random forest on the training split.
rf_train_mse, rf_train_r2 = (
    mean_squared_error(y_true=Y_train, y_pred=Y_rf_train_pred),
    r2_score(y_true=Y_train, y_pred=Y_rf_train_pred),
)

# The same two metrics on the held-out test split.
rf_test_mse, rf_test_r2 = (
    mean_squared_error(y_true=Y_test, y_pred=Y_rf_test_pred),
    r2_score(y_true=Y_test, y_pred=Y_rf_test_pred),
)

# **Results**

In [125]:
# Column layout shared by both per-model summary frames.
result_columns = ["Method", "MSE_Training", "R2_Training", "MSE_Test", "R2_Test"]

# One single-row frame per model. Building each frame row-wise (instead of
# transposing a one-column frame) lets the metric columns keep a numeric
# dtype rather than all becoming object dtype.
lr_results = pd.DataFrame(
    [["Linear Regression", lr_train_mse, lr_train_r2, lr_test_mse, lr_test_r2]],
    columns=result_columns,
)
rf_results = pd.DataFrame(
    [["Random Forest", rf_train_mse, rf_train_r2, rf_test_mse, rf_test_r2]],
    columns=result_columns,
)

# ignore_index=True numbers the rows 0, 1. The earlier `result.reset_index()`
# call discarded its return value, so both rows kept the index label 0.
result = pd.concat([lr_results, rf_results], axis=0, ignore_index=True)
result

Unnamed: 0,Method,MSE_Training,R2_Training,MSE_Test,R2_Test
0,Linear Regression,0.105447,0.571673,0.107143,0.571427
0,Random Forest,0.098051,0.601714,0.09718,0.611279
