<a href="https://colab.research.google.com/github/PRIYANSHUJAINJECRC/Heart_disease_prediction/blob/main/HeartDisease.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **ðŸŽ¯ Project Objective**
## Predict whether a patient has heart disease based on medical attributes, using machine learning, while minimizing medically dangerous errors.

In [2]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


# **Problem Statement**
## The objective of this project is to predict the presence of heart disease in patients based on clinical and demographic features using machine learning techniques. The focus is on building a reliable classification model with appropriate evaluation metrics, as false negatives in medical diagnosis can be critical.

In [3]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
heart_disease = fetch_ucirepo(id=45)

# data (as pandas dataframes)
X = heart_disease.data.features
y = heart_disease.data.targets

# metadata
print(heart_disease.metadata)

# variable information
print(heart_disease.variables)
print("Features (X):")
print(X.head())
print("\nTargets (y):")
print(y.head())


{'uci_id': 45, 'name': 'Heart Disease', 'repository_url': 'https://archive.ics.uci.edu/dataset/45/heart+disease', 'data_url': 'https://archive.ics.uci.edu/static/public/45/data.csv', 'abstract': '4 databases: Cleveland, Hungary, Switzerland, and the VA Long Beach', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 303, 'num_features': 13, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['num'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1989, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C52P4X', 'creators': ['Andras Janosi', 'William Steinbrunn', 'Matthias Pfisterer', 'Robert Detrano'], 'intro_paper': {'ID': 231, 'type': 'NATIVE', 'title': 'International application of a new probability algorithm for the diagnosis of coronary artery disease.', 'authors': 'R. Detrano, A. JÃ¡nosi, W. Steinbrunn, 

In [4]:
###Task B
print(y.value_counts())
y_binary=(y>0).astype(int)
print(y_binary.value_counts())


num
0      164
1       55
2       36
3       35
4       13
Name: count, dtype: int64
num
0      164
1      139
Name: count, dtype: int64


In [5]:
import pandas as pd

# Rename the 'num' column in y_binary to 'target'
target_df = y_binary.rename(columns={'num': 'target'})

# Concatenate X and the correctly named target_df
df = pd.concat([X, target_df], axis=1)

print(df.head())
print(df.shape)
print(df.columns)
print(df.isnull().sum())

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   1       145   233    1        2      150      0      2.3      3   
1   67    1   4       160   286    0        2      108      1      1.5      2   
2   67    1   4       120   229    0        2      129      1      2.6      2   
3   37    1   3       130   250    0        0      187      0      3.5      3   
4   41    0   2       130   204    0        2      172      0      1.4      1   

    ca  thal  target  
0  0.0   6.0       0  
1  3.0   3.0       1  
2  2.0   7.0       1  
3  0.0   3.0       0  
4  0.0   3.0       0  
(303, 14)
Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
target      0
dtype: int

In [6]:
print(df['target'].value_counts(normalize=True)*100)
print(df.describe())
print(df.groupby('target')['age'].mean())
print(df.groupby('target')['chol'].mean())
print(df.groupby('target')['trestbps'].mean())
print(df.groupby('target')['thalach'].mean())
print(df.groupby('target')['thalach'].mean())
print(df.groupby('target')['sex'].mean())
print(df.groupby('target')['cp'].mean())
print(df.groupby('target')['thal'].mean())
print(df.groupby('target')['ca'].mean())
##Sex is the most related feature


target
0    54.125413
1    45.874587
Name: proportion, dtype: float64
              age         sex          cp    trestbps        chol         fbs  \
count  303.000000  303.000000  303.000000  303.000000  303.000000  303.000000   
mean    54.438944    0.679868    3.158416  131.689769  246.693069    0.148515   
std      9.038662    0.467299    0.960126   17.599748   51.776918    0.356198   
min     29.000000    0.000000    1.000000   94.000000  126.000000    0.000000   
25%     48.000000    0.000000    3.000000  120.000000  211.000000    0.000000   
50%     56.000000    1.000000    3.000000  130.000000  241.000000    0.000000   
75%     61.000000    1.000000    4.000000  140.000000  275.000000    0.000000   
max     77.000000    1.000000    4.000000  200.000000  564.000000    1.000000   

          restecg     thalach       exang     oldpeak       slope          ca  \
count  303.000000  303.000000  303.000000  303.000000  303.000000  299.000000   
mean     0.990099  149.607261    0.326

In [18]:
##Task D
df['AgeGroup']=pd.cut(df['age'],bins=[0,11,21,df['age'].max() + 1],labels=['Child','Adult','Senior'], right=False)
df['CholesterolRisk']=(df['chol']>=240).astype(int)
df['BloodPressureRisk']=(df['trestbps']>=140).astype(int)
df['exercise_related_heart_stress'] = df['oldpeak'] / df['thalach']

# Impute missing values in 'ca' with its median
df['ca'].fillna(df['ca'].median(), inplace=True)

# Impute missing values in 'thal' with its mode
df['thal'].fillna(df['thal'].mode()[0], inplace=True)

# Handle any remaining missing AgeGroup values by filling with the most frequent category
df['AgeGroup'].fillna(df['AgeGroup'].mode()[0], inplace=True)

print(df[['oldpeak', 'thalach', 'exercise_related_heart_stress']].head())
print(df.head())
print(df.groupby('target')['AgeGroup'].value_counts())
print(df.groupby('target')['CholesterolRisk'].value_counts())
print(df.groupby('target')['BloodPressureRisk'].value_counts())
print(df.groupby('target')['exercise_related_heart_stress'].value_counts())
print("Missing values after imputation:")
print(df.isnull().sum())

   oldpeak  thalach  exercise_related_heart_stress
0      2.3      150                       0.015333
1      1.5      108                       0.013889
2      2.6      129                       0.020155
3      3.5      187                       0.018717
4      1.4      172                       0.008140
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   1       145   233    1        2      150      0      2.3      3   
1   67    1   4       160   286    0        2      108      1      1.5      2   
2   67    1   4       120   229    0        2      129      1      2.6      2   
3   37    1   3       130   250    0        0      187      0      3.5      3   
4   41    0   2       130   204    0        2      172      0      1.4      1   

    ca  thal  target AgeGroup  CholesterolRisk  BloodPressureRisk  \
0  0.0   6.0       0   Senior                0                  1   
1  3.0   3.0       1   Senior                1                  1   


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['ca'].fillna(df['ca'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['thal'].fillna(df['thal'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setti

In [21]:
y=df['target']
x=df.drop(columns=['target'])
num_col=x.select_dtypes(include=['int64','float64']).columns
cat_col=x.select_dtypes(include=['object','category']).columns
print(num_col)
print(cat_col)
print(x)

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'CholesterolRisk',
       'BloodPressureRisk', 'exercise_related_heart_stress'],
      dtype='object')
Index(['AgeGroup'], dtype='object')
     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0     63    1   1       145   233    1        2      150      0      2.3   
1     67    1   4       160   286    0        2      108      1      1.5   
2     67    1   4       120   229    0        2      129      1      2.6   
3     37    1   3       130   250    0        0      187      0      3.5   
4     41    0   2       130   204    0        2      172      0      1.4   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
298   45    1   1       110   264    0        0      132      0      1.2   
299   68    1   4       144   193    1        0      141      0      3.4   
300   57    1   4       130   131    0        0    

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,precision_score,recall_score,f1_score
import matplotlib.pyplot as plt
import seaborn as sns

model=Pipeline(steps=[('preprocessor',ColumnTransformer(transformers=[('num',StandardScaler(),num_col),('cat',OneHotEncoder(),cat_col)])),('classifier',LogisticRegression(max_iter=1000))])
x_train,x_test,y_train,y_test=train_test_split(x,y_binary.squeeze(),test_size=0.2,random_state=42)
model.fit(x_train,y_train)
y_pred=model.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(precision_score(y_test,y_pred))
print(recall_score(y_test,y_pred))

0.8360655737704918
[[24  5]
 [ 5 27]]
              precision    recall  f1-score   support

           0       0.83      0.83      0.83        29
           1       0.84      0.84      0.84        32

    accuracy                           0.84        61
   macro avg       0.84      0.84      0.84        61
weighted avg       0.84      0.84      0.84        61

0.84375
0.84375


In [23]:
model_=Pipeline(steps=[('preprocessor',ColumnTransformer(transformers=[('num',StandardScaler(),num_col),('cat',OneHotEncoder(),cat_col)])),('classifier',LogisticRegression(max_iter=1000,class_weight='balanced'))])
x_train,x_test,y_train,y_test=train_test_split(x,y_binary.squeeze(),test_size=0.2,random_state=42)
model_.fit(x_train,y_train)
y_pred_=model_.predict(x_test)
print(accuracy_score(y_test,y_pred_))
print(confusion_matrix(y_test,y_pred_))
print(classification_report(y_test,y_pred_))
print(precision_score(y_test,y_pred_))
print(recall_score(y_test,y_pred_))

0.8688524590163934
[[24  5]
 [ 3 29]]
              precision    recall  f1-score   support

           0       0.89      0.83      0.86        29
           1       0.85      0.91      0.88        32

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61

0.8529411764705882
0.90625


In [24]:
print(model_.predict_proba(x_test))


[[0.20706479 0.79293521]
 [0.1818227  0.8181773 ]
 [0.16846323 0.83153677]
 [0.47292999 0.52707001]
 [0.28947191 0.71052809]
 [0.14827058 0.85172942]
 [0.02308793 0.97691207]
 [0.00341159 0.99658841]
 [0.71439481 0.28560519]
 [0.36291422 0.63708578]
 [0.93548184 0.06451816]
 [0.98044062 0.01955938]
 [0.18939243 0.81060757]
 [0.01202348 0.98797652]
 [0.00338229 0.99661771]
 [0.91702913 0.08297087]
 [0.96072845 0.03927155]
 [0.28066015 0.71933985]
 [0.00115036 0.99884964]
 [0.88376076 0.11623924]
 [0.1764381  0.8235619 ]
 [0.92168307 0.07831693]
 [0.00527858 0.99472142]
 [0.94071779 0.05928221]
 [0.00514709 0.99485291]
 [0.96951245 0.03048755]
 [0.6211149  0.3788851 ]
 [0.10228289 0.89771711]
 [0.08859678 0.91140322]
 [0.45596205 0.54403795]
 [0.7601539  0.2398461 ]
 [0.35593484 0.64406516]
 [0.97824056 0.02175944]
 [0.53649954 0.46350046]
 [0.78465069 0.21534931]
 [0.27932138 0.72067862]
 [0.01569259 0.98430741]
 [0.41428403 0.58571597]
 [0.0069483  0.9930517 ]
 [0.80611025 0.19388975]


In [26]:
positive_class_probabilities = model_.predict_proba(x_test)[:, 1]
print("Probabilities for the positive class:")
print(positive_class_probabilities)

Probabilities for the positive class:
[0.79293521 0.8181773  0.83153677 0.52707001 0.71052809 0.85172942
 0.97691207 0.99658841 0.28560519 0.63708578 0.06451816 0.01955938
 0.81060757 0.98797652 0.99661771 0.08297087 0.03927155 0.71933985
 0.99884964 0.11623924 0.8235619  0.07831693 0.99472142 0.05928221
 0.99485291 0.03048755 0.3788851  0.89771711 0.91140322 0.54403795
 0.2398461  0.64406516 0.02175944 0.46350046 0.21534931 0.72067862
 0.98430741 0.58571597 0.9930517  0.19388975 0.98700503 0.30683987
 0.9117747  0.04762202 0.26895359 0.94131036 0.12430773 0.17941468
 0.85233517 0.841808   0.33804159 0.02620832 0.0262289  0.1781316
 0.9590998  0.20212201 0.04709275 0.97560795 0.92995536 0.95279634
 0.15667743]


In [29]:
custom_threshold = 0.4
custom_predictions = (positive_class_probabilities >= custom_threshold).astype(int)

print(f"Predictions using a custom threshold of {custom_threshold}:")
print(custom_predictions)

# You can also evaluate these custom predictions against y_test if needed
# from sklearn.metrics import classification_report
# print(classification_report(y_test, custom_predictions))

Predictions using a custom threshold of 0.4:
[1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 0 0 1 1 0 1 0 1 0 1 0 0 1 1 1 0 1 0 1 0 1 1
 1 1 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 1 0 0 1 1 1 0]
