<a href="https://colab.research.google.com/github/PuramaNandana/_AI-Powered-Application-for-Early-Detection-of-Heart-Disease-Risk--/blob/main/_AI_Powered_Application_for_Early_Detection_of_Heart_Disease_Risk_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# run this first cell (Colab already has these, but safe to run)
!pip install --quiet pandas scikit-learn matplotlib seaborn joblib



Imports & helper variables

In [None]:
import os
import io
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib      # for saving scaler



Upload dataset to Colab

In [None]:
from google.colab import files

# Opens the Colab upload widget; the result maps each uploaded file name to
# that file's raw bytes.
uploaded = files.upload()               # choose heart.csv from your machine
if not uploaded:
    # With nothing uploaded (presumably when the dialog is dismissed) there
    # is no first key to take; fail with an actionable message instead of an
    # opaque IndexError.
    raise ValueError("No file was uploaded; re-run this cell and choose heart.csv.")

# Only the first uploaded file is used.
fname = next(iter(uploaded))
print("Uploaded:", fname)

# load the CSV into a dataframe straight from the uploaded bytes
df = pd.read_csv(io.BytesIO(uploaded[fname]))
print("Loaded", fname, "-> shape:", df.shape)


Saving heart.csv to heart.csv
Uploaded: heart.csv
Loaded heart.csv -> shape: (1025, 14)


Inspect columns, dtypes, and missing-value counts

In [None]:
# Quick structural overview of the loaded frame.
column_names = df.columns.tolist()
print("Columns:", column_names)

# Rich preview of the leading rows.
preview = df.head()
display(preview)

# Dtypes and non-null counts; info() writes its own report.
print("\nInfo:")
df.info()

# Per-column tally of missing entries.
missing_per_column = df.isnull().sum()
print("\nMissing values per column:")
print(missing_per_column)


Columns: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0



Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB

Missing values per column:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0

Load the data extracted from reports (heart.csv)

In [None]:
import pandas as pd

# Load from CSV (replace with your path, e.g., 'heart.csv')
csv_path = "heart.csv"
df = pd.read_csv(csv_path)

# Plain-text preview of the first rows, followed by the (rows, columns) shape.
print("First 5 rows:", df.head(), sep="\n")
print("Shape:", df.shape)



First 5 rows:
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  target  
0   2     3       0  
1   0     3       0  
2   0     3       0  
3   1     3       0  
4   3     2       0  
Shape: (1025, 14)


Step 2: Handle Missing / Noisy Values

In [None]:

# Step 2: Handle Missing / Noisy Values
#
# Impute missing entries column by column: numeric columns get their mean,
# non-numeric columns get their most frequent value. Null counts are printed
# before and after so the effect is visible.
import numpy as np

print("Missing values before:\n", df.isnull().sum())

# Numeric → fill with mean.
# Assign the result back instead of calling fillna(..., inplace=True) on
# df[col]: that form operates on an intermediate object, emits a
# FutureWarning, and stops updating df under pandas 3.0 Copy-on-Write.
# NOTE(review): this loop also covers integer-coded columns and `target`,
# where a mean may not be a meaningful fill value — confirm if the data ever
# contains gaps there.
for col in df.select_dtypes(include=np.number).columns:
    df[col] = df[col].fillna(df[col].mean())

# Categorical → fill with mode.
for col in df.select_dtypes(exclude=np.number).columns:
    modes = df[col].mode()
    # mode() is empty when the column holds no non-null values; there is
    # nothing to impute from, so leave such a column untouched rather than
    # failing on modes[0].
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

print("Missing values after:\n", df.isnull().sum())


Missing values before:
 age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64
Missing values after:
 age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


Step 3: Normalize & Scale Features

In [None]:

from sklearn.preprocessing import StandardScaler

# Separate the label column from the predictor columns.
y = df["target"]               # labels
X = df.drop(columns="target")  # features

# Learn per-feature scaling statistics from X, then apply them to X.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/validation/test split, so held-out rows influence the scaling
# statistics — consider fitting on the training split only.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Show the first sample before and after the transform.
first_row_raw = X.iloc[0].values
print("Before Scaling:", first_row_raw)
print("After Scaling: ", X_scaled[0])


Before Scaling: [ 52.   1.   0. 125. 212.   0.   1. 168.   0.   1.   2.   2.   3.]
After Scaling:  [-0.26843658  0.66150409 -0.91575542 -0.37763552 -0.65933209 -0.41887792
  0.89125488  0.82132052 -0.71228712 -0.06088839  0.99543334  1.20922066
  1.08985168]


Step 4: Split into Train / Validation / Test Sets


In [None]:

from sklearn.model_selection import train_test_split

# First cut: hold out 20% of the rows, stratified on the labels.
X_train, X_temp, y_train, y_temp = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Second cut: halve the held-out 20% into validation and test (10% each).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

# Report the size of every partition.
for split_name, split_X, split_y in (
    ("Train:", X_train, y_train),
    ("Validation:", X_val, y_val),
    ("Test:", X_test, y_test),
):
    print(split_name, split_X.shape, split_y.shape)



Train: (820, 13) (820,)
Validation: (102, 13) (102,)
Test: (103, 13) (103,)


In [None]:
# Re-read the raw CSV from the /content directory. This rebinds `df`, so the
# frame no longer carries the fillna imputation applied earlier in the notebook.
df=pd.read_csv('/content/heart.csv')

In [None]:
# Print column dtypes, non-null counts and memory usage for the reloaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [None]:
# Count missing values in each column (shown as the cell's output).
df.isnull().sum()

Unnamed: 0,0
age,0
sex,0
cp,0
trestbps,0
chol,0
fbs,0
restecg,0
thalach,0
exang,0
oldpeak,0
