In [4]:
import pandas as pd

In [6]:
df = pd.read_csv('heartdisease.csv')

In [7]:
df

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,3,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,4,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,5,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,299,45,1,typical,110,264,0,0,132,0,1.2,2,0.0,reversable,Yes
299,300,68,1,asymptomatic,144,193,1,0,141,0,3.4,2,2.0,reversable,Yes
300,301,57,1,asymptomatic,130,131,0,0,115,1,1.2,2,1.0,reversable,Yes
301,302,57,0,nontypical,130,236,0,2,174,0,0.0,2,1.0,normal,Yes


In [8]:
#Data Cleaning
print("Missing Values",df.isnull().sum())

df.drop_duplicates(inplace=True)
print("Duplicate Values",df.duplicated().sum())

df['Ca'].fillna(df['Ca'].mean(),inplace=True)
df['Thal'].fillna(df['Thal'].mode()[0],inplace=True)

print("Data Shape after Cleaning:", df.shape)

Missing Values Unnamed: 0    0
Age           0
Sex           0
ChestPain     0
RestBP        0
Chol          0
Fbs           0
RestECG       0
MaxHR         0
ExAng         0
Oldpeak       0
Slope         0
Ca            4
Thal          2
AHD           0
dtype: int64
Duplicate Values 0
Data Shape after Cleaning: (303, 15)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Ca'].fillna(df['Ca'].mean(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Thal'].fillna(df['Thal'].mode()[0],inplace=True)


In [9]:
print(df.columns)
print(df.dtypes)

Index(['Unnamed: 0', 'Age', 'Sex', 'ChestPain', 'RestBP', 'Chol', 'Fbs',
       'RestECG', 'MaxHR', 'ExAng', 'Oldpeak', 'Slope', 'Ca', 'Thal', 'AHD'],
      dtype='object')
Unnamed: 0      int64
Age             int64
Sex             int64
ChestPain      object
RestBP          int64
Chol            int64
Fbs             int64
RestECG         int64
MaxHR           int64
ExAng           int64
Oldpeak       float64
Slope           int64
Ca            float64
Thal           object
AHD            object
dtype: object


In [10]:
#Data Integration
import pandas as pd

subset_df = df[['Age', 'Sex', 'ChestPain', 'AHD']].head(10)
additional_df = pd.DataFrame({
    'Patient_ID': range(1, 11),
    'BMI': [24.5, 27.8, 22.1, 30.2, 28.4, 26.7, 23.9, 25.3, 29.1, 24.8],
    'Smoking_History': ['No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes']
})

print(subset_df)
print(additional_df)

   Age  Sex     ChestPain  AHD
0   63    1       typical   No
1   67    1  asymptomatic  Yes
2   67    1  asymptomatic  Yes
3   37    1    nonanginal   No
4   41    0    nontypical   No
5   56    1    nontypical   No
6   62    0  asymptomatic  Yes
7   57    0  asymptomatic   No
8   63    1  asymptomatic  Yes
9   53    1  asymptomatic  Yes
   Patient_ID   BMI Smoking_History
0           1  24.5              No
1           2  27.8             Yes
2           3  22.1              No
3           4  30.2             Yes
4           5  28.4              No
5           6  26.7             Yes
6           7  23.9              No
7           8  25.3             Yes
8           9  29.1              No
9          10  24.8             Yes


In [11]:
subset_df['Patient_ID'] = range(1, 11)
merge_df=pd.merge(subset_df,additional_df,on='Patient_ID',how='inner')
print(merge_df)


   Age  Sex     ChestPain  AHD  Patient_ID   BMI Smoking_History
0   63    1       typical   No           1  24.5              No
1   67    1  asymptomatic  Yes           2  27.8             Yes
2   67    1  asymptomatic  Yes           3  22.1              No
3   37    1    nonanginal   No           4  30.2             Yes
4   41    0    nontypical   No           5  28.4              No
5   56    1    nontypical   No           6  26.7             Yes
6   62    0  asymptomatic  Yes           7  23.9              No
7   57    0  asymptomatic   No           8  25.3             Yes
8   63    1  asymptomatic  Yes           9  29.1              No
9   53    1  asymptomatic  Yes          10  24.8             Yes


In [12]:
#Data Transformation
import pandas as pd
from sklearn.preprocessing import StandardScaler

df= df.drop('Unnamed: 0', axis=1)

categorical_cols = ['ChestPain', 'Thal']

df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print("Shape after Encoding:", df_encoded.shape)
print(df_encoded.head())

Shape after Encoding: (303, 17)
   Age  Sex  RestBP  Chol  Fbs  RestECG  MaxHR  ExAng  Oldpeak  Slope   Ca  \
0   63    1     145   233    1        2    150      0      2.3      3  0.0   
1   67    1     160   286    0        2    108      1      1.5      2  3.0   
2   67    1     120   229    0        2    129      1      2.6      2  2.0   
3   37    1     130   250    0        0    187      0      3.5      3  0.0   
4   41    0     130   204    0        2    172      0      1.4      1  0.0   

   AHD  ChestPain_nonanginal  ChestPain_nontypical  ChestPain_typical  \
0   No                 False                 False               True   
1  Yes                 False                 False              False   
2  Yes                 False                 False              False   
3   No                  True                 False              False   
4   No                 False                  True              False   

   Thal_normal  Thal_reversable  
0        False            

In [13]:
df_encoded['AHD'] = df_encoded['AHD'].map({'Yes': 1, 'No': 0})

X = df_encoded.drop('AHD', axis=1)
y = df_encoded['AHD']

In [14]:

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

print("Data after Transformation:\n", X_scaled_df.head())

Data after Transformation:
         Age       Sex    RestBP      Chol       Fbs   RestECG     MaxHR  \
0  0.948726  0.686202  0.757525 -0.264900  2.394438  1.016684  0.017197   
1  1.392002  0.686202  1.611220  0.760415 -0.417635  1.016684 -1.821905   
2  1.392002  0.686202 -0.665300 -0.342283 -0.417635  1.016684 -0.902354   
3 -1.932564  0.686202 -0.096170  0.063974 -0.417635 -0.996749  1.637359   
4 -1.489288 -1.457296 -0.096170 -0.825922 -0.417635  1.016684  0.980537   

      ExAng   Oldpeak     Slope        Ca  ChestPain_nonanginal  \
0 -0.696631  1.087338  2.274579 -0.723095             -0.629534   
1  1.435481  0.397182  0.649113  2.503851             -0.629534   
2  1.435481  1.346147  0.649113  1.428203             -0.629534   
3 -0.696631  2.122573  2.274579 -0.723095              1.588476   
4 -0.696631  0.310912 -0.976352 -0.723095             -0.629534   

   ChestPain_nontypical  ChestPain_typical  Thal_normal  Thal_reversable  
0             -0.444554           3.489114 

In [15]:
#Error Correction
from scipy import stats
import numpy as np

z_scores = np.abs(stats.zscore(X_scaled_df))

threshold = 3
outlier_indices = np.where(z_scores > threshold)

X_scaled_df_cleaned = X_scaled_df[(z_scores < threshold).all(axis=1)]
y_cleaned = y[(z_scores < threshold).all(axis=1)]

print("Shape after Error Correction:", X_scaled_df_cleaned.shape)

Shape after Error Correction: (271, 16)


In [16]:
#DataModelBuilding

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix


X_train, X_test, y_train, y_test = train_test_split(X_scaled_df_cleaned, y_cleaned, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("Confusion Matrix:\n", conf_matrix)

Model Accuracy: 81.82%
Confusion Matrix:
 [[34  5]
 [ 5 11]]


In [17]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X_scaled_df_cleaned, y_cleaned, cv=5)
print("Cross-validation accuracy scores:", scores)


Cross-validation accuracy scores: [0.81818182 0.88888889 0.81481481 0.83333333 0.77777778]
