In [6]:
import pandas as pd
import numpy as np
from scipy import stats



In [7]:
df=pd.read_csv("heart.csv")
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [8]:
numerical_df = df.select_dtypes(include=[np.number])
categorical_df = df.select_dtypes(exclude=[np.number])
z_scores = np.abs(stats.zscore(numerical_df))

# Set the threshold
threshold = 3

# Filter out the outliers
filtered_entries = (z_scores < threshold).all(axis=1)
numerical_df_filtered = numerical_df[filtered_entries]

# Combine the filtered numerical data with the categorical data
df_filtered = pd.concat([numerical_df_filtered, categorical_df[filtered_entries]], axis=1)

print(df_filtered)


     Age  RestingBP  Cholesterol  FastingBS  MaxHR  Oldpeak  HeartDisease Sex  \
0     40        140          289          0    172      0.0             0   M   
1     49        160          180          0    156      1.0             1   F   
2     37        130          283          0     98      0.0             0   M   
3     48        138          214          0    108      1.5             1   F   
4     54        150          195          0    122      0.0             0   M   
..   ...        ...          ...        ...    ...      ...           ...  ..   
913   45        110          264          0    132      1.2             1   M   
914   68        144          193          1    141      3.4             1   M   
915   57        130          131          0    115      1.2             1   M   
916   57        130          236          0    174      0.0             1   F   
917   38        138          175          0    173      0.0             0   M   

    ChestPainType RestingEC

In [9]:
#since the below features are ordinal we can apply label encoder or just do what we did below
df1 = df.copy()
df1.ExerciseAngina.replace(
    {
        'N': 0,
        'Y': 1
    },
    inplace=True)

df1.ST_Slope.replace(
    {
        'Down': 1,
        'Flat': 2,
        'Up': 3
    },
    inplace=True
)

df1.RestingECG.replace(
    {
        'Normal': 1,
        'ST': 2,
        'LVH': 3
    },
    inplace=True)

df1.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,1,172,0,0.0,3,0
1,49,F,NAP,160,180,0,1,156,0,1.0,2,1
2,37,M,ATA,130,283,0,2,98,0,0.0,3,0
3,48,F,ASY,138,214,0,1,108,1,1.5,2,1
4,54,M,NAP,150,195,0,1,122,0,0.0,3,0


In [10]:
#now sex and chestpain are nominal we apply
df3=pd.get_dummies(df1,dtype=int,drop_first=True)
df3

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,40,140,289,0,1,172,0,0.0,3,0,1,1,0,0
1,49,160,180,0,1,156,0,1.0,2,1,0,0,1,0
2,37,130,283,0,2,98,0,0.0,3,0,1,1,0,0
3,48,138,214,0,1,108,1,1.5,2,1,0,0,0,0
4,54,150,195,0,1,122,0,0.0,3,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,1,132,0,1.2,2,1,1,0,0,1
914,68,144,193,1,1,141,0,3.4,2,1,1,0,0,0
915,57,130,131,0,1,115,1,1.2,2,1,1,0,0,0
916,57,130,236,0,3,174,0,0.0,2,1,0,1,0,0


In [11]:
from sklearn.preprocessing import MinMaxScaler
x=df3.drop('HeartDisease',axis=1)
y=df3.HeartDisease
scaler=MinMaxScaler()
x_scaled=scaler.fit_transform(x)
x_scaled

array([[0.24489796, 0.7       , 0.47927032, ..., 1.        , 0.        ,
        0.        ],
       [0.42857143, 0.8       , 0.29850746, ..., 0.        , 1.        ,
        0.        ],
       [0.18367347, 0.65      , 0.46932007, ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.59183673, 0.65      , 0.2172471 , ..., 0.        , 0.        ,
        0.        ],
       [0.59183673, 0.65      , 0.39137645, ..., 1.        , 0.        ,
        0.        ],
       [0.20408163, 0.69      , 0.29021559, ..., 0.        , 1.        ,
        0.        ]])

In [12]:
from sklearn.model_selection import train_test_split
train_x_scaled,test_x_scaled,train_y,test_y=train_test_split(x_scaled,y,test_size=0.3,random_state=56)

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
model_params={
    'logistic_regression':{
        'model':LogisticRegression(),
        'params':{
            'C':[1,5,10]
    }
},
    'RANDOM_FORREST':{
        'model':RandomForestClassifier(),
        'params':{
            'n_estimators':[1,5,10]
                 }
},

     'Decision_Trees':{
        'model':DecisionTreeClassifier(),
        'params':{
            'criterion':['gini','entropy']
                 }
}
}

In [16]:
from sklearn.model_selection import GridSearchCV
scores=[]
for model_name,para in model_params.items():
    clf=GridSearchCV(para['model'],para['params'])
    clf.fit(train_x_scaled,train_y)
    scores.append(
        {
         'model_name':model_name,
         'best_score': clf.best_score_,
         'best_params': clf.best_params_
        }
    )
di = pd.DataFrame(scores,columns=['model_name','best_score','best_params'])
di

Unnamed: 0,model_name,best_score,best_params
0,logistic_regression,0.85516,{'C': 5}
1,RANDOM_FORREST,0.847347,{'n_estimators': 10}
2,Decision_Trees,0.803731,{'criterion': 'gini'}


In [None]:
from pycaret.classification import *
# Initialize setup
clf1 = setup(df3, target='HeartDisease', silent=True, html=False)

# Compare models
best_model = compare_models(clf1)

# Display the best model
best_model

In [69]:
pip install --upgrade scikit-learn


Collecting scikit-learn
  Using cached scikit_learn-1.5.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (12 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.1-cp311-cp311-macosx_12_0_arm64.whl (11.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m142.7 kB/s[0m eta [36m0:00:00[0m00:01[0m00:03[0m
[?25hDownloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scikit-learn
  Attempting uninstall: threadpoolctl
    Found existing installation: threadpoolctl 2.2.0
    Uninstalling threadpoolctl-2.2.0:
      Successfully uninstalled threadpoolctl-2.2.0
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.4.2
    Uninstalling scikit-learn-1.4.2:
      Successfully uninstalled scikit-learn-1.4.2
[31mERROR: pip's dependency resolver does not currently take into account all t

In [18]:
from sklearn.decomposition import PCA

pca = PCA(0.95)
X_pca = pca.fit_transform(x)
X_pca

array([[ 92.31079119,  29.44316816],
       [-17.14409666,  13.7394731 ],
       [ 81.90733808, -38.22442746],
       ...,
       [-69.0041927 , -17.33216411],
       [ 39.2077941 ,  33.59642907],
       [-21.43805605,  37.21419015]])

In [19]:
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=30)

In [20]:
from sklearn.model_selection import GridSearchCV
scores=[]
for model_name,para in model_params.items():
    clf=GridSearchCV(para['model'],para['params'])
    clf.fit(X_train_pca,y_train)
    scores.append(
        {
         'model_name':model_name,
         'best_score': clf.best_score_,
         'best_params': clf.best_params_
        }
    )
di = pd.DataFrame(scores,columns=['model_name','best_score','best_params'])
di

Unnamed: 0,model_name,best_score,best_params
0,logistic_regression,0.683944,{'C': 1}
1,RANDOM_FORREST,0.666191,{'n_estimators': 10}
2,Decision_Trees,0.627994,{'criterion': 'gini'}


In [None]:
1 0 0
0 1 0
0 0 0