In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [52]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

In [16]:
df=pd.read_csv('/content/heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [18]:
df.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [19]:
df.columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')

In [20]:
df.shape

(918, 12)

In [21]:
from sklearn.preprocessing import LabelEncoder

In [22]:
le=LabelEncoder()
df['Sex']=le.fit_transform(df['Sex'])
df['ChestPainType']=le.fit_transform(df['ChestPainType'])

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    int64  
 2   ChestPainType   918 non-null    int64  
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(8), object(3)
memory usage: 86.2+ KB


In [24]:
df['RestingECG']=le.fit_transform(df['ChestPainType'])
df['ExerciseAngina']=le.fit_transform(df['ExerciseAngina'])
df['ST_Slope']=le.fit_transform(df['ST_Slope'])


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    int64  
 2   ChestPainType   918 non-null    int64  
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    int64  
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    int64  
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    int64  
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(11)
memory usage: 86.2 KB


In [26]:
df['Oldpeak']=df['Oldpeak'].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   Age             918 non-null    int64
 1   Sex             918 non-null    int64
 2   ChestPainType   918 non-null    int64
 3   RestingBP       918 non-null    int64
 4   Cholesterol     918 non-null    int64
 5   FastingBS       918 non-null    int64
 6   RestingECG      918 non-null    int64
 7   MaxHR           918 non-null    int64
 8   ExerciseAngina  918 non-null    int64
 9   Oldpeak         918 non-null    int64
 10  ST_Slope        918 non-null    int64
 11  HeartDisease    918 non-null    int64
dtypes: int64(12)
memory usage: 86.2 KB


In [27]:
X=df.drop('HeartDisease',axis=1)
y=df['HeartDisease']

In [30]:
X.shape

(918, 11)

In [34]:
y

0      0
1      1
2      0
3      1
4      0
      ..
913    1
914    1
915    1
916    1
917    0
Name: HeartDisease, Length: 918, dtype: int64

In [35]:
#cross val score

In [48]:
models = [
    LogisticRegression(max_iter=1000),      # Logistic Regression classifier
    SVC(),                     # Support Vector Classifier
    LinearRegression(),        # Linear Regression model
    DecisionTreeClassifier(),  # Decision Tree classifier
    RandomForestClassifier(),  # Random Forest classifier
    GaussianNB()               # Gaussian Naive Bayes classifier
]

In [65]:
def compare_models_cross_val(models, X, y):
    for model in models:
        print("Model:", model.__class__.__name__)
        cv_score = cross_val_score(model, X, y, cv=5)
        mean_accuracy = cv_score.mean() * 100
        mean_accuracy = round(mean_accuracy, 2)
        print("Cross-validation scores:", cv_score)
        print("Mean accuracy:", mean_accuracy, "%")
        print("----------------------------------------")


In [66]:
# Example usage:
compare_models_cross_val(models, X, y)

Model: LogisticRegression
Cross-validation scores: [0.82065217 0.81521739 0.82065217 0.83606557 0.74863388]
Mean accuracy: 80.82 %
----------------------------------------
Model: SVC
Cross-validation scores: [0.61413043 0.78804348 0.69021739 0.71584699 0.61748634]
Mean accuracy: 68.51 %
----------------------------------------
Model: LinearRegression
Cross-validation scores: [0.48611803 0.54028    0.31603936 0.32796628 0.37821494]
Mean accuracy: 40.97 %
----------------------------------------
Model: DecisionTreeClassifier
Cross-validation scores: [0.75       0.75       0.76086957 0.72131148 0.67213115]
Mean accuracy: 73.09 %
----------------------------------------
Model: RandomForestClassifier
Cross-validation scores: [0.88586957 0.80978261 0.83695652 0.83060109 0.75409836]
Mean accuracy: 82.35 %
----------------------------------------
Model: GaussianNB
Cross-validation scores: [0.82065217 0.90217391 0.79891304 0.84699454 0.7704918 ]
Mean accuracy: 82.78 %
--------------------------

In [67]:
rf_classifier = RandomForestClassifier()

# Define hyperparameters grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Grid search with cross-validation
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X, y)

# Best parameters and score
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters found: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Best cross-validation score: 0.8354359705393206


In [69]:
best_rfc = RandomForestClassifier(
    max_depth=None,
    min_samples_leaf=4,
    min_samples_split=10,
    n_estimators=300
)

# Perform cross-validation
cv_scores = cross_val_score(best_rfc, X, y, cv=5)

# Print cross-validation scores
print("Cross-validation scores:", cv_scores)

# Calculate and print mean accuracy
mean_accuracy = cv_scores.mean() * 100
print("Mean accuracy:", round(mean_accuracy, 2), "%")

Cross-validation scores: [0.89130435 0.85869565 0.8423913  0.83060109 0.75956284]
Mean accuracy: 83.65 %


In [70]:
#train test split

In [71]:
from sklearn.model_selection import train_test_split

In [72]:
X_train, X_test, y_train, y_test =train_test_split(X,y,test_size=0.2,random_state=42)

In [78]:
len(X_train)

734

In [79]:
len(X_test)

184

In [80]:
len(y_train)

734

In [81]:
len(y_test)

184

In [82]:
models = [
    LogisticRegression(max_iter=1000),      # Logistic Regression classifier
    SVC(),                     # Support Vector Classifier
    LinearRegression(),        # Linear Regression model
    DecisionTreeClassifier(),  # Decision Tree classifier
    RandomForestClassifier(),  # Random Forest classifier
    GaussianNB()               # Gaussian Naive Bayes classifier
]

In [83]:
def compare_models():
    for model in models:
        model.fit(X_train,y_train)
        print("model name :",model)
        print("train score :",model.score(X_train,y_train))
        print("test score :",model.score(X_test,y_test))
        print("----------------------------------------------------------------------------------------------")




In [84]:
compare_models()

model name : LogisticRegression(max_iter=1000)
train score : 0.8610354223433242
test score : 0.8532608695652174
----------------------------------------------------------------------------------------------
model name : SVC()
train score : 0.7329700272479565
test score : 0.6847826086956522
----------------------------------------------------------------------------------------------
model name : LinearRegression()
train score : 0.5250494867817035
test score : 0.4405303531721336
----------------------------------------------------------------------------------------------
model name : DecisionTreeClassifier()
train score : 1.0
test score : 0.782608695652174
----------------------------------------------------------------------------------------------
model name : RandomForestClassifier()
train score : 1.0
test score : 0.8695652173913043
----------------------------------------------------------------------------------------------
model name : GaussianNB()
train score : 0.862397820163487

In [85]:
#best is rfc

In [86]:
rf_classifier = RandomForestClassifier()

# Define hyperparameters grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Grid search with cross-validation
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Evaluate on test set
best_rf_classifier = grid_search.best_estimator_
test_accuracy = best_rf_classifier.score(X_test, y_test)
print("Test set accuracy:", test_accuracy)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters found: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Best cross-validation score: 0.8678035597800765
Test set accuracy: 0.8804347826086957


In [87]:
best_rf_classifier = RandomForestClassifier(max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=300)

# Fit the classifier to the training data
best_rf_classifier.fit(X_train, y_train)

In [90]:
test_accuracy = best_rf_classifier.score(X_test, y_test)
print("Test set accuracy:", test_accuracy)

Test set accuracy: 0.8967391304347826


In [91]:
train_accuracy = best_rf_classifier.score(X_train, y_train)
print("Test set accuracy:", train_accuracy)

Test set accuracy: 0.9673024523160763


In [None]:
#use rfc as it is best model for it