In [3]:
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport

df = pd.read_csv("data.csv")
profile = ProfileReport(df, title="My Data Report")



In [4]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

np.int64(0)

In [5]:
print("Columns in dataframe:")
print(df.columns.tolist())
print(f"\nShape: {df.shape}")
print(f"\nFirst few rows:")

Columns in dataframe:
['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num       ']

Shape: (293, 14)

First few rows:


In [6]:
df.columns = df.columns.str.strip()

# Replace '?' with NaN across entire dataframe
df = df.replace('?', 0)

# Convert numeric columns to proper numeric type
numeric_cols = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 
                'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']

for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Fill missing values with median or 0
df = df.fillna(0)

target_name = "num"

x = df.drop(columns="num")
y = df[target_name]
x.head()
df["oldpeak"].value_counts()

oldpeak
0.0    188
1.0     41
2.0     31
1.5     16
3.0      9
2.5      3
0.5      2
0.8      1
4.0      1
5.0      1
Name: count, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import  f1_score, precision_score, recall_score, accuracy_score, confusion_matrix
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
print("accuracy_score:",accuracy_score(y_test,y_pred))
print("\n F1:\n",f1_score(y_test,y_pred))
print("\n Precision score is:\n",precision_score(y_test,y_pred))
print("\n Recall score is:\n",recall_score(y_test,y_pred))
print( "confusion matrix is:\n",confusion_matrix(y_test,y_pred))

accuracy_score: 0.6610169491525424

 F1:
 0.4117647058823529

 Precision score is:
 0.4375

 Recall score is:
 0.3888888888888889
confusion matrix is:
 [[32  9]
 [11  7]]


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

# Define the model
lr = LogisticRegression()

# Define the parameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
    'penalty': ['l1', 'l2'],
    'max_iter': [100, 200, 300],

}

# Apply GridSearchCV
grid_search = GridSearchCV(
    estimator=lr,
    param_grid=param_grid,
    cv=5,                # 5-fold cross-validation
    scoring='accuracy',  # You can also use 'f1', 'precision', 'recall'
    n_jobs=-1            # Use all cores for faster computation
)

# Fit the model
grid_search.fit(x_train, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)

# Best estimator
best_lr = grid_search.best_estimator_

# Predictions with best model
lr_pred = best_lr.predict(x_test)

# Evaluation metrics
print("accuracy_score:", accuracy_score(y_test, lr_pred))
print("\nF1:", f1_score(y_test, lr_pred))
print("\nPrecision score:", precision_score(y_test, lr_pred))
print("\nRecall score:", recall_score(y_test, lr_pred))
print( "confusion matrix is:\n",confusion_matrix(y_test,lr_pred))

Best Parameters: {'C': 10, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
accuracy_score: 0.7966101694915254

F1: 0.7272727272727273

Precision score: 0.6153846153846154

Recall score: 0.8888888888888888
confusion matrix is:
 [[31 10]
 [ 2 16]]




## We observe :
### The accuracy of the modal is 72% 
### The confusion matrix mostly predict correctly
