In [None]:
!pip install opendatasets

## Importing libs and downloading Dataset

In [17]:
import opendatasets as od
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier, BaggingClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,f1_score,ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import pandas as pd
import numpy as np

def train_test_split(X, y, test_size=0.2, random_state=None, shuffle: bool = True):
    """Split features and labels into train and test subsets.

    Works with pandas objects (DataFrame / Series) as well as NumPy arrays
    or anything ``np.asarray`` can convert. Pandas inputs are returned as
    pandas objects with a fresh 0..n-1 index; everything else is returned
    as NumPy arrays.

    Parameters
    ----------
    X : DataFrame or array-like
        Feature rows.
    y : Series or array-like
        Labels, one per row of ``X``.
    test_size : float
        Fraction of rows (between 0 and 1) placed in the test subset. The
        test row count is ``int(n * test_size)``, i.e. rounded down.
    random_state : int or None
        Seed handed to ``np.random.default_rng`` when shuffling.
    shuffle : bool
        When False the first ``int(n * test_size)`` rows become the test
        subset and the remainder the training subset.

    Returns
    -------
    X_train, X_test, y_train, y_test

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``test_size`` is outside [0, 1].
    """
    n = len(X)
    if len(y) != n:
        raise ValueError(
            f"X and y must have the same number of rows, got {n} and {len(y)}"
        )
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    indices = np.arange(n)
    if shuffle:
        rng = np.random.default_rng(random_state)
        rng.shuffle(indices)

    test_count = int(n * test_size)
    test_idx = indices[:test_count]
    train_idx = indices[test_count:]

    def _take(data, idx):
        # Pandas objects are selected positionally and re-indexed; anything
        # else (e.g. the array returned by a scaler) is indexed as ndarray.
        if hasattr(data, "iloc"):
            return data.iloc[idx].reset_index(drop=True)
        return np.asarray(data)[idx]

    return (
        _take(X, train_idx),
        _take(X, test_idx),
        _take(y, train_idx),
        _take(y, test_idx),
    )


In [None]:
# Download the Kaggle heart-disease dataset through opendatasets.
# NOTE(review): presumably prompts for Kaggle credentials and writes into
# ./heart-disease-dataset — confirm against the opendatasets docs.
od.download("https://www.kaggle.com/datasets/johnsmith88/heart-disease-dataset")

# EXPLORING DATASET

In [5]:
# Read the CSV relative to the working directory instead of the Colab-only
# absolute path "/content/...", so the notebook also runs outside Colab.
# NOTE(review): assumes the dataset folder sits in the current working directory — confirm.
df = pd.read_csv("heart-disease-dataset/heart.csv")

In [6]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [13]:
# Separate the feature columns from the label once, then standardise them.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so statistics of the test rows leak into the training features —
# consider fitting on the training split only.
features = df.drop("target", axis=1)
sc = StandardScaler()
X_scale = sc.fit_transform(features)
y = df["target"]

In [46]:
# Class balance of the label column.
df['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,526
0,499


In [14]:
# Hold out 33% of the rows for evaluation; random_state=42 fixes the shuffle.
# NOTE(review): X_scale is the output of sc.transform (presumably a NumPy
# array) — confirm the train_test_split in scope accepts arrays.
X_train, X_test, y_train, y_test = train_test_split(X_scale, y, test_size=0.33, random_state=42)

In [None]:
# Per-feature mean and standard deviation of the training split;
# gradient_descent and predict read these two globals to standardise their input.
X_mean = X_train.mean(axis=0)
X_std = X_train.std(axis=0)
# Replace zero deviations with 1 so the later division by X_std cannot divide by zero.
X_std[X_std == 0] = 1

# SUPERVISED TASK

## Function Definitions

In [None]:
def sigmoid(z):
  """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

  Accepts a scalar or array and applies the function element-wise.
  """
  # 1 / (1 + exp(-z)) == exp(-log(1 + exp(-z))). np.logaddexp(0, -z) computes
  # log(exp(0) + exp(-z)) without overflowing for large |z|, unlike np.exp(-z).
  return np.exp(-np.logaddexp(0, -z))

In [None]:
def gradient_descent(X,y,params,lr = 0.05,iter=1000):
  """Fit logistic-regression weights with batch gradient descent.

  X is standardised with the module-level X_mean / X_std and a leading
  column of ones is prepended, so params must hold one intercept weight
  followed by one weight per feature.

  Parameters:
    X      : feature matrix, one row per sample.
    y      : binary labels, one per row of X (Series, 1-D array or column).
    params : initial weights, shape (n_features + 1,) or (n_features + 1, 1).
    lr     : learning rate.
    iter   : number of gradient steps (name kept for caller compatibility,
             although it shadows the builtin inside this function).

  Returns:
    (params, cost_history): the fitted weights, in the same shape as the
    params passed in, and the cross-entropy cost recorded before each step.
  """
  X = (X-X_mean)/X_std
  X = np.c_[np.ones((X.shape[0], 1)), X]
  m = X.shape[0]

  params = np.asarray(params, dtype=float)
  # Give y the same rank as the predictions. Subtracting a 1-D y from an
  # (m, 1) prediction would broadcast to (m, m), or fail outright for a
  # pandas Series, instead of producing a per-sample error.
  y = np.asarray(y, dtype=float).reshape(-1)
  if params.ndim == 2:
    y = y.reshape(-1, 1)

  eps = 1e-9  # keeps log() finite when a prediction saturates at 0 or 1
  cost_history = []
  for i in range(iter):
    pred = sigmoid(X@params)
    error = pred-y
    grad = (X.T@(error))/m
    cost = (-1/m)*np.sum(y*np.log(pred+eps)+(1-y)*np.log(1-pred+eps))
    cost_history.append(cost)
    params = params - lr*(grad)
  return params,cost_history


In [None]:
def predict(W,X):
  """Predict 0/1 class labels with fitted logistic-regression weights.

  X is standardised with the module-level X_mean / X_std and a leading
  column of ones is prepended, mirroring gradient_descent.

  Parameters:
    W : weights, intercept first, as returned by gradient_descent.
    X : feature matrix, one row per sample.

  Returns:
    Integer array of 0/1 labels with the same shape as X @ W.
  """
  X = (X-X_mean)/X_std
  X = np.c_[np.ones((X.shape[0], 1)), X]
  logits = X@W
  # The decision boundary is probability 0.5, i.e. sigmoid(logit) >= 0.5,
  # which is equivalent to logit >= 0. Comparing the raw logit against 0.5
  # would shift the boundary and misclassify samples with logits in [0, 0.5).
  return (logits>=0).astype(int)

## Training Custom Logistic Regression

In [None]:
# Fit the hand-written model from an all-zero start: one weight per feature
# plus one for the intercept column that gradient_descent prepends.
W,costs = gradient_descent(X_train,y_train,np.zeros((X_train.shape[1]+1,1)),lr=0.05,iter=1000)

## Training Sklearn's Logistic Regression

In [None]:
# Flatten the labels to a 1-D NumPy array for scikit-learn. Going through
# np.asarray works for a pandas Series (which has no .reshape) as well as for
# arrays, and re-running the cell leaves the result unchanged.
y_train = np.asarray(y_train).ravel()

In [None]:
# Reference model: scikit-learn's LogisticRegression fitted on the same
# training split, with the iteration cap raised to 5000.
model  = LogisticRegression(max_iter=5000).fit(X_train,y_train)

In [None]:
# Class predictions of the scikit-learn model for the held-out rows.
y_pred = model.predict(X_test)

## Evaluation of Models

#### Custom

In [None]:
# Score the hand-written model on the held-out rows. The predictions are
# computed once and reused instead of re-running predict() for every metric.
# Printed in order: precision, accuracy, recall, F1, confusion matrix.
custom_pred = predict(W,X_test)
for metric in (precision_score,accuracy_score,recall_score,f1_score,confusion_matrix):
  print(metric(y_test,custom_pred))

#### Sklearn

In [None]:
# Score the scikit-learn model's predictions on the held-out rows.
# Printed in order: precision, accuracy, recall, F1, confusion matrix.
for score_fn in (precision_score,accuracy_score,recall_score,f1_score,confusion_matrix):
  print(score_fn(y_test,y_pred))

# UNSUPERVISED TASK

In [None]:
import numpy as np

def kmeans(X, k, max_iter=100, tol=1e-4, random_state=None):
    """Cluster the rows of X into k groups with Lloyd's algorithm.

    Parameters
    ----------
    X : array-like, 2-D
        One sample per row.
    k : int
        Number of clusters; the initial centroids are k distinct rows of X.
    max_iter : int
        Maximum number of centroid updates.
    tol : float
        Stop once the Frobenius norm of the centroid movement drops below it.
    random_state : int or None
        Seed handed to ``np.random.default_rng`` for the initial choice.

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        Index of the nearest returned centroid for each row.
    centroids : ndarray of shape (k, n_features)
    """
    rng = np.random.default_rng(random_state)
    X = np.asarray(X, dtype=float)
    centroids = X[rng.choice(len(X), k, replace=False)]

    def _nearest(points, centres):
        # Euclidean distance from every point to every centre, then the
        # index of the closest centre per point.
        distances = np.linalg.norm(points[:, None] - centres[None, :], axis=2)
        return distances.argmin(axis=1)

    # Assign once up front so labels are defined even when max_iter is 0.
    labels = _nearest(X, centroids)
    for _ in range(max_iter):
        new_centroids = centroids.copy()
        for i in range(k):
            members = X[labels == i]
            # A cluster can lose all its points (e.g. duplicate rows picked as
            # initial centroids). Keep its previous centroid rather than
            # taking the mean of an empty slice, which would yield NaN.
            if len(members):
                new_centroids[i] = members.mean(axis=0)

        shift = np.linalg.norm(new_centroids - centroids)
        centroids = new_centroids
        labels = _nearest(X, centroids)
        if shift < tol:
            break

    return labels, centroids


In [None]:
# Cluster the training features into 5 groups. The seed makes the random
# centroid initialisation, and therefore the displayed result, reproducible.
kmeans(X_train,5,random_state=42)