In [1]:
!pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [36]:
import pandas as pd
import numpy as np
import seaborn as sb
from matplotlib import pyplot as plt
import matplotlib.axes as ax
from matplotlib.animation import FuncAnimation

In [37]:
# Location of the raw Student_Performance.csv file on GitHub, read by the next cell.
url = "https://raw.githubusercontent.com/harshul997/Molecular-Drug-Discovery-using-Graphical-Neutral-Networks/refs/heads/main/Week1/Assignment1/Student_Performance.csv"

In [38]:
# Load the CSV at `url` into a DataFrame and preview its first rows.
df = pd.read_csv(url)
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [39]:
def resumetable(df):
    """Build a per-column summary of a DataFrame.

    Prints the shape of ``df`` and returns a new DataFrame with one row per
    column of ``df`` and the following columns:

    * ``Name``         - column label
    * ``dtypes``       - column dtype
    * ``Missing``      - number of null entries
    * ``Uniques``      - number of distinct non-null values
    * ``First Value``  - value found in the first row
    * ``Second Value`` - value found in the second row

    The first and second rows are selected by position, not by index label,
    so the function also works on frames whose index does not contain the
    labels 0 and 1 (for example after rows have been filtered out). If the
    frame has fewer rows than requested, the corresponding column is NaN.
    """
    print(f"Dataset Shape: {df.shape}")

    def _row_values(position):
        # Positional lookup; NaN placeholders when the row does not exist.
        if len(df) > position:
            return df.iloc[position].values
        return np.full(df.shape[1], np.nan)

    summary = pd.DataFrame({
        'Name': df.columns,
        'dtypes': df.dtypes.values,
        'Missing': df.isnull().sum().values,
        'Uniques': df.nunique().values,
        'First Value': _row_values(0),
        'Second Value': _row_values(1),
    })
    return summary

In [40]:
# Encode the Yes/No flag as 1/0.
# The dtype guard keeps this cell safe to re-run: passing an already-encoded
# (numeric) column through the Yes/No mapping would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['Extracurricular Activities']):
    df['Extracurricular Activities'] = df['Extracurricular Activities'].map({'Yes': 1, 'No': 0})
resumetable(df)

Dataset Shape: (10000, 6)


Unnamed: 0,Name,dtypes,Missing,Uniques,First Value,Second Value
0,Hours Studied,int64,0,9,7.0,4.0
1,Previous Scores,int64,0,60,99.0,82.0
2,Extracurricular Activities,int64,0,2,1.0,0.0
3,Sleep Hours,int64,0,6,9.0,4.0
4,Sample Question Papers Practiced,int64,0,10,1.0,2.0
5,Performance Index,float64,0,91,91.0,65.0


In [80]:
class LinearRegression:
  """Multivariate linear regression trained with batch gradient descent.

  The model is ``prediction = X @ m + c`` where ``m`` holds one weight per
  feature and ``c`` is a scalar intercept. After ``train`` has run, both are
  stored in ``self.parameters`` under the keys ``"m"`` and ``"c"``.
  """

  def __init__(self):
    self.parameters = {}
    # Per-iteration RMSE recorded by the most recent call to ``train``.
    self.loss = []

  def forward_propagation(self, train_input):
    """Return ``train_input @ m + c`` for an (n_samples, n_features) array."""
    m = self.parameters["m"]
    c = self.parameters["c"]
    predictions = np.dot(train_input, m) + c  # matrix-vector product
    return predictions

  def cost_function(self, predictions, train_output):
    """Return the root-mean-squared error between targets and predictions."""
    cost = np.sqrt(np.mean((train_output - predictions) ** 2))  # RMSE
    return cost

  def backward_propagation(self, train_input, train_output, predictions):
    """Return the gradients used for the parameter update.

    The gradients are those of ``0.5 * mean(errors ** 2)`` with respect to
    ``m`` and ``c``. The reported cost is the RMSE instead; both quantities
    are minimised by the same parameters.

    Returns:
      dict with ``"dm"`` (shape ``(n_features,)``) and ``"dc"`` (scalar).
    """
    errors = predictions - train_output
    m_deriv = np.dot(train_input.T, errors) / train_input.shape[0]
    c_deriv = np.mean(errors)
    derivatives = {
        "dm": m_deriv,  # Shape: (n_features,)
        "dc": c_deriv   # Scalar
    }
    return derivatives

  def update_parameters(self, derivatives, learning_rate):
    """Take one gradient-descent step on ``m`` and ``c``."""
    self.parameters["m"] = self.parameters["m"] - learning_rate * derivatives["dm"]
    self.parameters["c"] = self.parameters["c"] - learning_rate * derivatives["dc"]

  def predict(self, inputs):
    """Return predictions for ``inputs`` using the trained parameters.

    A 1-D ``inputs`` array is treated as a single feature column.
    """
    inputs = np.asarray(inputs, dtype=float)
    if inputs.ndim == 1:
      inputs = inputs.reshape(-1, 1)
    return self.forward_propagation(inputs)

  def train(self, train_input, train_output, learning_rate, iters, verbose=True, seed=None):
    """Fit the model and return ``(parameters, loss)``.

    Args:
      train_input: array-like of shape (n_samples, n_features); a 1-D array
        is treated as a single feature column.
      train_output: array-like with n_samples targets. It is flattened, so
        both ``(n,)`` and ``(n, 1)`` are accepted. Without flattening, a
        column vector would broadcast against the ``(n,)`` predictions into
        an ``(n, n)`` error matrix and silently corrupt the fit.
      learning_rate: gradient-descent step size.
      iters: number of gradient-descent iterations.
      verbose: print the loss after every iteration (default True).
      seed: optional seed for the random initialisation. When None the
        global NumPy random state is used.

    Returns:
      ``(self.parameters, self.loss)`` where ``loss`` lists the RMSE measured
      before each update.

    Raises:
      ValueError: if the number of samples and targets differ.
    """
    train_input = np.asarray(train_input, dtype=float)
    train_output = np.asarray(train_output, dtype=float).ravel()
    if train_input.ndim == 1:
      train_input = train_input.reshape(-1, 1)
    if train_input.shape[0] != train_output.shape[0]:
      raise ValueError(
          "train_input has {} samples but train_output has {}".format(
              train_input.shape[0], train_output.shape[0]))

    n_features = train_input.shape[1]

    # Initialize random parameters; a dedicated generator is used only when a
    # seed is given so that unseeded callers keep using the global state.
    rng = np.random if seed is None else np.random.RandomState(seed)
    self.parameters['m'] = rng.uniform(-1, 1, size=(n_features,))
    self.parameters['c'] = rng.uniform(-1, 1)

    # Initialize loss history
    self.loss = []

    for i in range(iters):
      # Forward propagation
      predictions = self.forward_propagation(train_input)

      # Cost of the current parameters (before this iteration's update)
      cost = self.cost_function(predictions, train_output)

      # Backward propagation
      derivatives = self.backward_propagation(train_input, train_output, predictions)

      # Update parameters
      self.update_parameters(derivatives, learning_rate)

      # Append loss and optionally print
      self.loss.append(cost)
      if verbose:
        print("Iteration = {}, Loss = {}".format(i+1, cost))

    return self.parameters, self.loss

In [97]:
# Fit the hand-written LinearRegression on the student-performance data.
linear_reg = LinearRegression()
# Feature columns used as model inputs; "Performance Index" is the target.
inputs = ['Hours Studied', 'Previous Scores', 'Extracurricular Activities',
       'Sleep Hours', 'Sample Question Papers Practiced']


# Positional arguments: learning_rate=0.0001, iters=20.
parameters, loss = linear_reg.train(df[inputs].values, df["Performance Index"].values, 0.0001, 20)

Iteration = 1, Loss = 123.79996784447388
Iteration = 2, Loss = 59.75577804149345
Iteration = 3, Loss = 29.520325209115768
Iteration = 4, Loss = 15.862489956274178
Iteration = 5, Loss = 10.45294163256802
Iteration = 6, Loss = 8.74898818060757
Iteration = 7, Loss = 8.30659306250104
Iteration = 8, Loss = 8.199665152070745
Iteration = 9, Loss = 8.172932024190866
Iteration = 10, Loss = 8.164798559043112
Iteration = 11, Loss = 8.160948805386601
Iteration = 12, Loss = 8.158085533144044
Iteration = 13, Loss = 8.155450931420512
Iteration = 14, Loss = 8.152870914054041
Iteration = 15, Loss = 8.150305492634342
Iteration = 16, Loss = 8.147745480472551
Iteration = 17, Loss = 8.145188765688241
Iteration = 18, Loss = 8.14263486139342
Iteration = 19, Loss = 8.140083653940064
Iteration = 20, Loss = 8.137535115409351


In [100]:
from scipy.stats import zscore

# Restrict to numeric columns (this selection is not limited to `inputs`,
# so any numeric target column in `df` is screened as well).
numeric_df = df.select_dtypes(include=[np.number])

# Absolute z-score of every value, computed per column
z_scores = np.abs(zscore(numeric_df))

# Keep only rows whose |z| is below the threshold in every numeric column
threshold = 3
df_cleaned = df[(z_scores < threshold).all(axis=1)]

# Retrain on the filtered rows with the same hyper-parameters as before.
# NOTE(review): LinearRegression.train draws its initial parameters from
# np.random without a seed, so the loss values of separate runs start from
# different points and are not directly comparable.
linear_reg = LinearRegression()
parameters, loss = linear_reg.train(df_cleaned[inputs].values, df_cleaned["Performance Index"].values, 0.0001, 20)

Iteration = 1, Loss = 13.500289301013208
Iteration = 2, Loss = 9.486990494617567
Iteration = 3, Loss = 8.294275817661692
Iteration = 4, Loss = 7.993774427776526
Iteration = 5, Loss = 7.921594957597469
Iteration = 6, Loss = 7.9033555243698554
Iteration = 7, Loss = 7.897589434773956
Iteration = 8, Loss = 7.894694174574189
Iteration = 9, Loss = 7.892460093238741
Iteration = 10, Loss = 7.890379468137702
Iteration = 11, Loss = 7.8883356752955995
Iteration = 12, Loss = 7.886301926888189
Iteration = 13, Loss = 7.884272068665726
Iteration = 14, Loss = 7.8822446853963815
Iteration = 15, Loss = 7.880219450278574
Iteration = 16, Loss = 7.878196286496388
Iteration = 17, Loss = 7.876175174651069
Iteration = 18, Loss = 7.874156108533151
Iteration = 19, Loss = 7.872139084964504
Iteration = 20, Loss = 7.870124101465317


In [101]:
# NOTE(review): opendatasets is already installed by the first cell of the
# notebook; this install line repeats it.
!pip install opendatasets
import opendatasets as od
# Fetch the Kaggle heart-disease dataset with opendatasets.
od.download('https://www.kaggle.com/datasets/dileep070/heart-disease-prediction-using-logistic-regression/data')

Skipping, found downloaded files in "./heart-disease-prediction-using-logistic-regression" (use force=True to force download)


In [102]:
# Read the Framingham CSV from the folder created by od.download().
# A relative path is used because the download is stored under the current
# working directory, which is only /content when running on Colab.
df1 = pd.read_csv("heart-disease-prediction-using-logistic-regression/framingham.csv")
df1.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [103]:
# Summarise dtype, missing count and number of unique values for each column of df1.
resumetable(df1)

Dataset Shape: (4238, 16)


Unnamed: 0,Name,dtypes,Missing,Uniques,First Value,Second Value
0,male,int64,0,2,1.0,0.0
1,age,int64,0,39,39.0,46.0
2,education,float64,105,4,4.0,2.0
3,currentSmoker,int64,0,2,0.0,0.0
4,cigsPerDay,float64,29,33,0.0,0.0
5,BPMeds,float64,53,2,0.0,0.0
6,prevalentStroke,int64,0,2,0.0,0.0
7,prevalentHyp,int64,0,2,0.0,0.0
8,diabetes,int64,0,2,0.0,0.0
9,totChol,float64,50,248,195.0,250.0


In [113]:
# NOTE(review): `stats` is not referenced in this cell; the skewness below
# comes from pandas' DataFrame.skew().
import scipy.stats as stats
# Per-column skewness, used to choose between mean and median imputation.
print(df1.skew())

male                0.286135
age                 0.228146
education           0.690033
currentSmoker       0.023606
cigsPerDay          1.247910
BPMeds              5.550010
prevalentStroke    12.909062
prevalentHyp        0.819278
diabetes            5.994378
totChol             0.871422
sysBP               1.145362
diaBP               0.714102
BMI                 0.981974
heartRate           0.644482
glucose             6.213402
TenYearCHD          1.939741
dtype: float64


If skewness is close to 0, the data is roughly symmetric, and missing values can be imputed with the mean.
If skewness is large, especially if it's > 1 or < -1, the data is skewed, and the median is the better choice for imputation.

In [114]:
# Replace every missing value with the median of its column; df1 itself is left unchanged.
df2 = df1.fillna(df1.median())

In [140]:
class LogisticRegression:
    """Binary logistic regression trained with batch gradient descent.

    Data layout used throughout this class:

    * ``X`` has shape ``(n_features, n_samples)`` (one column per sample)
    * ``Y`` holds ``n_samples`` labels in {0, 1}; it is handled as a
      ``(1, n_samples)`` row vector
    * ``w`` has shape ``(n_features, 1)`` and ``b`` is a scalar
    """

    def __init__(self, learning_rate = 1, num_iterations = 2000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.w = []
        self.b = 0

    def initialize_weight(self, dim):
        """Return zero-initialised weights of shape ``(dim, 1)`` and a zero bias."""
        w = np.zeros((dim, 1))
        b = 0
        return w, b

    def sigmoid(self, z):
        """Element-wise logistic function ``1 / (1 + exp(-z))``."""
        s = 1/(1 + np.exp(-z))
        return s

    def hypothesis(self, w, X, b):
        """Return predicted probabilities with shape ``(1, n_samples)``.

        The result is kept as a row vector so that it lines up with ``Y`` in
        ``cal_gradient`` and with the ``H[0, i]`` indexing convention used by
        ``predict``.
        """
        H = self.sigmoid(np.dot(w.T, X) + b)
        return H

    def cost(self, H, Y, m):
        """Return the mean binary cross-entropy over ``m`` samples."""
        epsilon = 1e-15
        # Clip so that log(0) cannot produce inf/nan for saturated outputs.
        H = np.clip(np.asarray(H, dtype=float).flatten(), epsilon, 1 - epsilon)
        Y = np.asarray(Y).flatten()
        cost = -np.sum(Y*np.log(H) + (1-Y)*np.log(1-H))/m
        cost = np.squeeze(cost)
        return cost

    def cal_gradient(self, w, H, X, Y):
        """Return ``{"dw": (n_features, 1), "db": scalar}`` for the cross-entropy cost."""
        m = X.shape[1]
        residual = np.asarray(H).reshape(1, -1) - np.asarray(Y).reshape(1, -1)
        dw = np.dot(X, residual.T)/m
        db = np.sum(residual)/m
        grads = {"dw": dw,
                 "db": db}
        return grads

    def gradient_position(self, w, b, X, Y):
        """Return ``(grads, cost)`` evaluated at the current ``w`` and ``b``."""
        m = X.shape[1]
        H = self.hypothesis(w, X, b)           # compute activation
        cost = self.cost(H, Y, m)              # compute cost
        grads = self.cal_gradient(w, H, X, Y)  # compute gradient
        return grads, cost

    def gradient_descent(self, w, b, X, Y, print_cost = False):
        """Run ``self.num_iterations`` gradient-descent updates.

        The cost is recorded every 100 iterations (and printed as well when
        ``print_cost`` is true).

        Returns:
            ``(params, grads, costs)`` where ``params`` holds the final
            ``w``/``b``, ``grads`` the last gradients and ``costs`` the
            recorded cost values.
        """
        costs = []
        # Defined up front so the returned grads exist even with zero iterations.
        dw = np.zeros_like(w, dtype=float)
        db = 0.0

        for i in range(self.num_iterations):
            # Cost and gradient calculation
            grads, cost = self.gradient_position(w, b, X, Y)

            # Retrieve derivatives from grads
            dw = grads['dw']
            db = grads['db']

            # Update rule -- must run on every iteration, inside the loop
            w = w - (self.learning_rate * dw)
            b = b - (self.learning_rate * db)

            # Record (and optionally print) the cost every 100 iterations
            if i % 100 == 0:
                costs.append(cost)
                if print_cost:
                    print ('Cost after iteration %i: %f' %(i, cost))

        params = {'w': w,
                  'b': b}

        grads = {'dw': dw,
                 'db': db}

        return params, grads, costs

    def predict(self, X):
        """Return 0/1 predictions of shape ``(1, n_samples)`` for ``X``.

        A sample is assigned class 1 when its predicted probability is at
        least 0.5.
        """
        X = np.asarray(X)
        w = np.asarray(self.w).reshape(X.shape[0], 1)
        b = self.b

        # Compute vector "H" of probabilities, then threshold it
        H = self.hypothesis(w, X, b)
        Y_prediction = (H >= 0.5).astype(float)
        return Y_prediction

    def train(self, X_train, Y_train, X_test, Y_test, print_cost = False):
        """Fit on the training set and report train/test accuracy.

        Args:
            X_train, X_test: arrays of shape ``(n_features, n_samples)``.
            Y_train, Y_test: labels for the corresponding samples.
            print_cost: forwarded to ``gradient_descent``.

        Returns:
            dict with the recorded costs, the predictions, the learned
            parameters, the hyper-parameters and both accuracies (in percent).
        """
        X_train = np.asarray(X_train)
        X_test = np.asarray(X_test)
        Y_train = np.asarray(Y_train).reshape(1, -1)
        Y_test = np.asarray(Y_test).reshape(1, -1)

        dim = np.shape(X_train)[0]
        w, b = self.initialize_weight(dim)

        parameters, grads, costs = self.gradient_descent(w, b, X_train, Y_train, print_cost = print_cost)

        self.w = parameters["w"]
        self.b = parameters["b"]

        Y_prediction_test = self.predict(X_test)
        Y_prediction_train = self.predict(X_train)

        train_score = 100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100
        test_score = 100 - np.mean(np.abs(Y_prediction_test - Y_test)) * 100
        print('train accuracy: {} %'.format(train_score))
        print('test accuracy: {} %'.format(test_score))
        d = {
            "costs": costs,
            "Y_prediction_test": Y_prediction_test,
            "Y_prediction_train": Y_prediction_train,
            "w": self.w,
            "b": self.b,
            "learning_rate": self.learning_rate,
            "num_iterations": self.num_iterations,
            "train accuracy": train_score,
            "test accuracy": test_score
        }

        return d

In [157]:
class LogisticRegression:
    """Binary logistic regression trained with batch gradient descent.

    Data layout used throughout this class:

    * ``X`` has shape ``(n_samples, n_features)`` (one row per sample)
    * ``Y`` holds ``n_samples`` labels in {0, 1}; it is handled as an
      ``(n_samples, 1)`` column vector
    * ``w`` has shape ``(n_features, 1)`` and ``b`` is a scalar
    """

    def __init__(self, learning_rate = 1, num_iterations = 2000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.w = []
        self.b = 0

    def initialize_weight(self, dim):
        """Return zero-initialised weights of shape ``(dim, 1)`` and a zero bias."""
        w = np.zeros((dim, 1))
        b = 0
        return w, b

    def sigmoid(self, z):
        """Element-wise logistic function ``1 / (1 + exp(-z))``."""
        s = 1/(1 + np.exp(-z))
        return s

    def hypothesis(self, w, X, b):
        """Return predicted probabilities with shape ``(n_samples, 1)``."""
        H = self.sigmoid(np.dot(X, w) + b)
        return H

    def cost(self, H, Y, m):
        """Return the mean binary cross-entropy over ``m`` samples.

        Both ``H`` and ``Y`` are flattened to 1-D before being combined;
        mixing a flat ``H`` with a column-vector ``Y`` would broadcast to an
        ``(m, m)`` matrix and inflate the cost.
        """
        epsilon = 1e-15
        # Clip so that log(0) cannot produce inf/nan for saturated outputs.
        H = np.clip(np.asarray(H, dtype=float).flatten(), epsilon, 1 - epsilon)
        Y = np.asarray(Y).flatten()
        cost = -np.sum(Y*np.log(H) + (1-Y)*np.log(1-H))/m
        cost = np.squeeze(cost)
        return cost

    def cal_gradient(self, w, H, X, Y):
        """Return ``{"dw": (n_features, 1), "db": scalar}`` for the cross-entropy cost."""
        m = X.shape[0]
        Y = np.asarray(Y).reshape(-1, 1)
        dw = np.dot(X.T, (H - Y))/m
        db = np.sum(H - Y)/m
        grads = {"dw": dw,
                 "db": db}
        return grads

    def gradient_position(self, w, b, X, Y):
        """Return ``(grads, cost)`` evaluated at the current ``w`` and ``b``."""
        m = X.shape[0]                         # number of samples (rows)
        H = self.hypothesis(w, X, b)           # compute activation
        cost = self.cost(H, Y, m)              # compute cost
        grads = self.cal_gradient(w, H, X, Y)  # compute gradient
        return grads, cost

    def gradient_descent(self, w, b, X, Y, print_cost = False):
        """Run ``self.num_iterations`` gradient-descent updates.

        The cost is recorded every 100 iterations (and printed as well when
        ``print_cost`` is true).

        Returns:
            ``(params, grads, costs)`` where ``params`` holds the final
            ``w``/``b``, ``grads`` the last gradients and ``costs`` the
            recorded cost values.
        """
        costs = []
        # Defined up front so the returned grads exist even with zero iterations.
        dw = np.zeros_like(w, dtype=float)
        db = 0.0

        for i in range(self.num_iterations):
            # Cost and gradient calculation
            grads, cost = self.gradient_position(w, b, X, Y)

            # Retrieve derivatives from grads
            dw = grads['dw']
            db = grads['db']

            # Update rule -- must run on every iteration, inside the loop
            w = w - (self.learning_rate * dw)
            b = b - (self.learning_rate * db)

            # Record (and optionally print) the cost every 100 iterations
            if i % 100 == 0:
                costs.append(cost)
                if print_cost:
                    print ('Cost after iteration %i: %f' %(i, cost))

        params = {'w': w,
                  'b': b}

        grads = {'dw': dw,
                 'db': db}

        return params, grads, costs

    def predict(self, X):
        """Return 0/1 predictions of shape ``(n_samples, 1)`` for ``X``.

        A sample is assigned class 1 when its predicted probability is at
        least 0.5.
        """
        X = np.asarray(X)
        w = np.asarray(self.w).reshape(X.shape[1], 1)
        b = self.b

        # Compute vector "H" of probabilities, then threshold it
        H = self.hypothesis(w, X, b)
        Y_prediction = (H >= 0.5).astype(float)
        return Y_prediction

    def train(self, X_train, Y_train, X_test, Y_test, print_cost=False):
        """Fit on the training set and report train/test accuracy.

        Args:
            X_train, X_test: arrays of shape ``(n_samples, n_features)``.
            Y_train, Y_test: labels for the corresponding samples.
            print_cost: forwarded to ``gradient_descent``.

        Returns:
            dict with the recorded costs, the predictions, the learned
            parameters, the hyper-parameters and both accuracies (in percent).
        """
        X_train = np.asarray(X_train)
        X_test = np.asarray(X_test)
        Y_train = np.asarray(Y_train)
        Y_test = np.asarray(Y_test)

        # Check the shapes of X_train and Y_train
        print("Shape of X_train:", X_train.shape)
        print("Shape of Y_train:", Y_train.shape)

        dim = np.shape(X_train)[1]
        w, b = self.initialize_weight(dim)

        # Training loop
        parameters, grads, costs = self.gradient_descent(w, b, X_train, Y_train, print_cost=print_cost)

        # Store the learned parameters; predict() reads them from the instance
        self.w = parameters["w"]
        self.b = parameters["b"]

        # Make predictions on the train and test set
        Y_prediction_test = self.predict(X_test)
        Y_prediction_train = self.predict(X_train)

        # Debugging shapes
        print("Shape of Y_train:", Y_train.shape)
        print("Shape of Y_prediction_train:", Y_prediction_train.shape)
        print("Shape of Y_test:", Y_test.shape)
        print("Shape of Y_prediction_test:", Y_prediction_test.shape)

        # Compute train and test accuracy; labels are reshaped to columns so
        # they align element-wise with the (n_samples, 1) predictions
        train_score = 100 - np.mean(np.abs(Y_prediction_train - Y_train.reshape(-1, 1))) * 100
        test_score = 100 - np.mean(np.abs(Y_prediction_test - Y_test.reshape(-1, 1))) * 100
        print('train accuracy: {} %'.format(train_score))
        print('test accuracy: {} %'.format(test_score))

        d = {
            "costs": costs,
            "Y_prediction_test": Y_prediction_test,
            "Y_prediction_train": Y_prediction_train,
            "w": self.w,
            "b": self.b,
            "learning_rate": self.learning_rate,
            "num_iterations": self.num_iterations,
            "train accuracy": train_score,
            "test accuracy": test_score
        }

        return d

In [159]:
class LogisticRegression:
    """Binary logistic regression trained with batch gradient descent.

    Data layout used throughout this class:

    * ``X`` has shape ``(n_samples, n_features)`` (one row per sample)
    * ``Y`` holds ``n_samples`` labels in {0, 1}; it is reshaped to an
      ``(n_samples, 1)`` column vector where needed
    * ``w`` has shape ``(n_features, 1)`` and ``b`` is a scalar
    """

    def __init__(self, learning_rate = 1, num_iterations = 2000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.w = []
        self.b = 0

    def initialize_weight(self, dim):
        """Return zero-initialised weights of shape ``(dim, 1)`` and a zero bias."""
        w = np.zeros((dim, 1))
        b = 0
        return w, b

    def sigmoid(self, z):
        """Element-wise logistic function ``1 / (1 + exp(-z))``.

        ``z`` is clipped to [-500, 500] first so that ``exp(-z)`` cannot
        overflow for very negative inputs; within that range the result is
        unchanged.
        """
        z = np.clip(z, -500, 500)
        return 1/(1 + np.exp(-z))

    def hypothesis(self, w, X, b):
        """Return predicted probabilities ``sigmoid(X @ w + b)``."""
        z = np.dot(X, w) + b
        return self.sigmoid(z)

    def cost(self, H, Y, m):
        """Return the mean binary cross-entropy of probabilities ``H`` against labels ``Y``.

        ``m`` is accepted for interface compatibility; the mean is taken over
        the elements of ``H`` directly.
        """
        epsilon = 1e-15
        # Clip so that log(0) cannot produce inf/nan for saturated outputs.
        H = np.clip(H, epsilon, 1 - epsilon)
        Y = Y.reshape(-1, 1)
        return -np.mean(Y * np.log(H) + (1 - Y) * np.log(1 - H))

    def cal_gradient(self, w, H, X, Y):
        """Return ``{"dw": (n_features, 1), "db": scalar}`` for the cross-entropy cost."""
        m = X.shape[0]
        Y = Y.reshape(-1, 1)
        dw = np.dot(X.T, (H - Y)) / m
        db = np.mean(H - Y)
        return {"dw": dw, "db": db}

    def gradient_position(self, w, b, X, Y):
        """Return ``(grads, cost)`` evaluated at the current ``w`` and ``b``."""
        H = self.hypothesis(w, X, b)
        cost = self.cost(H, Y, X.shape[0])
        grads = self.cal_gradient(w, H, X, Y)
        return grads, cost

    def gradient_descent(self, w, b, X, Y, print_cost = False):
        """Run ``self.num_iterations`` gradient-descent updates.

        The cost of every iteration is printed and recorded. ``print_cost``
        is accepted for interface compatibility; progress is printed
        regardless of its value.

        Returns:
            ``(params, grads, costs)`` where ``params`` holds the final
            ``w``/``b``, ``grads`` the last gradients and ``costs`` the cost
            measured before each update.
        """
        costs = []
        # Defined up front so the returned grads exist even with zero iterations.
        dw = np.zeros_like(w, dtype=float)
        db = 0.0

        print("Starting gradient descent...")
        print("Iteration | Cost")
        print("-" * 25)

        for i in range(self.num_iterations):
            # Cost and gradient calculation
            grads, cost = self.gradient_position(w, b, X, Y)

            # Retrieve derivatives from grads
            dw = grads['dw']
            db = grads['db']

            # update rule
            w = w - (self.learning_rate * dw)
            b = b - (self.learning_rate * db)

            # Print iteration and cost for every iteration
            print(f"Iteration {i+1:4d} | Cost: {cost:.6f}")

            # Record all costs
            costs.append(cost)

        print("-" * 25)
        # With zero iterations there is no cost to report.
        if costs:
            print(f"Final cost: {costs[-1]:.6f}")

        params = {'w': w, 'b': b}
        grads = {'dw': dw, 'db': db}

        return params, grads, costs

    def predict(self, X):
        """Return integer 0/1 predictions of shape ``(n_samples, 1)`` for ``X``.

        Raises:
            ValueError: if the model has no weights yet (``train`` not called).
        """
        if np.size(self.w) == 0:
            raise ValueError("model has not been trained; call train() first")
        H = self.hypothesis(self.w, np.asarray(X), self.b)
        return (H >= 0.5).astype(int)

    def train(self, X_train, Y_train, learning_rate, num_iterations, print_cost=False):
        """Fit the model and return ``(parameters, costs)``.

        Args:
            X_train: array of shape ``(n_samples, n_features)``.
            Y_train: labels for the training samples.
            learning_rate: gradient-descent step size; stored on the instance.
            num_iterations: number of updates; stored on the instance.
            print_cost: forwarded to ``gradient_descent``.

        Returns:
            ``parameters`` (dict with the learned ``w`` and ``b``) and the
            list of per-iteration costs.
        """
        X_train = np.asarray(X_train)
        Y_train = np.asarray(Y_train)

        print(f"\nTraining with learning rate: {learning_rate}")
        print(f"Number of iterations: {num_iterations}")
        print(f"Training data shape: {X_train.shape}")
        print("-" * 50)

        # Set hyperparameters
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations

        # Initialize parameters
        self.w, self.b = self.initialize_weight(X_train.shape[1])

        # Training loop
        parameters, grads, costs = self.gradient_descent(self.w, self.b, X_train, Y_train, print_cost)

        # Update parameters
        self.w = parameters['w']
        self.b = parameters['b']

        print("\nTraining completed!")
        # The cost summary needs at least one recorded iteration.
        if costs:
            print(f"Initial cost: {costs[0]:.6f}")
            print(f"Final cost: {costs[-1]:.6f}")
            print(f"Cost reduction: {costs[0] - costs[-1]:.6f}")
        print("-" * 50)

        return parameters, costs

In [160]:
# NOTE(review): `linear_reg` and `inputs` reuse names from the linear-regression
# cells; from here on they hold a LogisticRegression model and the
# heart-disease feature columns.
linear_reg = LogisticRegression()
# Feature columns used as model inputs; "TenYearCHD" is the target.
inputs = ['male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose']


# Train on the median-imputed frame; positional arguments are
# learning_rate=0.0001 and num_iterations=20.
parameters, loss = linear_reg.train(df2[inputs].values, df2["TenYearCHD"].values, 0.0001, 20)


Training with learning rate: 0.0001
Number of iterations: 20
Training data shape: (4238, 15)
--------------------------------------------------
Starting gradient descent...
Iteration | Cost
-------------------------
Iteration    1 | Cost: 0.693147
Iteration    2 | Cost: 0.561565
Iteration    3 | Cost: 0.453547
Iteration    4 | Cost: 0.441476
Iteration    5 | Cost: 0.441285
Iteration    6 | Cost: 0.441182
Iteration    7 | Cost: 0.441087
Iteration    8 | Cost: 0.440993
Iteration    9 | Cost: 0.440900
Iteration   10 | Cost: 0.440807
Iteration   11 | Cost: 0.440715
Iteration   12 | Cost: 0.440623
Iteration   13 | Cost: 0.440532
Iteration   14 | Cost: 0.440441
Iteration   15 | Cost: 0.440351
Iteration   16 | Cost: 0.440261
Iteration   17 | Cost: 0.440172
Iteration   18 | Cost: 0.440084
Iteration   19 | Cost: 0.439995
Iteration   20 | Cost: 0.439908
-------------------------
Final cost: 0.439908

Training completed!
Initial cost: 0.693147
Final cost: 0.439908
Cost reduction: 0.253239
------