In [64]:
import pandas as pd

# Load the student marks dataset (columns: Math, Reading, Writing).
data = pd.read_csv("/content/drive/MyDrive/Datasets/student.csv")

# Quick look at the first/last rows, the schema, and summary statistics.
print(data.head())
print(data.tail())
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the report.
data.info()
print(data.describe())

# Features are the Math and Reading marks; the target is the Writing mark.
X = data[['Math', 'Reading']].values  # shape: (n_samples, 2)
Y = data['Writing'].values            # shape: (n_samples,)


   Math  Reading  Writing
0    48       68       63
1    62       81       72
2    79       80       78
3    76       83       79
4    59       64       62
     Math  Reading  Writing
995    72       74       70
996    73       86       90
997    89       87       94
998    83       82       78
999    66       66       72
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Math     1000 non-null   int64
 1   Reading  1000 non-null   int64
 2   Writing  1000 non-null   int64
dtypes: int64(3)
memory usage: 23.6 KB
None
              Math      Reading      Writing
count  1000.000000  1000.000000  1000.000000
mean     67.290000    69.872000    68.616000
std      15.085008    14.657027    15.241287
min      13.000000    19.000000    14.000000
25%      58.000000    60.750000    58.000000
50%      68.000000    70.000000    69.500000
75%      78.000000    81.000000 

In [65]:
import numpy as np

# Build the (d, n) feature layout straight from `data` instead of re-transposing
# X in place: rebinding X to its own transpose flips the orientation back on
# every re-run of this cell, which silently changes the shapes of X and W.
X = data[['Math', 'Reading']].values.T  # shape: (d, n)
Y = data['Writing'].values              # shape: (n,)
W = np.zeros(X.shape[0])                # shape: (d,) - one weight per feature


In [66]:
from sklearn.model_selection import train_test_split

# Split on the sample-major (n, d) matrix taken directly from `data`, so the
# result does not depend on whether X currently holds the transposed (d, n) layout.
X_train, X_test, Y_train, Y_test = train_test_split(
    data[['Math', 'Reading']].values,  # features, shape (n, d)
    data['Writing'].values,            # target, shape (n,)
    test_size=0.2,                     # hold out 20% of the rows for testing
    random_state=42,                   # fixed seed for a reproducible split
)


In [67]:
#Define the cost function
import numpy as np

def cost_function(X, Y, W):
    """
    Compute the mean squared error of the linear model ``X . W``.

    Input Arguments:
    X: Feature matrix, one row per sample.
    Y: Target values, one per sample.
    W: Weight vector, one entry per feature column of X.

    Output:
    cost: Sum of squared prediction errors divided by the number of samples.
    """
    n_samples = len(Y)
    predictions = X.dot(W)
    # Averaging factor, applied to the summed squared residuals below.
    inv_n = 1 / n_samples
    residuals = predictions - Y
    return inv_n * np.sum(np.square(residuals))





In [68]:
# Sanity check for cost_function on a toy problem with a known exact fit:
# each target equals the sum of its row, so W = [1, 1] must give zero cost.
# The toy arrays get their own names so they do not overwrite the
# X_test / Y_test hold-out split created by train_test_split.
X_toy = np.array([[1, 2], [3, 4], [5, 6]])
Y_toy = np.array([3, 7, 11])
W_toy = np.array([1, 1])

cost = cost_function(X_toy, Y_toy, W_toy)

# Compare with a tolerance: the cost is a float, and exact equality is brittle.
if np.isclose(cost, 0):
    print("Proceed further")
else:
    print("Something went wrong: Reimplement cost function")

print("Cost function output:", cost)


Proceed further
Cost function output: 0.0


In [69]:
def gradient_descent(X, Y, W, alpha, iterations):
    """
    Perform batch gradient descent to fit the weights of a linear model ``X . W``.

    Parameters:
    X (numpy.ndarray): Feature matrix with one row per sample (m x n).
    Y (numpy.ndarray): Target values, one per sample (length m). It is subtracted
        directly from ``np.dot(X, W)``, so it must have the same shape as that product.
    W (numpy.ndarray): Initial guess for the weights (length n). Not modified in place.
    alpha (float): Learning rate.
    iterations (int): Number of gradient steps. Zero or a negative value performs
        no steps.

    Returns:
    tuple: ``(W_update, cost_history)``.
    W_update (numpy.ndarray): Weights after the last step; the initial ``W`` itself
        when no steps are performed.
    cost_history (list): Cost after each step, as reported by ``cost_function``;
        empty when no steps are performed.
    """
    # Number of samples
    m = len(Y)

    # Start from the caller's guess so there is always a value to return,
    # even when the loop body never runs.
    W_update = W
    cost_history = []

    for _ in range(iterations):
        # Step 1: Hypothesis values for the current weights
        Y_pred = np.dot(X, W_update)

        # Step 2: Difference between hypothesis and actual Y
        loss = Y_pred - Y

        # Step 3: Gradient of the squared-error cost; the constant factor 2 from
        # differentiating the square is not applied here, which only rescales alpha.
        dw = (1 / m) * np.dot(X.T, loss)

        # Step 4: Move against the gradient (rebinds the name; the input array is untouched)
        W_update = W_update - alpha * dw

        # Step 5: Record the cost at the updated weights
        cost_history.append(cost_function(X, Y, W_update))

    return W_update, cost_history


In [70]:
import numpy as np

def rmse(Y, Y_pred):
    """
    Calculate the Root Mean Squared Error between targets and predictions.

    Input Arguments:
    Y: Array-like of actual (target) values.
    Y_pred: Array-like of predicted values; must broadcast against Y.

    Output:
    Root Mean Squared Error: square root of the mean of the squared differences.
    """
    # Coerce to arrays so plain Python lists/tuples work as well as ndarrays.
    errors = np.asarray(Y) - np.asarray(Y_pred)
    return np.sqrt(np.mean(errors ** 2))


In [71]:
import numpy as np

def r2(Y, Y_pred):
    """
    Calculate the R-squared score (coefficient of determination).

    Input:
    Y: Array-like of actual (target) values.
    Y_pred: Array-like of predicted values; must broadcast against Y.

    Output:
    r2: ``1 - ss_res / ss_tot``, where ``ss_res`` is the sum of squared prediction
        errors and ``ss_tot`` is the sum of squared deviations of Y from its mean.
        When Y is non-empty and constant (``ss_tot == 0``) the ratio is undefined;
        the score is then 1.0 for a perfect prediction and 0.0 otherwise, matching
        scikit-learn's default finite-score convention.
    """
    # Coerce to arrays so plain Python lists/tuples work as well as ndarrays.
    Y = np.asarray(Y)
    Y_pred = np.asarray(Y_pred)

    ss_tot = np.sum((Y - np.mean(Y)) ** 2)
    ss_res = np.sum((Y - Y_pred) ** 2)

    # A constant target has zero variance; dividing would yield nan or -inf.
    if Y.size and ss_tot == 0:
        return 1.0 if ss_res == 0 else 0.0

    return 1 - (ss_res / ss_tot)


In [72]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

def main(
    data_path='/content/drive/MyDrive/Datasets/student.csv',
    alpha=0.00001,
    iterations=1000,
    test_size=0.2,
    random_state=42,
):
    """
    Train and evaluate a linear model predicting Writing marks from Math and Reading marks.

    The model is fitted with gradient_descent on a training split and scored on the
    held-out split with rmse and r2. There is no intercept term: the prediction is
    the dot product of the two feature columns with the learned weights.

    Parameters:
    data_path (str): CSV file with 'Math', 'Reading' and 'Writing' columns.
    alpha (float): Learning rate passed to gradient_descent.
    iterations (int): Number of gradient descent steps.
    test_size (float): Fraction of rows held out for testing.
    random_state (int): Seed for the train/test split, for reproducibility.

    Returns:
    None. The final weights, the first ten cost values and the test-set metrics
    are printed.
    """
    # Step 1: Load the dataset
    data = pd.read_csv(data_path)

    # Step 2: Split the data into features (X) and target (Y)
    X = data[['Math', 'Reading']].values  # Features: Math and Reading marks
    Y = data['Writing'].values            # Target: Writing marks

    # Step 3: Split the data into training and test sets
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    # Step 4: Initialize one weight per feature column to zero
    W = np.zeros(X_train.shape[1])

    # Step 5: Perform Gradient Descent on the training split only
    W_optimal, cost_history = gradient_descent(X_train, Y_train, W, alpha, iterations)

    # Step 6: Make predictions on the test set
    Y_pred = np.dot(X_test, W_optimal)

    # Step 7: Evaluate the model using RMSE and R-Squared
    model_rmse = rmse(Y_test, Y_pred)
    model_r2 = r2(Y_test, Y_pred)

    # Step 8: Output the results
    print("Final Weights:", W_optimal)
    print("Cost History (First 10 iterations):", cost_history[:10])
    print("RMSE on Test Set:", model_rmse)
    print("R-Squared on Test Set:", model_r2)

# Execute the main function with its default settings
if __name__ == "__main__":
    main()


Final Weights: [0.34811659 0.64614558]
Cost History (First 10 iterations): [np.float64(4026.33114156751), np.float64(3280.573665199384), np.float64(2674.1239989803175), np.float64(2180.9589785701155), np.float64(1779.9166540166468), np.float64(1453.788198601909), np.float64(1188.5794521617188), np.float64(972.910410590327), np.float64(797.5268927198968), np.float64(654.9034294649376)]
RMSE on Test Set: 5.2798239764188635
R-Squared on Test Set: 0.8886354462786421
