In [1]:
import pandas as pd
# Read the CSV of student scores into a DataFrame used by the cells below.
data = pd.read_csv(filepath_or_buffer='student.csv')

In [2]:
# Preview both ends of the dataset; sep="\n" places each frame under its heading.
print("First 5 rows of the dataset:", data.head(), sep="\n")
print("\nLast 5 rows of the dataset:", data.tail(), sep="\n")

First 5 rows of the dataset:
   Math  Reading  Writing
0    48       68       63
1    62       81       72
2    79       80       78
3    76       83       79
4    59       64       62

Last 5 rows of the dataset:
     Math  Reading  Writing
995    72       74       70
996    73       86       90
997    89       87       94
998    83       82       78
999    66       66       72


In [3]:
print("\nDataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" after the summary.
data.info()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Math     1000 non-null   int64
 1   Reading  1000 non-null   int64
 2   Writing  1000 non-null   int64
dtypes: int64(3)
memory usage: 23.6 KB
None


In [4]:
# Per-column summary statistics, printed under a heading.
print("\nDataset Description:", data.describe(), sep="\n")


Dataset Description:
              Math      Reading      Writing
count  1000.000000  1000.000000  1000.000000
mean     67.290000    69.872000    68.616000
std      15.085008    14.657027    15.241287
min      13.000000    19.000000    14.000000
25%      58.000000    60.750000    58.000000
50%      68.000000    70.000000    69.500000
75%      78.000000    81.000000    79.000000
max     100.000000   100.000000   100.000000


In [5]:
# Number of null entries in each column, printed under a heading.
print("\nMissing Values Count:", data.isnull().sum(), sep="\n")


Missing Values Count:
Math       0
Reading    0
Writing    0
dtype: int64


### Splitting the data into features (X) and target (Y)

In [12]:
# Features are the Math and Reading columns; the target is the Writing column.
X = data.loc[:, ['Math', 'Reading']].values
Y = data.loc[:, 'Writing'].values

# Show only the first five rows of the feature matrix and target vector.
print("Feature Matrix (X):\n", X[:5])
print("\nTarget Vector (Y):\n", Y[:5])

Feature Matrix (X):
 [[48 68]
 [62 81]
 [79 80]
 [76 83]
 [59 64]]

Target Vector (Y):
 [63 72 78 79 62]


>Usually, $Y = W^T X + B$.
>  Since we assume that there is no bias (intercept) term, $Y$ becomes:
> $Y = W^T X$

In [21]:
import numpy as np

print("Length of X: ", len(X))
print("\n")

# Seed the global NumPy generator so the shuffled split is reproducible.
np.random.seed(30)
indices = np.random.permutation(len(X))

# The first 80% of the shuffled indices form the training set, the rest the test set.
train_size = int(0.8 * len(X))
test_size = len(X) - train_size

train_idx, test_idx = indices[:train_size], indices[train_size:]
X_train, Y_train = X[train_idx], Y[train_idx]
X_test, Y_test = X[test_idx], Y[test_idx]

print("Training Features (X) Shape:", X_train.shape)
print("Testing Features (X) Shape:", X_test.shape)
print("Training Labels (Y) Shape:", Y_train.shape)
print("Testing Labels (Y) Shape:", Y_test.shape)
print("\nTrain Size:", train_size)
print("Test Size:", test_size)

Length of X:  1000


Training Features (X) Shape: (800, 2)
Testing Features (X) Shape: (200, 2)
Training Labels (Y) Shape: (800,)
Testing Labels (Y) Shape: (200,)

Train Size: 800
Test Size: 200


In [23]:
import numpy as np

def cost_function(X, Y, W):
    """
    Compute the Mean Squared Error (MSE) cost for Linear Regression
    without an intercept term. The sum of squared errors is scaled by
    1 / (2n).

    Input Parameters:
    X: Feature Matrix (n x d) where n is the number of data points and d is the number of features
    Y: Target Vector (n,)
    W: Weight Vector (d,)

    Output Parameters:
    cost: Sum of squared prediction errors multiplied by 1 / (2n) (scalar)
    """
    # Prediction error of the linear model X . W for every data point
    residuals = np.dot(X, W) - Y

    # The 1 / (2n) factor averages over the data points and halves the result
    scale = 1 / (2 * len(Y))
    return scale * np.sum(np.square(residuals))

# Sanity check with 3 data points and 2 features. The names are deliberately
# different from X_test / Y_test so that the train/test split created earlier
# in the notebook is not overwritten by this toy example.
X_check = np.array([[1, 2], [3, 4], [5, 6]])

# Target values for the corresponding data points
Y_check = np.array([3, 7, 11])

# Weight vector for the features; each row of X_check sums to its target,
# so weights of [1, 1] should give a cost of exactly zero.
W_check = np.array([1, 1])

cost = cost_function(X_check, Y_check, W_check)

# The cost is a float, so compare against zero with a tolerance instead of ==.
if abs(cost) < 1e-12:
    print("Proceed Further")
else:
    print("Something went wrong: Reimplement the cost function")

print("Cost function output:", cost)

Proceed Further
Cost function output: 0.0


In [7]:
import numpy as np

def gradient_descent(X, Y, W, alpha, iterations):
    """
    Perform gradient descent to optimize the parameters of a linear regression model.

    Each iteration computes predictions X . W, the gradient
    (1/m) * X.T . (predictions - Y), and steps the parameters against it.
    The caller's W array is never modified; every update builds a new array.

    Parameters:
    X (numpy.ndarray): Feature matrix (m x n).
    Y (numpy.ndarray): Target vector (m,).
    W (numpy.ndarray): Initial guess for parameters (n,).
    alpha (float): Learning rate.
    iterations (int): Number of iterations for gradient descent.

    Returns:
    tuple: A tuple containing the final optimized parameters (W_update) and the history of cost values.
        W_update (numpy.ndarray): Updated parameters (n,). When iterations is 0
            this is the initial W, returned unchanged.
        cost_history (list): Cost after each update, one entry per iteration.
    """
    m = len(Y)
    cost_history = []

    # Start from the caller's guess so that iterations == 0 returns it as-is
    # instead of failing on a result that was never assigned.
    W_update = W

    for _ in range(iterations):
        # Step 1: Hypothesis values minus actual targets
        loss = np.dot(X, W_update) - Y

        # Step 2: Gradient of the cost with respect to the parameters
        dw = (1 / m) * np.dot(X.T, loss)

        # Step 3: Gradient descent update (creates a new array; W is untouched)
        W_update = W_update - alpha * dw

        # Step 4: Record the cost at the updated parameters
        cost_history.append(cost_function(X, Y, W_update))

    return W_update, cost_history


def cost_function(X, Y, W):
    """
    Evaluate the squared-error cost of the linear model X . W against Y.

    Parameters:
    X (numpy.ndarray): Feature matrix (m x n).
    Y (numpy.ndarray): Target vector (m,).
    W (numpy.ndarray): Parameters vector (n,).

    Returns:
    cost: Sum of squared prediction errors multiplied by 1 / (2m) (scalar).
    """
    m = len(Y)
    errors = np.dot(X, W) - Y
    return (1 / (2 * m)) * np.sum(np.square(errors))


# np.random.seed(0)  # For reproducibility
# X = np.random.rand(100, 3)  # 100 samples, 3 features
# Y = np.random.rand(100)  # 100 target values
# W = np.random.rand(3)  # Initial guess for parameters

# # Set hyperparameters
# alpha = 0.01  # Learning rate
# iterations = 1000  # Number of iterations

# # Testing the gradient_descent function
# final_params, cost_history = gradient_descent(X, Y, W, alpha, iterations)

# print("Final Parameters:", final_params)
# print("Cost History:", cost_history[-10:])

In [9]:
def rmse(Y, Y_pred):
    """
    Compute the Root Mean Squared Error (RMSE) between targets and predictions.

    Arguments:
    Y -- Array of actual (target) dependent variables.
    Y_pred -- Array of predicted dependent variables.

    Returns:
    Square root of the mean of the squared differences Y - Y_pred.
    """
    residuals = Y - Y_pred
    return np.sqrt(np.mean(np.square(residuals)))


In [10]:
def r2(Y, Y_pred):
    """
    Compute the R Squared (R²) score: 1 - SS_res / SS_tot.

    Arguments:
    Y -- Array of actual (target) dependent variables.
    Y_pred -- Array of predicted dependent variables.

    Returns:
    One minus the ratio of the residual sum of squares to the total sum of
    squares of Y about its mean.
    """
    # Residual sum of squares: squared error of the predictions
    residual_ss = np.sum(np.square(Y - Y_pred))

    # Total sum of squares: squared deviation of Y from its own mean
    total_ss = np.sum(np.square(Y - np.mean(Y)))

    return 1 - (residual_ss / total_ss)


In [12]:
# Hyperparameters for this run. They must be defined here because no earlier
# live cell assigns them. Weights start at zero as floats, one per feature column.
W = np.zeros(X.shape[1])
alpha = 0.00001  # Learning rate
iterations = 1000  # Number of gradient descent iterations

final_params, cost_history = gradient_descent(X, Y, W, alpha, iterations)

# Calculate predictions using the optimized parameters
Y_pred = X.dot(final_params)  # Linear model without bias

# Calculate RMSE
rmse_value = rmse(Y, Y_pred)
# print(f"RMSE: {rmse_value}")

# Calculate R²
r2_value = r2(Y, Y_pred)
# print(f"R²: {r2_value}")

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Function to compute RMSE (Root Mean Squared Error)
def rmse(Y, Y_pred):
    """Return the square root of the mean squared difference between Y and Y_pred."""
    squared_errors = np.square(Y - Y_pred)
    return np.sqrt(np.mean(squared_errors))

# Function to compute R-Squared
def r2(Y, Y_pred):
    """Return the R-Squared score, 1 - SS_residual / SS_total, of Y_pred against Y."""
    deviations_from_mean = Y - np.mean(Y)
    prediction_errors = Y - Y_pred
    ratio = np.sum(prediction_errors ** 2) / np.sum(deviations_from_mean ** 2)
    return 1 - ratio

# Gradient Descent Function
def gradient_descent(X, Y, W, alpha, iterations):
    """
    Fit linear-regression weights (no intercept) by batch gradient descent.

    Parameters:
    X (numpy.ndarray): Feature matrix (m x n).
    Y (numpy.ndarray): Target vector (m,).
    W (array-like): Initial weights (n,). Left unmodified; any numeric dtype
        (including integers) is accepted.
    alpha (float): Learning rate.
    iterations (int): Number of update steps.

    Returns:
    tuple: (W, cost_history)
        W (numpy.ndarray): Float array of weights after the final update.
        cost_history (list): Cost measured before each update, one entry
            per iteration.
    """
    m = len(Y)
    cost_history = []

    # Work on a private float copy: the in-place update below would otherwise
    # overwrite the caller's array and would fail to cast for integer weights.
    W = np.array(W, dtype=float)

    for _ in range(iterations):
        loss = np.dot(X, W) - Y

        # Squared-error cost at the current weights (before this step's update)
        cost_history.append((1 / (2 * m)) * np.sum(loss ** 2))

        # Gradient of the cost with respect to W, then the descent step
        dw = (1 / m) * np.dot(X.T, loss)
        W -= alpha * dw

    return W, cost_history

def main():
    """
    Run the full pipeline: load 'student.csv', split it 80/20, fit weights
    with gradient descent on the training part, and print the weights, the
    first ten cost values, and RMSE / R-Squared on the test part.
    """
    # Load the dataset; Math and Reading are the features, Writing is the target
    frame = pd.read_csv('student.csv')
    X = frame[['Math', 'Reading']].values
    Y = frame['Writing'].values

    # Hold out 20% of the rows for testing (fixed random_state for repeatability)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=42
    )

    # Zero-initialised weights (one per feature), learning rate, iteration count
    alpha, iterations = 0.00001, 1000
    W = np.zeros(X_train.shape[1])

    # Fit on the training split only
    W_optimal, cost_history = gradient_descent(X_train, Y_train, W, alpha, iterations)

    # Predict on the held-out rows and report the metrics
    Y_pred = np.dot(X_test, W_optimal)

    print("Final Weights:", W_optimal)
    print("Cost History (First 10 iterations):", cost_history[:10])
    print("RMSE on Test Set:", rmse(Y_test, Y_pred))
    print("R-Squared on Test Set:", r2(Y_test, Y_pred))


if __name__ == "__main__":
    main()


Final Weights: [0.34811659 0.64614558]
Cost History (First 10 iterations): [np.float64(2471.69875), np.float64(2013.165570783755), np.float64(1640.286832599692), np.float64(1337.0619994901588), np.float64(1090.4794892850578), np.float64(889.9583270083234), np.float64(726.8940993009545), np.float64(594.2897260808594), np.float64(486.4552052951635), np.float64(398.7634463599484)]
RMSE on Test Set: 5.2798239764188635
R-Squared on Test Set: 0.8886354462786421


When the learning rate is set to alpha = 0.00001, the R-Squared on the test set is approximately 0.89, which is close to 1.
Over the first 10 iterations, the cost decreases steadily from 2471.70 to 398.76, so gradient descent appears to be working as expected. A well-chosen learning rate (alpha) likely contributed to this smooth decrease.