In [1]:
#To-Do 1: Data Understanding, Analysis & Preparation

"""
The first rule of ML is that you have to understand the data. We read in the data with Pandas, take a quick look at the first and last few rows to get an idea of what the data looks like, and also check metadata such as
data types and missing values. Descriptive statistics help build intuition for the distribution, central tendency, and spread of each feature. Finally, we split the data into features (X) and target (Y) according to the problem statement.
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [3]:
# Read the student score table (the path sits under /content/drive, presumably
# a Google Drive mount in Colab -- adjust it when running elsewhere).
data = pd.read_csv(
    "/content/drive/MyDrive/concept and technology of AI/student.csv"
)

# Preview the first five records.
print("Top 5 rows:")
display(data.head(5))

# Preview the last five records.
print("Bottom 5 rows:")
display(data.tail(5))

# Column names, non-null counts and dtypes; info() prints its report itself,
# so it is not wrapped in display().
print("\nDataset Info:")
data.info()

# Count, mean, std, min, quartiles and max for every numeric column.
print("\nDescriptive Statistics:")
display(data.describe())


Top 5 rows:


Unnamed: 0,Math,Reading,Writing
0,48,68,63
1,62,81,72
2,79,80,78
3,76,83,79
4,59,64,62


Bottom 5 rows:


Unnamed: 0,Math,Reading,Writing
995,72,74,70
996,73,86,90
997,89,87,94
998,83,82,78
999,66,66,72



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Math     1000 non-null   int64
 1   Reading  1000 non-null   int64
 2   Writing  1000 non-null   int64
dtypes: int64(3)
memory usage: 23.6 KB

Descriptive Statistics:


Unnamed: 0,Math,Reading,Writing
count,1000.0,1000.0,1000.0
mean,67.29,69.872,68.616
std,15.085008,14.657027,15.241287
min,13.0,19.0,14.0
25%,58.0,60.75,58.0
50%,68.0,70.0,69.5
75%,78.0,81.0,79.0
max,100.0,100.0,100.0


In [4]:
# Features: Math and Reading scores as a 2-D array (one row per student).
X = data[["Math", "Reading"]].to_numpy()
# Target: Writing scores as a 1-D array aligned row-for-row with X.
Y = data["Writing"].to_numpy()


In [5]:
#To-Do 2: Design Matrix without Bias

'''
In this worksheet we consider a linear regression model without a bias term (intercept), which simplifies both the computations and the structure
of the model. The design matrix X therefore contains only the feature values (Math and Reading): each row represents a student and each column a feature. The Writing grades
are stored in Y and serve as the target values the model has to predict, with each element of Y corresponding one-to-one to the same row of X. Because there is no intercept,
the model learns a direct linear relationship between the inputs and the target, and its predictions are forced to pass through the origin.
'''
# Report the array dimensions: X is (n_samples, n_features), Y is (n_samples,).
print(f"X shape: {X.shape}")
print(f"Y shape: {Y.shape}")


X shape: (1000, 2)
Y shape: (1000,)


In [7]:
#To-Do 3: Train–Test Split

'''
The performance of the model is evaluated on a test set that it has not seen before. The training set is used to learn the input-output
relationships and patterns, whereas the testing set is never seen by the model during training and is used solely to measure how the trained model performs on new (or future)
data. The 80–20 split is a well-known rule of thumb in practice: it leaves enough data to train the model while keeping a reasonably large portion to check for
problems such as overfitting or underfitting.
'''
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42, test_size=0.2)


In [8]:
# Number of rows that ended up in each partition.
print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

Training samples: 800
Testing samples: 200


In [17]:
#To-Do 4: Cost Function (Mean Squared Error)

'''
The cost function measures how well the model fits the data. We use a Mean Squared Error (MSE) cost: the sum of squared differences between predicted and actual
values divided by 2n, i.e. half of the usual MSE (the extra factor of 1/2 simplifies the gradient). Minimizing this cost leads to better model performance.
'''
def cost_function(X, Y, W):
    """
    Compute the halved Mean Squared Error cost of a linear model without bias.

    The returned value is ``(1 / (2 * n)) * sum((X . W - Y) ** 2)`` with
    ``n = len(Y)``, i.e. half of the plain MSE. The 1/2 factor cancels the 2
    produced by differentiating the square, so the gradient of this cost is
    ``(1 / n) * X.T . (X . W - Y)``.

    Parameters
    ----------
    X : array-like
        Feature matrix; in this notebook it has shape (n_samples, n_features).
    Y : array-like
        Target values, one per row of X. A flat vector (n,) and a column
        vector (n, 1) are both accepted and yield the same cost.
    W : array-like
        Weight vector multiplied with X via ``np.dot``.

    Returns
    -------
    float
        The halved mean squared error of the predictions ``np.dot(X, W)``.
    """
    n = len(Y)
    Y_pred = np.dot(X, W)
    targets = np.asarray(Y)
    # Subtracting an (n, 1) target from an (n,) prediction would broadcast to
    # an (n, n) matrix and silently inflate the cost. When both hold the same
    # number of elements, align the target with the prediction's shape first.
    if targets.shape != np.shape(Y_pred) and targets.size == np.size(Y_pred):
        targets = targets.reshape(np.shape(Y_pred))
    cost = (1 / (2 * n)) * np.sum((Y_pred - targets) ** 2)
    return cost



In [10]:
#To-Do 5: Cost Function Test Case

'''
A known test case is used to verify correctness. When predictions perfectly match actual values, the cost must be zero. This confirms the correctness of our cost function.
'''
# Hand-checkable case: with W = [1, 1] every prediction is the row sum of X,
# which equals the matching entry of Y, so the cost has to be exactly zero.
X_test_case = np.array([[1, 2], [3, 4], [5, 6]])
Y_test_case = np.array([3, 7, 11])
W_test_case = np.array([1, 1])

cost = cost_function(X_test_case, Y_test_case, W_test_case)
# Fail loudly instead of relying on someone reading the printed value.
assert np.isclose(cost, 0.0), f"expected zero cost for a perfect fit, got {cost}"
print("Cost:", cost)



Cost: 0.0


In [11]:
#To-Do 6: Gradient Descent Implementation

'''
Gradient Descent is an optimization algorithm used to minimize the cost function. It updates the model parameters iteratively by moving them in the direction of the negative gradient. With each iteration, the model predictions move closer to the actual values and the cost decreases.
'''

def gradient_descent(X, Y, W, alpha, iterations):
    """
    Fit the weights of a bias-free linear model with batch gradient descent.

    Each iteration computes the residuals ``np.dot(X, W) - Y`` over the whole
    data set, forms the gradient ``(1 / m) * X.T . residuals`` and moves the
    weights by ``-alpha`` times that gradient. The cost of the *updated*
    weights is recorded after every step.

    Parameters
    ----------
    X : feature matrix passed to ``np.dot`` together with the weights.
    Y : target values; ``len(Y)`` is used as the sample count ``m``.
    W : initial weights (the caller's array is not modified in place).
    alpha : learning rate, i.e. the step size of each update.
    iterations : number of update steps to perform.

    Returns
    -------
    tuple
        ``(weights, costs)`` -- the final weights and a list holding one cost
        value per iteration.
    """
    sample_count = len(Y)
    costs = []
    weights = W

    for _step in range(iterations):
        # Prediction error of the current weights on every sample.
        residual = np.dot(X, weights) - Y
        # Gradient of the halved-MSE cost with respect to the weights.
        gradient = (1 / sample_count) * np.dot(X.T, residual)
        # Rebind rather than update in place so the caller's W stays intact.
        weights = weights - alpha * gradient
        costs.append(cost_function(X, Y, weights))

    return weights, costs


In [12]:
#To-Do 7: Gradient Descent Testing

'''
Random data is generated to ensure the gradient descent function works correctly. A decreasing cost history indicates successful learning.
'''

# Fixed seed so the random inputs (and the printed results) are reproducible.
np.random.seed(0)

X_rand = np.random.rand(100, 3)
Y_rand = np.random.rand(100)
W_rand = np.random.rand(3)

final_W, cost_history = gradient_descent(X_rand, Y_rand, W_rand, 0.01, 1000)

# Actually verify what this cell is meant to demonstrate: one cost entry per
# iteration, and a lower cost at the end than after the first update.
assert len(cost_history) == 1000, "expected one cost value per iteration"
assert cost_history[-1] < cost_history[0], "cost did not decrease during training"

print("Final Weights:", final_W)
print("Final Cost:", cost_history[-1])


Final Weights: [0.20551667 0.54295081 0.10388027]
Final Cost: 0.05435492255484332


In [19]:
#To-Do 8: RMSE Evaluation Metric

'''
Root Mean Squared Error (RMSE) measures the average prediction error in the same units as the target variable. Lower RMSE indicates better accuracy.
'''
def rmse(Y, Y_pred):
    """
    Return the Root Mean Squared Error between targets and predictions.

    Y and Y_pred are subtracted element-wise, the differences are squared and
    averaged, and the square root of that average is returned, so the result
    is expressed in the same units as the target.
    """
    residuals = Y - Y_pred
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)


In [18]:
#To-Do 9: R² (Coefficient of Determination)

'''
R² measures how much variance in the target variable is explained by the model. A value closer to 1 indicates better performance.
'''
def r2(Y, Y_pred):
    """
    Return the coefficient of determination (R^2) of the predictions.

    R^2 is ``1 - ss_res / ss_tot`` where ``ss_res`` is the sum of squared
    residuals ``(Y - Y_pred)`` and ``ss_tot`` is the sum of squared deviations
    of Y from its mean. A value of 1 means a perfect fit, 0 means the model is
    no better than always predicting the mean, and negative values mean it is
    worse than that.

    When Y is constant, ``ss_tot`` is zero and the ratio is undefined. In that
    case the function returns 1.0 for perfect predictions and 0.0 otherwise
    (the same convention scikit-learn's ``r2_score`` uses by default) instead
    of producing nan / -inf together with a runtime warning.
    """
    mean_y = np.mean(Y)
    ss_tot = np.sum((Y - mean_y) ** 2)
    ss_res = np.sum((Y - Y_pred) ** 2)
    if ss_tot == 0:
        # Constant target: avoid dividing by zero.
        return 1.0 if ss_res == 0 else 0.0
    return 1 - (ss_res / ss_tot)





In [20]:
#To-Do 10: Main Function (Full Workflow)

'''
The main function integrates data loading, training, prediction, and evaluation. This provides a clean and reusable pipeline.
'''

def main(
    csv_path="/content/drive/MyDrive/concept and technology of AI/student.csv",
    alpha=0.00001,
    iterations=1000,
    test_size=0.2,
    random_state=42,
):
    """
    Run the full workflow: load data, split, train, predict and evaluate.

    Every default reproduces the original hard-coded run, so calling
    ``main()`` with no arguments behaves exactly as before; the parameters
    only make the pipeline reusable with another file or other settings.

    Parameters
    ----------
    csv_path : path of a CSV file containing 'Math', 'Reading' and 'Writing'
        columns.
    alpha : learning rate handed to ``gradient_descent``.
    iterations : number of gradient-descent steps.
    test_size : fraction of rows held out for testing.
    random_state : seed for the train/test shuffle, for reproducibility.

    Returns
    -------
    None
        The final weights, RMSE and R² on the test set are printed.
    """
    data = pd.read_csv(csv_path)

    # Math and Reading are the inputs, Writing is the value to predict.
    X = data[['Math', 'Reading']].values
    Y = data['Writing'].values

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    # Start from all-zero weights, one per feature column.
    W = np.zeros(X_train.shape[1])
    # The per-iteration cost history is not needed for the report below.
    W_opt, _cost_history = gradient_descent(X_train, Y_train, W, alpha, iterations)

    # Evaluate only on the held-out rows the model never saw during training.
    Y_pred = np.dot(X_test, W_opt)

    print("Final Weights:", W_opt)
    print("RMSE:", rmse(Y_test, Y_pred))
    print("R²:", r2(Y_test, Y_pred))

# Run the pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()


Final Weights: [0.34811659 0.64614558]
RMSE: 5.2798239764188635
R²: 0.8886354462786421


In [None]:
#To-Do 11: Findings (Elaborated Answer)

'''
The model performance is acceptable because both RMSE (about 5.28 marks) and R² (about 0.89) on the test set indicate reasonable prediction accuracy. There is no evidence of overfitting since the model performs well on unseen
data (comparing these values with the same metrics on the training set would confirm this). Lower learning rates result in slow convergence, while higher learning rates may cause instability. An optimal learning rate ensures smooth convergence and minimal error.

'''

