In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Step 1: Data Understanding, Analysis, and Preparation
def load_and_prepare_data(file_path):
    """
    Load and prepare the dataset.

    Parameters:
    file_path (str): Path to the CSV dataset file.

    Returns:
    tuple: Feature and target data split into training and testing sets.
    """
    # Load the dataset
    data = pd.read_csv("/content/drive/MyDrive/Dataset/student.csv")

    # Observe the dataset
    print("Dataset Head:\n", data.head())
    print("Dataset Tail:\n", data.tail())
    print("Dataset Info:\n")
    data.info()
    print("Descriptive Stats:\n", data.describe())

    # Split into features (X) and target (Y)
    X = data[['Math', 'Reading']].values
    Y = data['Writing'].values

    # Split into training and test sets
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    return X_train, X_test, Y_train, Y_test

# Step 2: Cost Function
def cost_function(X, Y, W):
    """
    Calculates Mean Squared Error (MSE)
    """
    m = len(Y)
    predictions = np.dot(X, W)
    cost = (1 / (2 * m)) * np.sum((predictions - Y) ** 2)
    return cost

# Step 3: Gradient Descent
def gradient_descent(X, Y, W, alpha, iterations):
    """
    Perform gradient descent to optimize weights.
    """
    m = len(Y)
    cost_history = []

    for i in range(iterations):
        predictions = np.dot(X, W)
        loss = predictions - Y
        gradients = (1 / m) * np.dot(X.T, loss)
        W -= alpha * gradients

        cost = cost_function(X, Y, W)
        cost_history.append(cost)

        if i % 100 == 0:
            print(f"Iteration {i}: Cost {cost}")

    return W, cost_history

# Step 4: Evaluate the Model
def rmse(Y, Y_pred):
    return np.sqrt(np.mean((Y - Y_pred) ** 2))

def r2(Y, Y_pred):
    ss_res = np.sum((Y - Y_pred) ** 2)
    ss_tot = np.sum((Y - np.mean(Y)) ** 2)
    return 1 - (ss_res / ss_tot)

# Step 5: Main Function
def main():
    # File path to the dataset
    file_path = "/content/drive/MyDrive/Dataset/medical_students_dataset.csv"  # Update the correct file path

    # Load and prepare data
    X_train, X_test, Y_train, Y_test = load_and_prepare_data(file_path)

    # Add bias term (intercept) to the feature matrix
    X_train = np.c_[np.ones(X_train.shape[0]), X_train]
    X_test = np.c_[np.ones(X_test.shape[0]), X_test]

    # Initialize weights, learning rate, and iterations
    W = np.zeros(X_train.shape[1])
    alpha = 0.01
    iterations = 1000

    # Perform gradient descent
    W_optimal, cost_history = gradient_descent(X_train, Y_train, W, alpha, iterations)

    # Make predictions
    Y_pred = np.dot(X_test, W_optimal)

    # Evaluate the model
    model_rmse = rmse(Y_test, Y_pred)
    model_r2 = r2(Y_test, Y_pred)

    # Output results
    print("Optimal Weights:", W_optimal)
    print("Final Cost:", cost_history[-1])
    print("RMSE:", model_rmse)
    print("R-Squared:", model_r2)

if __name__ == "__main__":
    main()


Dataset Head:
    Math  Reading  Writing
0    48       68       63
1    62       81       72
2    79       80       78
3    76       83       79
4    59       64       62
Dataset Tail:
      Math  Reading  Writing
995    72       74       70
996    73       86       90
997    89       87       94
998    83       82       78
999    66       66       72
Dataset Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Math     1000 non-null   int64
 1   Reading  1000 non-null   int64
 2   Writing  1000 non-null   int64
dtypes: int64(3)
memory usage: 23.6 KB
Descriptive Stats:
               Math      Reading      Writing
count  1000.000000  1000.000000  1000.000000
mean     67.290000    69.872000    68.616000
std      15.085008    14.657027    15.241287
min      13.000000    19.000000    14.000000
25%      58.000000    60.750000    58.000000
50%      68.00000

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  cost = (1 / (2 * m)) * np.sum((predictions - Y) ** 2)
  W -= alpha * gradients
