In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

file_path='student.csv'

### Data Understanding, Analysis and Preparations:
#### To-Do-1
#### 1. Read and Observe the Dataset.
#### 2. Print top(5) and bottom(5) of the dataset {Hint: pd.head and pd.tail}.


In [31]:
student_data = pd.read_csv(file_path)
student_data.head()

Unnamed: 0,Math,Reading,Writing
0,48,68,63
1,62,81,72
2,79,80,78
3,76,83,79
4,59,64,62


In [32]:
student_data.tail()

Unnamed: 0,Math,Reading,Writing
995,72,74,70
996,73,86,90
997,89,87,94
998,83,82,78
999,66,66,72


#### 3. Print the Information of Datasets. {Hint: pd.info}.

In [33]:
student_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Math     1000 non-null   int64
 1   Reading  1000 non-null   int64
 2   Writing  1000 non-null   int64
dtypes: int64(3)
memory usage: 23.6 KB


#### 4. Gather the Descriptive info about the Dataset. {Hint: pd.describe}

In [34]:
student_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Math,1000.0,67.29,15.085008,13.0,58.0,68.0,78.0,100.0
Reading,1000.0,69.872,14.657027,19.0,60.75,70.0,81.0,100.0
Writing,1000.0,68.616,15.241287,14.0,58.0,69.5,79.0,100.0


#### 5. Split your data into Feature (X) and Label (Y).

In [35]:
# Split data into features (X) and target (Y)
X = student_data[['Math', 'Reading']].values  # Features: Math and Reading marks
Y = student_data['Writing'].values            # Target: Writing marks

# Split data into training and testing sets (80-20 split)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

print("\nData split completed.")



Data split completed.


### To-Do-2:
#### 1.To make the task easier - let’s assume there is no bias or intercept.
#### 2.Create the following matrices:

In [36]:
import numpy as np

# Example dataset
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])  # Features
Y = np.array([1, 0, 1, 0, 1])  # Target

# Define the manual train-test split function
def train_test_split_manual(X, Y, test_size=0.3):
    n = len(X)
    split_index = int(n * (1 - test_size))
    indices = np.random.permutation(n)
    X_train, X_test = X[indices[:split_index]], X[indices[split_index:]]
    Y_train, Y_test = Y[indices[:split_index]], Y[indices[split_index:]]
    return X_train, X_test, Y_train, Y_test

# Call the function
X_train, X_test, Y_train, Y_test = train_test_split_manual(X, Y)

print("X_train:", X_train)
print("X_test:", X_test)
print("Y_train:", Y_train)
print("Y_test:", Y_test)


X_train: [[3 4]
 [7 8]
 [1 2]]
X_test: [[ 9 10]
 [ 5  6]]
Y_train: [0 0 1]
Y_test: [1 1]


In [37]:
import numpy as np

def cost_function(X, Y, W):
    """
    Calculate the Mean Squared Error (MSE) cost function.

    Parameters:
        X (numpy.ndarray): Feature matrix of shape (m, n), where m is the number of samples and n is the number of features.
        Y (numpy.ndarray): Target matrix of shape (m, 1).
        W (numpy.ndarray): Weight matrix of shape (n, 1).

    Returns:
        float: The mean squared error cost.
    """
    # Ensure inputs are numpy arrays
    X = np.array(X)
    Y = np.array(Y)
    W = np.array(W)

    # Validate input dimensions
    if X.shape[0] != Y.shape[0]:
        raise ValueError("The number of rows in X and Y must match.")
    if X.shape[1] != W.shape[0]:
        raise ValueError("The number of columns in X must match the number of rows in W.")
    
    m = len(Y)  # Number of samples
    Y_pred = np.dot(X, W)  # Predicted values
    cost = (1 / (2 * m)) * np.sum((Y_pred - Y) ** 2)  # Mean squared error
    return cost


In [38]:
X_test = np.array([[1, 2], [3, 4], [5, 6]])
Y_test = np.array([3, 7, 11])
W_test = np.array([1, 1])
cost = cost_function(X_test, Y_test, W_test)
if cost == 0:
  print("Proceed Further")
else:
  print("something went wrong: Reimplement a cost function")
print("Cost function output:", cost_function(X_test, Y_test, W_test))

Proceed Further
Cost function output: 0.0


In [46]:
import numpy as np

def cost_function(X, Y, W):
    """
    Compute the cost function (Mean Squared Error) for linear regression.

    Parameters:
    X (numpy.ndarray): Feature matrix (m x n).
    Y (numpy.ndarray): Target vector (m x 1).
    W (numpy.ndarray): Parameters (n x 1).

    Returns:
    float: The computed cost.
    """
    m = len(Y)
    Y_pred = np.dot(X, W)
    cost = (1 / (2 * m)) * np.sum((Y_pred - Y) ** 2)
    return cost

def gradient_descent(X, Y, W, alpha, iterations):
    """
    Perform gradient descent to optimize the parameters of a linear regression model.

    Parameters:
    X (numpy.ndarray): Feature matrix (m x n).
    Y (numpy.ndarray): Target vector (m x 1).
    W (numpy.ndarray): Initial guess for parameters (n x 1).
    alpha (float): Learning rate.
    iterations (int): Number of iterations for gradient descent.

    Returns:
    tuple: A tuple containing the final optimized parameters (W_update) and the history of cost values:
           - W_update (numpy.ndarray): Updated parameters (n x 1).
           - cost_history (list): History of cost values over iterations.
    """
    m = len(Y)
    cost_history = []

    for _ in range(iterations):
        Y_pred = np.dot(X, W)
        loss = Y_pred - Y
        gradient = (1 / m) * np.dot(X.T, loss)
        W -= alpha * gradient
        cost_history.append(cost_function(X, Y, W))

    return W, cost_history


In [40]:
# Generate random test data
np.random.seed(0) # For reproducibility
X = np.random.rand(100, 3) # 100 samples, 3 features
Y = np.random.rand(100)
W = np.random.rand(3) # Initial guess for parameters
# Set hyperparameters
alpha = 0.01
iterations = 1000
# Test the gradient_descent function
final_params, cost_history = gradient_descent(X, Y, W, alpha, iterations)
# Print the final parameters and cost history
print("Final Parameters:", final_params)
print("Cost History:", cost_history)

Final Parameters: [0.20551667 0.54295081 0.10388027]
Cost History: [np.float64(0.10711197094660153), np.float64(0.10634880599939901), np.float64(0.10559826315680618), np.float64(0.10486012948320558), np.float64(0.1041341956428534), np.float64(0.10342025583900626), np.float64(0.1027181077540776), np.float64(0.1020275524908062), np.float64(0.10134839451441931), np.float64(0.1006804415957737), np.float64(0.1000235047554587), np.float64(0.09937739820884377), np.float64(0.09874193931205609), np.float64(0.09811694850887098), np.float64(0.09750224927850094), np.float64(0.0968976680842672), np.float64(0.09630303432313951), np.float64(0.09571818027612913), np.float64(0.09514294105952065), np.float64(0.09457715457692842), np.float64(0.09402066147216397), np.float64(0.09347330508290017), np.float64(0.09293493139511913), np.float64(0.09240538899833017), np.float64(0.09188452904154543), np.float64(0.0913722051899995), np.float64(0.09086827358260123), np.float64(0.09037259279010502), np.float64(0.08

In [47]:
def rmse(Y, Y_pred):
    return np.sqrt(np.mean((Y - Y_pred) ** 2))

In [48]:
def r2(Y, Y_pred):
    ss_res = np.sum((Y - Y_pred) ** 2)
    ss_tot = np.sum((Y - np.mean(Y)) ** 2)
    return 1 - (ss_res / ss_tot)

In [43]:
def main():
    # Add a bias term (intercept) to the feature matrix
    X_train_with_bias = np.c_[np.ones((X_train.shape[0], 1)), X_train]
    X_test_with_bias = np.c_[np.ones((X_test.shape[0], 1)), X_test]

    # Initialize weights, learning rate, and iterations
    W = np.zeros(X_train_with_bias.shape[1])
    alpha = 0.0001
    iterations = 1000

    # Train model using gradient descent
    W_optimal, cost_history = gradient_descent(X_train_with_bias, Y_train, W, alpha, iterations)

    # Make predictions
    Y_pred_train = np.dot(X_train_with_bias, W_optimal)
    Y_pred_test = np.dot(X_test_with_bias, W_optimal)

    # Evaluate the model
    train_rmse = rmse(Y_train, Y_pred_train)
    test_rmse = rmse(Y_test, Y_pred_test)
    train_r2 = r2(Y_train, Y_pred_train)
    test_r2 = r2(Y_test, Y_pred_test)

    # Print results
    print("Final Weights (including bias):", W_optimal)
    print("Train RMSE:", train_rmse)
    print("Test RMSE:", test_rmse)
    print("Train R²:", train_r2)
    print("Test R²:", test_r2)


In [44]:
if __name__ == "__main__":
    main()

Final Weights (including bias): [ 0.02535061 -0.00487278  0.02047783]
Train RMSE: 0.551825669498326
Test RMSE: 7.629707415731344
Train R²: -0.3703020628277407
Test R²: -4.457415804656176
