In [24]:
import numpy as np
import glob
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge

### Q1>1. Linear Regression 

In [2]:
# Helper function to apply Linear Regression
def LinearRegressionFunc(x_train, y_train, x_test, y_test):
    # Create a Linear Regression model
    model = LinearRegression()

    # Fit the model on the training data
    model.fit(x_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(x_test)

    # Calculate RMSE, MAE, and R2 score
    rmse = np.sqrt(mean_squared_error(y_test, y_pred)) # Root Mean Square Error
    mae = mean_absolute_error(y_test, y_pred) # Mean Absolute Error
    r2 = r2_score(y_test, y_pred) # R2 Score
    return rmse, mae, r2

# Example usage:
# rmse, mae, r2 = LinearRegressionFunc(x_train, y_train, x_test, y_test)

In [3]:
#global map for storing results of each dataset separately
gl_map = {}  # Key type = {dataset,degree,alpha}

#All datasets used:
folders = ["diabetes-5-fold","machineCPU-5-fold","mortgage-5-fold","plastic-5-fold","stock-5-fold"]

#Loop structure 

# loop-1(dataset)
#            |-> loop-2(files)

#Double nested loops 
# 1. On folder/directory
        # 2.On each folder's five files (train and test)

#loop-1
for folder in folders:
    print(folder)
    # Define the file pattern
    file_pattern = "*tra.dat"  # Matches files with the pattern diabetes-5-*tra.dat
    # Use glob to find files that match the pattern
    training_files = glob.glob("./"+folder+"/"+file_pattern)
    #same for test files
    file_pattern = "*tst.dat"  # Matches files with the pattern diabetes-5-*-tra.dat
    testing_files = glob.glob("./"+folder+"/"+file_pattern)
    
    trmse = 0 #total of rmse
    tmae = 0 #total of mae
    tr2 = 0 #total of r2 score
    #loop-2
    for train_file,test_file in zip(training_files, testing_files): #zip reads both at the same time
        # delimiter = comma (as .dat file) and comment the lines start with '@' (as only info of file)  
        df = pd.read_csv(train_file,delimiter=',', header=None, comment='@')
        df_test = pd.read_csv(test_file, delimiter=',', header=None, comment='@')
        x_train = df.iloc[:,:-1]
        y_train = df.iloc[:,-1]
        x_test = df_test.iloc[:,:-1]
        y_test = df_test.iloc[:,-1]
        # Helper Linear Regression Function
        rmse, mae, r2 = LinearRegressionFunc(x_train,y_train,x_test,y_test)
        # print(r2)
        trmse+=rmse
        tmae+=mae
        tr2+=r2
    # Taking average of all 5 datasets for a particular folder
    trmse/=5
    tmae/=5
    tr2/=5
    # Putting the value in the map
    gl_map[(folder,1,0)] = (tmae,trmse,tr2) #dataset=current one, degree=1 (Linear Regression) and alpha = 0
    # alpha = hyperparameter
    print(f"RMSE : {trmse}\nMAE : {tmae}\nr2_score : {tr2}\n") 

diabetes-5-fold
RMSE : 0.6275034933741999
MAE : 0.494077207332955
r2_score : -0.02270533734305826

machineCPU-5-fold
RMSE : 63.38092966076756
MAE : 40.08550007555207
r2_score : 0.827429222855281

mortgage-5-fold
RMSE : 0.12113615984788388
MAE : 0.08349366703499722
r2_score : 0.9984080525234482

plastic-5-fold
RMSE : 1.530470905327664
MAE : 1.2324659378443346
r2_score : 0.798323981892727

stock-5-fold
RMSE : 2.347664630867212
MAE : 1.838093466586368
r2_score : 0.870132731760774



### Q1>2. Polynomial Regression   

In [4]:
# Helper function to apply Polynomial Regression and return mae, rmse, r2
def PolynomialRegressionFunc(x_train, y_train, x_test, y_test, degree):
    # Create polynomial features
    poly = PolynomialFeatures(degree=degree)
    x_train_poly = poly.fit_transform(x_train)
    x_test_poly = poly.transform(x_test)

    # Applying Linear Regression on updated features after Polynomial Features fitting 
    # Create a Linear Regression model
    model = LinearRegression()

    # Fit the model on the polynomial features
    model.fit(x_train_poly, y_train)

    # Make predictions on the test set
    y_pred = model.predict(x_test_poly)

    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    return mae, rmse, r2

# Example usage:
# mae, rmse, r2 = PolynomialRegressionFunc(x_train, y_train, x_test, y_test, degree)

In [5]:
#Taking all datasets in a folders for iterating
folders = ["diabetes-5-fold","machineCPU-5-fold","mortgage-5-fold","plastic-5-fold","stock-5-fold"]

#Loop structure 

# loop-1(dataset)
#            |-> loop-2(degree)
#                         |-> loop-3(files)

# loop-1 on datasets
for folder in folders:
    print(folder)
    # File pattern matching
    # Define the file pattern
    file_pattern = "*tra.dat"  # Matches files with the pattern diabetes-5-*tra.dat
    # Use glob to find files that match the pattern
    training_files = glob.glob("./"+folder+"/"+file_pattern)
    file_pattern = "*tst.dat"  # Matches files with the pattern diabetes-5-*-tra.dat
    testing_files = glob.glob("./"+folder+"/"+file_pattern)
    
    # loop-2 on degree
    degree = [1,2,3]
    for d in degree:
        trmse = 0 
        tmae = 0
        tr2 = 0
        # loop-3 on training and testing files:
        for train_file,test_file in zip(training_files, testing_files):
            #reading train and test files as dataframes
            df = pd.read_csv(train_file,delimiter=',', header=None, comment='@')
            df_test = pd.read_csv(test_file, delimiter=',', header=None, comment='@')
            #first n-1 as features and last one as target 
            x_train = df.iloc[:,:-1]
            y_train = df.iloc[:,-1]
            x_test = df_test.iloc[:,:-1]
            y_test = df_test.iloc[:,-1]
            mae, rmse, r2 = PolynomialRegressionFunc(x_train,y_train,x_test,y_test,d)
            #Taking sum for a particular folder
            trmse+=rmse
            tmae+=mae
            tr2+=r2
        #Taking average for a particular folder
        trmse/=5
        tmae/=5
        tr2/=5
        print(f"degree : {d}")
        # print(f"{folder}:\n")
        gl_map[(folder,d,0)] = (tmae,trmse,tr2)
        print(f"RMSE : {trmse}\nMAE : {tmae}\nr2_score : {tr2}\n") 
    #Used as separator
    print("+++++++++++++++++++++++++++++++++++++++++++++++\n")

diabetes-5-fold
degree : 1
RMSE : 0.6275034933742001
MAE : 0.4940772073329551
r2_score : -0.02270533734305884

degree : 2
RMSE : 0.5472957597225531
MAE : 0.45727247144417477
r2_score : 0.23037469891948786

degree : 3
RMSE : 1.0181883870263977
MAE : 0.7174549374896216
r2_score : -2.3483537333744224

+++++++++++++++++++++++++++++++++++++++++++++++

machineCPU-5-fold
degree : 1
RMSE : 63.38092966076749
MAE : 40.08550007555239
r2_score : 0.827429222855281

degree : 2
RMSE : 111.6321957099212
MAE : 56.47223787750264
r2_score : 0.4169487114312546

degree : 3
RMSE : 425.71229045611517
MAE : 138.90750914948543
r2_score : -9.570435624190653

+++++++++++++++++++++++++++++++++++++++++++++++

mortgage-5-fold
degree : 1
RMSE : 0.12113615984788387
MAE : 0.08349366703499836
r2_score : 0.9984080525234482

degree : 2
RMSE : 0.10820761705200665
MAE : 0.05575064547471574
r2_score : 0.9985365673712003

degree : 3
RMSE : 2.307535884390398
MAE : 0.499848937630815
r2_score : 0.1416365652878219

+++++++++++++

### Q1>3. Ridge Regularization

#### Alpha 
###### very high => Underfitting
###### medium => Perfect 
###### Very low => Overfitting

In [35]:
# Helper function to apply Polynomial Ridge Regularization and return rmse, mae, r2:
def PolynomialRidgeRegressionMetrics(x_train, y_train, x_test, y_test, degree, alpha):
    # Create polynomial features
    poly = PolynomialFeatures(degree=degree)
    x_train_poly = poly.fit_transform(x_train)
    x_test_poly = poly.transform(x_test)

    
    # Create a Ridge Regression model
    model = Ridge(alpha=alpha)

    # Fit the model on the polynomial features
    model.fit(x_train_poly, y_train)

    # Make predictions on the test set
    y_pred = model.predict(x_test_poly)
    
    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    return rmse, mae, r2

# Example usage:
# mae, rmse, r2 = PolynomialRegressionMetrics(x_train, y_train, x_test, y_test, degree)

In [36]:
import warnings
warnings.filterwarnings("ignore")
#Loop structure 

# loop-1(dataset)
#            |-> loop-2(degree)
#                         |-> loop-3(alpha)
#                                     |-> loop-4(trainging and testing files)
ans_map = {}
folders = ["diabetes-5-fold","machineCPU-5-fold","mortgage-5-fold","plastic-5-fold","stock-5-fold"]
#loop-1 (datasets)
for folder in folders:
    print(folder)
    # Define the file pattern
    file_pattern = "*tra.dat"  # Matches files with the pattern diabetes-5-*tra.dat
    # Use glob to find files that match the pattern
    training_files = glob.glob("./"+folder+"/"+file_pattern)
    file_pattern = "*tst.dat"  # Matches files with the pattern diabetes-5-*-tra.dat
    testing_files = glob.glob("./"+folder+"/"+file_pattern)
    
    #Storing alpha values from 2^(-18),2^(-16),2^(-14), .... , 2^(50)
    alpha_values = [2**i for i in range(-18,51,2)]
    degree = [2,3]
    #loop-2 (degree)
    for d in degree:
        gl_map = {}
        #loop-3 (alpha value)
        for alpha in alpha_values:
            trmse = 0 
            tmae = 0
            tr2 = 0
            #loop-4 (training and testing files in datasets)
            for train_file,test_file in zip(training_files, testing_files):
                df = pd.read_csv(train_file,delimiter=',', header=None, comment='@')
                df_test = pd.read_csv(test_file, delimiter=',', header=None, comment='@')
                # print(df)
                x_train = df.iloc[:,:-1]
                y_train = df.iloc[:,-1]
                x_test = df_test.iloc[:,:-1]
                y_test = df_test.iloc[:,-1]
                rmse, mae, r2 = PolynomialRidgeRegressionMetrics(x_train,y_train,x_test,y_test,d,alpha)
                # print(r2)
                trmse+=rmse
                tmae+=mae
                tr2+=r2
            trmse/=5
            tmae/=5
            tr2/=5
            ans_map[(folder,d,alpha)] = (trmse,tmae,tr2)

diabetes-5-fold
machineCPU-5-fold
mortgage-5-fold
plastic-5-fold
stock-5-fold


In [37]:
# Printing answer from the previous ans_map
for key, values in ans_map.items():
    datasets, degree, alpha = key
    rmse, mae, r2 = values
    print(f"Degree: {degree:.4f}, Alpha: {alpha:.8f}, RMSE: {rmse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")


Degree: 2.0000, Alpha: 0.00000381, RMSE: 0.5473, MAE: 0.4573, R2: 0.2304
Degree: 2.0000, Alpha: 0.00001526, RMSE: 0.5473, MAE: 0.4573, R2: 0.2304
Degree: 2.0000, Alpha: 0.00006104, RMSE: 0.5473, MAE: 0.4573, R2: 0.2304
Degree: 2.0000, Alpha: 0.00024414, RMSE: 0.5473, MAE: 0.4573, R2: 0.2304
Degree: 2.0000, Alpha: 0.00097656, RMSE: 0.5473, MAE: 0.4573, R2: 0.2304
Degree: 2.0000, Alpha: 0.00390625, RMSE: 0.5473, MAE: 0.4573, R2: 0.2304
Degree: 2.0000, Alpha: 0.01562500, RMSE: 0.5472, MAE: 0.4572, R2: 0.2305
Degree: 2.0000, Alpha: 0.06250000, RMSE: 0.5470, MAE: 0.4571, R2: 0.2308
Degree: 2.0000, Alpha: 0.25000000, RMSE: 0.5461, MAE: 0.4566, R2: 0.2320
Degree: 2.0000, Alpha: 1.00000000, RMSE: 0.5438, MAE: 0.4547, R2: 0.2349
Degree: 2.0000, Alpha: 4.00000000, RMSE: 0.5434, MAE: 0.4498, R2: 0.2302
Degree: 2.0000, Alpha: 16.00000000, RMSE: 0.5636, MAE: 0.4565, R2: 0.1821
Degree: 2.0000, Alpha: 64.00000000, RMSE: 0.6092, MAE: 0.4929, R2: 0.0691
Degree: 2.0000, Alpha: 256.00000000, RMSE: 0.6721

### Q2 Gradient Decent

In [25]:
# Data (Creating data)
# Reshape your data using array.reshape(-1, 1) if your data has a single feature
X = np.array([1, 2, 3, 4, 5]).reshape(-1, 1) # Converting to column form with one column
Y = np.array([3, 4, 2, 5, 7])

# Planning :
# Consider X now as matrix (We also added column 1 below before calling function)
# Dimension of X = Number of data points * 2 (Reason: y = (w1)x + w0)
# We are maintaining w1 and w0 in theta 

# Gradient Descent Function
def gradient_descent(X, Y, learning_rate, epochs, alpha=None):
    m, n = X.shape
    # Making 
    theta = np.zeros((n, 1))
    cost_history = [] # Maintaining cost function value for all iterations (epochs)

    for _ in range(epochs):
        # Hypothesis for calculating Y (Matrix way) Y = X.theta
        h = X.dot(theta)

        # Error = predicted values - true value 
        error = h - Y # sum of rediduals

        # Regularization term for ridge regression 
        # if nothing is passed then default value as 0
        regularization_term = 0 if alpha is None else alpha * np.sum(theta[1:]**2)
        
        # Update rule : https://www.youtube.com/watch?v=6v3r9KPM2t0 (watch this to understand this)
        #    W(New)  = W(Old) - (learning_rate)*(dJ/dW) 
        #    Only needs to penalize theta from 1 to above and not 0 (Therefore np.vstack 0  then put all other values)
        theta = theta - (learning_rate / m) * (X.T.dot(error) + regularization_term * np.vstack([0, theta[1:]]))
        # X.T = Transpose of X
        
        # Cost function (mean squared error)
        cost = np.sum(error**2) / (2 * m) + regularization_term
        cost_history.append(cost)

    return theta, cost_history

# Function to calculate metrics
def calculate_metrics(X, Y, theta):
    h = X.dot(theta) # Predicted Y
    rmse = np.sqrt(mean_squared_error(Y, h))
    mae = mean_absolute_error(Y, h)
    r2 = r2_score(Y, h)
    return rmse, mae, r2


# Add a column of ones to X for the bias term (after the last column)
X_bias = np.c_[np.ones((X.shape[0], 1)), X] # Same number of rows but single column of 1
# Set hyperparameters
learning_rate = 0.01
epochs = 1000
alpha_values = [2**i for i in range(-18,10)]

# Perform gradient descent and calculate metrics for different alpha values
for alpha in alpha_values:
    theta, _ = gradient_descent(X_bias, Y.reshape(-1, 1), learning_rate, epochs, alpha)
    rmse, mae, r2 = calculate_metrics(X_bias, Y, theta)
    print(f"Alpha: {alpha}, RMSE: {rmse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")


Alpha: 3.814697265625e-06, RMSE: 1.1612, MAE: 0.9509, R2: 0.5445
Alpha: 7.62939453125e-06, RMSE: 1.1612, MAE: 0.9509, R2: 0.5445
Alpha: 1.52587890625e-05, RMSE: 1.1612, MAE: 0.9509, R2: 0.5445
Alpha: 3.0517578125e-05, RMSE: 1.1612, MAE: 0.9509, R2: 0.5445
Alpha: 6.103515625e-05, RMSE: 1.1612, MAE: 0.9509, R2: 0.5445
Alpha: 0.0001220703125, RMSE: 1.1612, MAE: 0.9509, R2: 0.5445
Alpha: 0.000244140625, RMSE: 1.1612, MAE: 0.9509, R2: 0.5445
Alpha: 0.00048828125, RMSE: 1.1612, MAE: 0.9509, R2: 0.5445
Alpha: 0.0009765625, RMSE: 1.1612, MAE: 0.9509, R2: 0.5445
Alpha: 0.001953125, RMSE: 1.1612, MAE: 0.9509, R2: 0.5445
Alpha: 0.00390625, RMSE: 1.1611, MAE: 0.9508, R2: 0.5445
Alpha: 0.0078125, RMSE: 1.1611, MAE: 0.9507, R2: 0.5445
Alpha: 0.015625, RMSE: 1.1610, MAE: 0.9504, R2: 0.5446
Alpha: 0.03125, RMSE: 1.1609, MAE: 0.9498, R2: 0.5447
Alpha: 0.0625, RMSE: 1.1607, MAE: 0.9488, R2: 0.5449
Alpha: 0.125, RMSE: 1.1602, MAE: 0.9467, R2: 0.5453
Alpha: 0.25, RMSE: 1.1594, MAE: 0.9427, R2: 0.5459
Alph

In [30]:
# Complex code example to understand their meaning 
# dot for 2D input is matrix multiplication, not a dot product. What you're seeing is just the result of the normal rules of matrix multiplication. If you want a vector dot product, the easiest way is to use 1D vectors, with no superfluous second dimension:
X = np.array([1, 2, 3])
THETA = np.array([1, 2, 3])
print(X.dot(THETA))
# dot-ting two 1D arrays takes a dot product and produces a scalar result.

# Stack arrays vertically over each other
print(np.vstack([[0,0], THETA[1:]]))

14
[[0 0]
 [2 3]]
