In [23]:
gl_map = {}

### Q1>1. Linear Regression 

In [24]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
def LinearRegressionFunc(x_train, y_train, x_test, y_test):
    # Create a Linear Regression model
    model = LinearRegression()

    # Fit the model on the training data
    model.fit(x_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(x_test)

    # Calculate RMSE, MAE, and R-squared
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return rmse, mae, r2
    # # Print the results
    # print(f'RMSE: {rmse:.4f}')
    # print(f'MAE: {mae:.4f}')
    # print(f'R-squared: {r2:.4f}')


In [25]:
import glob
import pandas as pd
gl_map = {}
folders = ["diabetes-5-fold","machineCPU-5-fold","mortgage-5-fold","plastic-5-fold","stock-5-fold"]
for folder in folders:
    print(folder)
    # Define the file pattern
    file_pattern = "*tra.dat"  # Matches files with the pattern diabetes-5-*tra.dat
    # Use glob to find files that match the pattern
    training_files = glob.glob("./"+folder+"/"+file_pattern)
    file_pattern = "*tst.dat"  # Matches files with the pattern diabetes-5-*-tra.dat
    testing_files = glob.glob("./"+folder+"/"+file_pattern)

    

    trmse = 0 
    tmae = 0
    tr2 = 0
    for train_file,test_file in zip(training_files, testing_files):
        df = pd.read_csv(train_file,delimiter=',', header=None, comment='@')
        df_test = pd.read_csv(test_file, delimiter=',', header=None, comment='@')
        # print(df)
        x_train = df.iloc[:,:-1]
        y_train = df.iloc[:,-1]
        x_test = df_test.iloc[:,:-1]
        y_test = df_test.iloc[:,-1]
        rmse, mae, r2 = LinearRegressionFunc(x_train,y_train,x_test,y_test)
        # print(r2)
        trmse+=rmse
        tmae+=mae
        tr2+=r2
    trmse/=5
    tmae/=5
    tr2/=5
    gl_map[(folder,1,0)] = (tmae,trmse,tr2)
    print(f"RMSE : {trmse}\nMAE : {tmae}\nr2_score : {tr2}\n") 

diabetes-5-fold
RMSE : 0.6275034933741999
MAE : 0.494077207332955
r2_score : -0.02270533734305826

machineCPU-5-fold
RMSE : 63.38092966076756
MAE : 40.08550007555207
r2_score : 0.827429222855281

mortgage-5-fold
RMSE : 0.12113615984788388
MAE : 0.08349366703499722
r2_score : 0.9984080525234482

plastic-5-fold
RMSE : 1.530470905327664
MAE : 1.2324659378443346
r2_score : 0.798323981892727

stock-5-fold
RMSE : 2.347664630867212
MAE : 1.838093466586368
r2_score : 0.870132731760774



### Q1>2. Polynomial Regression   

In [14]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def PolynomialRegressionFunc(x_train, y_train, x_test, y_test, degree):
    # Create polynomial features
    poly = PolynomialFeatures(degree=degree)
    x_train_poly = poly.fit_transform(x_train)
    x_test_poly = poly.transform(x_test)

    # Create a Linear Regression model
    model = LinearRegression()

    # Fit the model on the polynomial features
    model.fit(x_train_poly, y_train)

    # Make predictions on the test set
    y_pred = model.predict(x_test_poly)

    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    return mae, rmse, r2

# Example usage:
# mae, rmse, r2 = PolynomialRegressionMetrics(x_train, y_train, x_test, y_test, degree)


In [26]:
import glob
import pandas as pd

folders = ["diabetes-5-fold","machineCPU-5-fold","mortgage-5-fold","plastic-5-fold","stock-5-fold"]
for folder in folders:
    print(folder)
    # Define the file pattern
    file_pattern = "*tra.dat"  # Matches files with the pattern diabetes-5-*tra.dat
    # Use glob to find files that match the pattern
    training_files = glob.glob("./"+folder+"/"+file_pattern)
    file_pattern = "*tst.dat"  # Matches files with the pattern diabetes-5-*-tra.dat
    testing_files = glob.glob("./"+folder+"/"+file_pattern)

    degree = [1,2,3]
    for d in degree:
        trmse = 0 
        tmae = 0
        tr2 = 0
        for train_file,test_file in zip(training_files, testing_files):
            df = pd.read_csv(train_file,delimiter=',', header=None, comment='@')
            df_test = pd.read_csv(test_file, delimiter=',', header=None, comment='@')
            # print(df)
            x_train = df.iloc[:,:-1]
            y_train = df.iloc[:,-1]
            x_test = df_test.iloc[:,:-1]
            y_test = df_test.iloc[:,-1]
            rae, mae, r2 = PolynomialRegressionFunc(x_train,y_train,x_test,y_test,d)
            # print(r2)
            trmse+=rmse
            tmae+=mae
            tr2+=r2
        trmse/=5
        tmae/=5
        tr2/=5
        print(f"degree : {d}")
        # print(f"{folder}:\n")
        gl_map[(folder,d,0)] = (tmae,trmse,tr2)
        print(f"RMSE : {trmse}\nMAE : {tmae}\nr2_score : {tr2}\n") 
    print("+++++++++++++++++++++++++++++++++++++++++++++++\n")

diabetes-5-fold
degree : 1
RMSE : 2.315207329252474
MAE : 0.6275034933742001
r2_score : -0.02270533734305884

degree : 2
RMSE : 2.315207329252474
MAE : 0.5472957597225531
r2_score : 0.23037469891948786

degree : 3
RMSE : 2.315207329252474
MAE : 1.0181883870263977
r2_score : -2.3483537333744224

+++++++++++++++++++++++++++++++++++++++++++++++

machineCPU-5-fold
degree : 1
RMSE : 2.315207329252474
MAE : 63.38092966076749
r2_score : 0.827429222855281

degree : 2
RMSE : 2.315207329252474
MAE : 111.6321957099212
r2_score : 0.4169487114312546

degree : 3
RMSE : 2.315207329252474
MAE : 425.71229045611517
r2_score : -9.570435624190653

+++++++++++++++++++++++++++++++++++++++++++++++

mortgage-5-fold
degree : 1
RMSE : 2.315207329252474
MAE : 0.12113615984788387
r2_score : 0.9984080525234482

degree : 2
RMSE : 2.315207329252474
MAE : 0.10820761705200665
r2_score : 0.9985365673712003

degree : 3
RMSE : 2.315207329252474
MAE : 2.307535884390398
r2_score : 0.1416365652878219

++++++++++++++++++++++

### Q1>3. Ridge Regularization

#### Alpha 
###### very high => Underfitting
###### medium => Perfect 
###### Very low => Overfitting

In [27]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def PolynomialRidgeRegressionMetrics(x_train, y_train, x_test, y_test, degree, alpha):
    # Create polynomial features
    poly = PolynomialFeatures(degree=degree)
    x_train_poly = poly.fit_transform(x_train)
    x_test_poly = poly.transform(x_test)

    
    # Create a Ridge Regression model
    model = Ridge(alpha=alpha)

    # Fit the model on the polynomial features
    model.fit(x_train_poly, y_train)

    # Make predictions on the test set
    y_pred = model.predict(x_test_poly)

    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)


    return rmse, mae, r2


In [28]:
import glob
import pandas as pd

folders = ["diabetes-5-fold","machineCPU-5-fold","mortgage-5-fold","plastic-5-fold","stock-5-fold"]
for folder in folders:
    print(folder)
    # Define the file pattern
    file_pattern = "*tra.dat"  # Matches files with the pattern diabetes-5-*tra.dat
    # Use glob to find files that match the pattern
    training_files = glob.glob("./"+folder+"/"+file_pattern)
    file_pattern = "*tst.dat"  # Matches files with the pattern diabetes-5-*-tra.dat
    testing_files = glob.glob("./"+folder+"/"+file_pattern)

    # ans_map = {}
    alpha_values = [2**i for i in range(-18,51,2)]
    degree = [2,3]
    for d in degree:
        gl_map = {}
        # max_mae = -1, max_rmse
        for alpha in alpha_values:
            trmse = 0 
            tmae = 0
            tr2 = 0
            for train_file,test_file in zip(training_files, testing_files):
                df = pd.read_csv(train_file,delimiter=',', header=None, comment='@')
                df_test = pd.read_csv(test_file, delimiter=',', header=None, comment='@')
                # print(df)
                x_train = df.iloc[:,:-1]
                y_train = df.iloc[:,-1]
                x_test = df_test.iloc[:,:-1]
                y_test = df_test.iloc[:,-1]
                rmse, mae, r2 = PolynomialRidgeRegressionMetrics(x_train,y_train,x_test,y_test,d,alpha)
                # print(r2)
                trmse+=rmse
                tmae+=mae
                tr2+=r2
            trmse/=5
            tmae/=5
            tr2/=5
            # print(f"degree : {d} and alpha : {alpha}")
            gl_map[(folder,d,alpha)] = (trmse,tmae,tr2)
        # for k,v in gl_map:
        #     if()
            # key = (d, alpha)
            # ans_map[key] = (trmse, tmae, tr2)
            # print(f"{folder}:\n")
            # print(f"RMSE : {trmse}\nMAE : {tmae}\nr2_score : {tr2}\n") 
        # print("+++++++++++++++++++++++++++++++++++++++++++++++\n")

diabetes-5-fold
machineCPU-5-fold


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, 

mortgage-5-fold


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, 

plastic-5-fold


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, 

stock-5-fold


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, 

In [68]:
# Assuming ans_map is your dictionary
for key, values in ans_map.items():
    degree, alpha = key
    rmse, mae, r2 = values
    print(f"Degree: {degree:.4f}, Alpha: {alpha:.8f}, RMSE: {rmse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")


Degree: 1.0000, Alpha: 0.00000381, RMSE: 2.3477, MAE: 1.8381, R2: 0.8701
Degree: 1.0000, Alpha: 0.00000763, RMSE: 2.3477, MAE: 1.8381, R2: 0.8701
Degree: 1.0000, Alpha: 0.00001526, RMSE: 2.3477, MAE: 1.8381, R2: 0.8701
Degree: 1.0000, Alpha: 0.00003052, RMSE: 2.3477, MAE: 1.8381, R2: 0.8701
Degree: 1.0000, Alpha: 0.00006104, RMSE: 2.3477, MAE: 1.8381, R2: 0.8701
Degree: 1.0000, Alpha: 0.00012207, RMSE: 2.3477, MAE: 1.8381, R2: 0.8701
Degree: 1.0000, Alpha: 0.00024414, RMSE: 2.3477, MAE: 1.8381, R2: 0.8701
Degree: 1.0000, Alpha: 0.00048828, RMSE: 2.3477, MAE: 1.8381, R2: 0.8701
Degree: 1.0000, Alpha: 0.00097656, RMSE: 2.3477, MAE: 1.8381, R2: 0.8701
Degree: 1.0000, Alpha: 0.00195312, RMSE: 2.3477, MAE: 1.8381, R2: 0.8701
Degree: 1.0000, Alpha: 0.00390625, RMSE: 2.3477, MAE: 1.8381, R2: 0.8701
Degree: 1.0000, Alpha: 0.00781250, RMSE: 2.3477, MAE: 1.8381, R2: 0.8701
Degree: 1.0000, Alpha: 0.01562500, RMSE: 2.3477, MAE: 1.8381, R2: 0.8701
Degree: 1.0000, Alpha: 0.03125000, RMSE: 2.3477, MA

: 

### Q2 Gradient Decent

In [5]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Data
X = np.array([1, 2, 3, 4, 5]).reshape(-1, 1)
Y = np.array([3, 4, 2, 5, 7])

# Gradient Descent Function
def gradient_descent(X, Y, learning_rate, epochs, alpha=None):
    m, n = X.shape
    theta = np.zeros((n, 1))
    cost_history = []

    for _ in range(epochs):
        # Hypothesis
        h = X.dot(theta)

        # Error
        error = h - Y

        # Regularization term for ridge regression
        regularization_term = 0 if alpha is None else alpha * np.sum(theta[1:]**2)

        # Update rule
        theta = theta - (learning_rate / m) * (X.T.dot(error) + regularization_term * np.vstack([0, theta[1:]]))

        # Cost function (mean squared error)
        cost = np.sum(error**2) / (2 * m) + regularization_term
        cost_history.append(cost)

    return theta, cost_history

# Function to calculate metrics
def calculate_metrics(X, Y, theta):
    h = X.dot(theta)
    rmse = np.sqrt(mean_squared_error(Y, h))
    mae = mean_absolute_error(Y, h)
    r2 = r2_score(Y, h)
    return rmse, mae, r2

# Add a column of ones to X for the bias term
X_bias = np.c_[np.ones((X.shape[0], 1)), X]

# Set hyperparameters
learning_rate = 0.01
epochs = 1000
alpha_values = [2**i for i in range(-18,10)]

# Perform gradient descent and calculate metrics for different alpha values
for alpha in alpha_values:
    theta, _ = gradient_descent(X_bias, Y.reshape(-1, 1), learning_rate, epochs, alpha)
    rmse, mae, r2 = calculate_metrics(X_bias, Y, theta)
    print(f"Alpha: {alpha}, RMSE: {rmse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")


Alpha: 3.814697265625e-06, RMSE: 1.1612, MAE: 0.9509, R2: 0.5445
Alpha: 7.62939453125e-06, RMSE: 1.1612, MAE: 0.9509, R2: 0.5445
Alpha: 1.52587890625e-05, RMSE: 1.1612, MAE: 0.9509, R2: 0.5445
Alpha: 3.0517578125e-05, RMSE: 1.1612, MAE: 0.9509, R2: 0.5445
Alpha: 6.103515625e-05, RMSE: 1.1612, MAE: 0.9509, R2: 0.5445
Alpha: 0.0001220703125, RMSE: 1.1612, MAE: 0.9509, R2: 0.5445
Alpha: 0.000244140625, RMSE: 1.1612, MAE: 0.9509, R2: 0.5445
Alpha: 0.00048828125, RMSE: 1.1612, MAE: 0.9509, R2: 0.5445
Alpha: 0.0009765625, RMSE: 1.1612, MAE: 0.9509, R2: 0.5445
Alpha: 0.001953125, RMSE: 1.1612, MAE: 0.9509, R2: 0.5445
Alpha: 0.00390625, RMSE: 1.1611, MAE: 0.9508, R2: 0.5445
Alpha: 0.0078125, RMSE: 1.1611, MAE: 0.9507, R2: 0.5445
Alpha: 0.015625, RMSE: 1.1610, MAE: 0.9504, R2: 0.5446
Alpha: 0.03125, RMSE: 1.1609, MAE: 0.9498, R2: 0.5447
Alpha: 0.0625, RMSE: 1.1607, MAE: 0.9488, R2: 0.5449
Alpha: 0.125, RMSE: 1.1602, MAE: 0.9467, R2: 0.5453
Alpha: 0.25, RMSE: 1.1594, MAE: 0.9427, R2: 0.5459
Alph