# Linear regression order 1 without regularization

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

folders = ["diabetes", "machineCPU", "mortgage", "plastic", "stock"]
noofrows = [7, 11, 20, 7, 12]
datasets = ['Dataset-1', 'Dataset-2', 'Dataset-3', 'Dataset-4', 'Dataset-5']


def linear_regression(folders, noofrows):
    for folder, rowskipped in zip(folders, noofrows):
        RMSE, MAE, R2 = [], [], []
        for j in range(1, 6):
            train_path = str(folder) + "/" + str(folder) + "-5-" + str(j) + "tra.dat"
            test_path = str(folder) + "/" + str(folder) + "-5-" + str(j) + "tst.dat"

            train_data = pd.read_csv(train_path, skiprows=rowskipped)
            test_data = pd.read_csv(test_path, skiprows=rowskipped)
            X_train = train_data.iloc[:, :-1].values
            y_train = train_data.iloc[:, :-1].values
            X_test = test_data.iloc[:, :-1].values
            y_test = test_data.iloc[:, :-1].values
            model = LinearRegression()
            model.fit(X_train, y_train)

            y_pred = model.predict(X_test)

            rmse = mean_squared_error(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            RMSE.append(rmse)
            MAE.append(mae)
            R2.append(r2)
        print(folder + ":")
        data = {
            'Datasets': datasets,
            'RMSE Score': RMSE,
            'MAE Score': MAE,
            'R2 Score': R2
        }
        df = pd.DataFrame(data)
        print(df)
        print("Average RMSE:", np.mean(RMSE))
        print("Average MAE:", np.mean(MAE))
        print("Average R-squared:", np.mean(R2))
        print("\n")


linear_regression(folders, noofrows)



diabetes:
    Datasets    RMSE Score     MAE Score  R2 Score
0  Dataset-1  7.028152e-30  2.315856e-15       1.0
1  Dataset-2  9.051100e-30  2.171874e-15       1.0
2  Dataset-3  1.309632e-30  5.689893e-16       1.0
3  Dataset-4  1.311833e-30  8.405974e-16       1.0
4  Dataset-5  1.028337e-30  7.295751e-16       1.0
Average RMSE: 3.945810878454419e-30
Average MAE: 1.3253782991741655e-15
Average R-squared: 1.0


machineCPU:
    Datasets    RMSE Score     MAE Score  R2 Score
0  Dataset-1  6.904320e-24  1.236501e-12       1.0
1  Dataset-2  2.884478e-24  5.842555e-13       1.0
2  Dataset-3  8.329149e-23  2.854992e-12       1.0
3  Dataset-4  7.805198e-24  1.092840e-12       1.0
4  Dataset-5  4.744299e-25  2.864844e-13       1.0
Average RMSE: 2.0271983921025697e-23
Average MAE: 1.2110144578071918e-12
Average R-squared: 1.0


mortgage:
    Datasets    RMSE Score     MAE Score  R2 Score
0  Dataset-1  2.638186e-25  2.615156e-13       1.0
1  Dataset-2  1.790379e-25  2.224830e-13       1.0
2  Datas

# Linear regression order 2 without regularization

In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

folders = ["diabetes", "machineCPU", "mortgage", "plastic", "stock"]
noofrows = [7, 11, 20, 7, 12]
datasets = ['Dataset-1', 'Dataset-2', 'Dataset-3', 'Dataset-4', 'Dataset-5']

def linear_regression(folders, noofrows):
    for folder, rowskipped in zip(folders, noofrows):
        RMSE, MAE, R2 = [], [], []
        for j in range(1, 6):
            train_path = str(folder) + "/" + str(folder) + "-5-" + str(j) + "tra.dat"
            test_path = str(folder) + "/" + str(folder) + "-5-" + str(j) + "tst.dat"

            train_data = pd.read_csv(train_path, skiprows=rowskipped)
            test_data = pd.read_csv(test_path, skiprows=rowskipped)
            X_train = train_data.iloc[:, :-1].values
            y_train = train_data.iloc[:, :-1].values
            X_test = test_data.iloc[:, :-1].values
            y_test = test_data.iloc[:, :-1].values

            poly = PolynomialFeatures(degree=2)
            X_train_poly2 = poly.fit_transform(X_train)
            X_test_poly2 = poly.transform(X_test)

            model_poly2 = LinearRegression()
            model_poly2.fit(X_train_poly2, y_train)

            y_pred = model_poly2.predict(X_test_poly2)

            rmse = mean_squared_error(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            RMSE.append(rmse)
            MAE.append(mae)
            R2.append(r2)
        print(folder + ":")
        data = {
            'Datasets': datasets,
            'RMSE Score': RMSE,
            'MAE Score': MAE,
            'R2 Score': R2
        }
        df = pd.DataFrame(data)
        print(df)
        print("Average RMSE:", np.mean(RMSE))
        print("Average MAE:", np.mean(MAE))
        print("Average R-squared:", np.mean(R2))
        print("\n")


linear_regression(folders, noofrows)



diabetes:
    Datasets    RMSE Score     MAE Score  R2 Score
0  Dataset-1  8.488079e-29  7.540843e-15       1.0
1  Dataset-2  5.593517e-29  6.772360e-15       1.0
2  Dataset-3  1.644659e-28  1.067896e-14       1.0
3  Dataset-4  1.110542e-28  8.651809e-15       1.0
4  Dataset-5  9.331450e-29  7.438494e-15       1.0
Average RMSE: 1.019301160638794e-28
Average MAE: 8.216492962200205e-15
Average R-squared: 1.0


machineCPU:
    Datasets    RMSE Score     MAE Score  R2 Score
0  Dataset-1  3.961015e-15  2.490453e-08       1.0
1  Dataset-2  1.932732e-15  1.590818e-08       1.0
2  Dataset-3  2.796799e-15  1.703457e-08       1.0
3  Dataset-4  1.562729e-15  1.097619e-08       1.0
4  Dataset-5  1.002618e-14  3.137066e-08       1.0
Average RMSE: 4.0558916831652005e-15
Average MAE: 2.0038827090044624e-08
Average R-squared: 1.0


mortgage:
    Datasets    RMSE Score     MAE Score  R2 Score
0  Dataset-1  1.185570e-18  5.333938e-10       1.0
1  Dataset-2  2.988724e-18  5.017740e-10       1.0
2  Datase

# Linear regression order 3 without regularization

In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

folders = ["diabetes", "machineCPU", "mortgage", "plastic", "stock"]
noofrows = [7, 11, 20, 7, 12]
datasets = ['Dataset-1', 'Dataset-2', 'Dataset-3', 'Dataset-4', 'Dataset-5']

def linear_regression(folders, noofrows):
    for folder, rowskipped in zip(folders, noofrows):
        RMSE, MAE, R2 = [], [], []
        for j in range(1, 6):
            train_path = str(folder) + "/" + str(folder) + "-5-" + str(j) + "tra.dat"
            test_path = str(folder) + "/" + str(folder) + "-5-" + str(j) + "tst.dat"

            train_data = pd.read_csv(train_path, skiprows=rowskipped)
            test_data = pd.read_csv(test_path, skiprows=rowskipped)
            X_train = train_data.iloc[:, :-1].values
            y_train = train_data.iloc[:, :-1].values
            X_test = test_data.iloc[:, :-1].values
            y_test = test_data.iloc[:, :-1].values

            poly = PolynomialFeatures(degree=3)
            X_train_poly2 = poly.fit_transform(X_train)
            X_test_poly2 = poly.transform(X_test)

            model_poly2 = LinearRegression()
            model_poly2.fit(X_train_poly2, y_train)

            y_pred = model_poly2.predict(X_test_poly2)

            rmse = mean_squared_error(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            RMSE.append(rmse)
            MAE.append(mae)
            R2.append(r2)
        print(folder + ":")
        data = {
            'Datasets': datasets,
            'RMSE Score': RMSE,
            'MAE Score': MAE,
            'R2 Score': R2
        }
        df = pd.DataFrame(data)
        print(df)
        print("Average RMSE:", np.mean(RMSE))
        print("Average MAE:", np.mean(MAE))
        print("Average R-squared:", np.mean(R2))
        print("\n")


linear_regression(folders, noofrows)



diabetes:
    Datasets    RMSE Score     MAE Score  R2 Score
0  Dataset-1  7.750031e-26  2.279184e-13       1.0
1  Dataset-2  8.655339e-27  6.483702e-14       1.0
2  Dataset-3  4.428990e-26  1.450229e-13       1.0
3  Dataset-4  1.360410e-25  2.354307e-13       1.0
4  Dataset-5  8.835161e-27  8.877026e-14       1.0
Average RMSE: 5.506434280098845e-26
Average MAE: 1.5239585387278558e-13
Average R-squared: 1.0


machineCPU:
    Datasets  RMSE Score  MAE Score  R2 Score
0  Dataset-1    0.006017   0.026402  0.999978
1  Dataset-2    0.038609   0.053826  0.999985
2  Dataset-3    0.042438   0.047974  0.999930
3  Dataset-4    0.999449   0.218216  0.995077
4  Dataset-5    0.038803   0.052551  0.999864
Average RMSE: 0.22506323809772844
Average MAE: 0.07979383904720586
Average R-squared: 0.9989667392612397


mortgage:
    Datasets  RMSE Score  MAE Score  R2 Score
0  Dataset-1    0.000022   0.001748  1.000000
1  Dataset-2    0.000850   0.003388  0.999999
2  Dataset-3    0.000040   0.001776  1.00000

# Regression with Ridge Regularization

In [4]:
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings('ignore')
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score



folders = ["diabetes", "machineCPU", "mortgage", "plastic", "stock"]
noofrows = [7, 11, 20, 7, 12]
datasets = ['Dataset-1', 'Dataset-2', 'Dataset-3', 'Dataset-4', 'Dataset-5']

def linear_regression(folders, noofrows):
    for folder, rowskipped in zip(folders, noofrows):
        RMSE, MAE, R2 = [], [], []
        best_c_value_rmse = float('inf')
        best_d_value_rmse = None
        best_rmse = float('inf')
        best_c_value_mae = float('inf')
        best_d_value_mae = None
        best_mae = float('inf')
        best_c_value_r2 = float('inf')
        best_d_value_r2 = None
        best_r2 = float('-inf')
        for j in range(1, 6):
            train_path = str(folder) + "/" + str(folder) + "-5-" + str(j) + "tra.dat"
            test_path = str(folder) + "/" + str(folder) + "-5-" + str(j) + "tst.dat"
            train_data = pd.read_csv(train_path, skiprows=rowskipped)
            test_data = pd.read_csv(test_path, skiprows=rowskipped)
            
            X_train = train_data.iloc[:, :-1].values
            y_train = train_data.iloc[:, -1].values
            
            X_test = test_data.iloc[:, :-1].values
            y_test = test_data.iloc[:, -1].values
 
            for d in (1, 3):
                 poly = PolynomialFeatures(degree=d)
                 X_train_poly2 = poly.fit_transform(X_train)
                 X_test_poly2 = poly.transform(X_test)
                 alphas = [2 ** i for i in range(-18, 51, 2)]
                 
                 for alpha in alphas:
                     model_ridge = Ridge(alpha=alpha)
                     model_ridge.fit(X_train_poly2, y_train)
                     y_pred = model_ridge.predict(X_test_poly2)
                     rmse_ridge = mean_squared_error(y_test, y_pred)
                     mae_ridge = mean_absolute_error(y_test, y_pred)
                     r2_ridge = r2_score(y_test, y_pred)
                     
                     if rmse_ridge < best_rmse:
                         best_rmse = rmse_ridge
                         best_c_value_rmse = alpha
                         best_d_value_rmse = d
                     
                     if mae_ridge < best_mae:
                         best_mae = mae_ridge
                         best_c_value_mae = alpha
                         best_d_value_mae = d
                     
                     if r2_ridge > best_r2:
                         best_r2 = r2_ridge
                         best_c_value_r2 = alpha
                         best_d_value_r2 = d
 
            print(folder + ":")
            print("Best RMSE: ", best_rmse, " c:", best_c_value_rmse, "d: ", best_d_value_rmse)
            print("Best MAE: ", best_mae, " c:", best_c_value_mae, "d: ", best_d_value_mae)
            print("Best R2: ", best_r2, " c:", best_c_value_r2, "d: ", best_d_value_r2)


linear_regression(folders, noofrows)

diabetes:
Best RMSE:  0.3179790019204956  c: 16384 d:  1
Best MAE:  0.4695362724851896  c: 16384 d:  1
Best R2:  -0.02781091529857238  c: 16384 d:  1
diabetes:
Best RMSE:  0.19539032882408916  c: 16 d:  3
Best MAE:  0.3750027441972923  c: 4 d:  3
Best R2:  0.26914196114893607  c: 16 d:  3
diabetes:
Best RMSE:  0.19539032882408916  c: 16 d:  3
Best MAE:  0.3750027441972923  c: 4 d:  3
Best R2:  0.46046171463515106  c: 3.814697265625e-06 d:  1
diabetes:
Best RMSE:  0.19539032882408916  c: 16 d:  3
Best MAE:  0.3750027441972923  c: 4 d:  3
Best R2:  0.46046171463515106  c: 3.814697265625e-06 d:  1
diabetes:
Best RMSE:  0.19539032882408916  c: 16 d:  3
Best MAE:  0.3750027441972923  c: 4 d:  3
Best R2:  0.46046171463515106  c: 3.814697265625e-06 d:  1
machineCPU:
Best RMSE:  574.7464546104728  c: 1125899906842624 d:  3
Best MAE:  16.33732595537046  c: 1125899906842624 d:  3
Best R2:  0.9619467565787428  c: 1125899906842624 d:  3
machineCPU:
Best RMSE:  574.7464546104728  c: 112589990684262

### Write a program to solve the following regression problem using gradient descent method.
#### Find RMSE, MAE and coefficient of determination. Also study the impact of ridge regularization

In [5]:
import numpy as np

def gradient_descent(X, Y, learning_rate, epochs, regularization_strength=None):
    m = len(X)
    theta0 = 0
    theta1 = 0
    
    for epoch in range(epochs):
        predictions = theta0 + theta1 * X
        errors = predictions - Y
        
        if regularization_strength:
            theta0 -= (learning_rate / m) * (np.sum(errors) + regularization_strength * theta0)
            theta1 -= (learning_rate / m) * (np.dot(errors, X) + regularization_strength * theta1)
        else:
            theta0 -= (learning_rate / m) * np.sum(errors)
            theta1 -= (learning_rate / m) * np.dot(errors, X)
    
    return theta0, theta1

def calculate_metrics(Y, predictions):
    rmse = np.sqrt(np.mean((predictions - Y) ** 2))
    mae = np.mean(np.abs(predictions - Y))
    
    mean_y = np.mean(Y)
    total_variance = np.sum((Y - mean_y) ** 2)
    explained_variance = np.sum((predictions - mean_y) ** 2)
    r_squared = explained_variance / total_variance
    
    return rmse, mae, r_squared

def main():
    # Data
    X = np.array([1, 2, 3, 4, 5])
    Y = np.array([3, 4, 5, 7, 5])

    # Hyperparameters
    learning_rate = 0.01
    epochs = 1000

    # Without Ridge Regularization
    theta0, theta1 = gradient_descent(X, Y, learning_rate, epochs)
    predictions = theta0 + theta1 * X
    rmse, mae, r_squared = calculate_metrics(Y, predictions)

    print("Without Ridge Regularization:")
    print("Theta0:", theta0)
    print("Theta1:", theta1)
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R-squared:", r_squared)

    # With Ridge Regularization
    regularization_strength = 0.1
    theta0_ridge, theta1_ridge = gradient_descent(X, Y, learning_rate, epochs, regularization_strength)
    predictions_ridge = theta0_ridge + theta1_ridge * X
    rmse_ridge, mae_ridge, r_squared_ridge = calculate_metrics(Y, predictions_ridge)

    print("\nWith Ridge Regularization:")
    print("Theta0 (Ridge):", theta0_ridge)
    print("Theta1 (Ridge):", theta1_ridge)
    print("RMSE (Ridge):", rmse_ridge)
    print("MAE (Ridge):", mae_ridge)
    print("R-squared (Ridge):", r_squared_ridge)

if __name__ == "__main__":
    main()

Without Ridge Regularization:
Theta0: 2.271343242627011
Theta1: 0.8187310482591091
RMSE: 0.9019119129891943
MAE: 0.6544927225191323
R-squared: 0.7647113829075756

With Ridge Regularization:
Theta0 (Ridge): 2.1394200497072826
Theta1 (Ridge): 0.8528282135722317
RMSE (Ridge): 0.9149515713767498
MAE (Ridge): 0.6635197566033988
R-squared (Ridge): 0.8324178272539956
