# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import time as time
import warnings
warnings.filterwarnings("ignore")


# Creating the dataframe for execution

In [2]:
def create_dataset(filename):
    data = pd.read_csv(filename+".csv", header=None)
    #naming the columns
    names = ['x'+str(col+1) for col in data.columns[:-1]] + ['y']
    data.columns = names
    dataset = data.copy()
    #adding the x0 column to the dataset
    dataset['x0'] = 1
    dataset = dataset[['x0'] + list(dataset.columns[:-1])]
    return dataset

# HoldOut Method

In [3]:
def holdout_splitTrainTest(df_data, division_ratio = 0.7):
    '''splitting the data into training and testing set'''
    df_data = df_data.sample(frac = 1).reset_index(drop=True)
    training_number = int(division_ratio * len(df_data))
    train_x = df_data[:training_number][df_data.columns[:-1]]
    train_y = df_data[:training_number][df_data.columns[-1]]

    test_x = df_data[training_number:][df_data.columns[:-1]]
    test_y = df_data[training_number:][df_data.columns[-1]]

    train_y = np.matrix(train_y).reshape(-1,1)
    test_y = np.matrix(test_y).reshape(-1,1)

    return train_x, train_y, test_x, test_y
    

# k- fold data split

In [4]:
def kfold_splitTrainTest(df_data, folds = 5):
    '''splitting the data into training and testing set'''
    df_data = df_data.sample(frac = 1).reset_index(drop=True)
    fold_size = int(len(df_data)/folds)
    train_x = []
    train_y = []
    test_x = []
    test_y = []
    for i in range(folds):
        if i == 0:
            train_x = df_data[fold_size:][df_data.columns[:-1]]
            train_y = df_data[fold_size:][df_data.columns[-1]]

            test_x = df_data[:fold_size][df_data.columns[:-1]]
            test_y = df_data[:fold_size][df_data.columns[-1]]
        else:
            train_x = pd.concat([train_x, df_data[i*fold_size:][:fold_size][df_data.columns[:-1]]])
            train_y = pd.concat([train_y, df_data[i*fold_size:][:fold_size][df_data.columns[-1]]])

            test_x = pd.concat([test_x, df_data[i*fold_size:][:fold_size][df_data.columns[:-1]]])
            test_y = pd.concat([test_y, df_data[i*fold_size:][:fold_size][df_data.columns[-1]]])

    train_y = np.matrix(train_y).reshape(-1,1)
    test_y = np.matrix(test_y).reshape(-1,1)

    return train_x, train_y, test_x, test_y

# Normalizing the data

In [5]:
def normalise(train_x, test_x):
    '''normalising the data'''
    for col in train_x.columns[1:]:
        minimum = train_x[col].min()
        diff = train_x[col].max() - minimum

        train_x[col] = (train_x[col] - minimum) / diff 
        test_x[col] = (test_x[col] - minimum) / diff

    return train_x, test_x

# Least Square Method and RMSE

In [6]:
def Least_Square(x, y, w):
    '''calculating the least square error --> Cost Function'''
    # y = np.matrix(y)
    y_pred = np.matmul(x,w)
    error = np.sum(np.square(y_pred - y))/2
    return error

def rmse(y, ypred):
    '''calculating the mean squared error'''
    return np.sqrt(np.mean(np.square(y - ypred))/len(y))

# Gradient Descent Algorithm

In [7]:
def batch_gradient_descent(train_x, train_y, alpha, epsilon):
    '''batch gradient descent'''
    w_init = np.zeros(train_x.shape[1]).reshape(-1,1)
    train_x = np.matrix(train_x)
    w_h = []
    cost_list = []
    w_old = w_init + 1

    while np.linalg.norm(w_old - w_init) > epsilon:
        w_old = w_init
        w_init = w_init - alpha * np.matmul(train_x.T, np.matmul(train_x, w_init) - train_y)
        w_h.append(w_init)
        cost_list.append(Least_Square(train_x, train_y, w_init))

    return w_init, w_h, cost_list

def stochastic_gradient_descent(train_x, train_y, alpha, epsilon):
    '''stochastic gradient descent'''
    w_init = np.zeros(train_x.shape[1]).reshape(-1,1)
    train_x = np.matrix(train_x)
    w_h = []
    cost_list = []
    w_old = w_init + 1

    while np.linalg.norm(w_old - w_init) > epsilon:
        w_old = w_init
        for i in range(len(train_x)):
            w_init = w_init - alpha * np.matmul(train_x[i].T, np.matmul(train_x[i], w_init) - train_y[i])
            w_h.append(w_init)
            cost_list.append(Least_Square(train_x, train_y, w_init))

    return w_init, w_h, cost_list



# 1. Analyze Data 4 using linear regression

(a) Describe the experimental procedures used.

(b) Report the values of the parameters of the model.

(c) Analyze the performance of the model

(a) Describe the experimental procedures used.


In [8]:
'''Experimental procedure : 
'''

'Experimental procedure : \n'

In [9]:
filename = "data4"
dataset4 = create_dataset(filename)

'''Visualize the dataset-4'''
print(f"number of rows in dataset4 is : {dataset4.shape[0]} \nnumber of columns in dataset4 is : {dataset4.shape[1]}")
dataset4

number of rows in dataset4 is : 100 
number of columns in dataset4 is : 202


Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x192,x193,x194,x195,x196,x197,x198,x199,x200,y
0,1,-0.003779,0.031027,-0.388220,1.420800,-0.78641,0.984240,0.75169,-1.158100,-0.55794,...,-0.62419,-0.84155,-0.797580,1.87690,-2.00010,-0.19268,1.638600,-0.590150,-0.42054,-839.430
1,1,-0.036043,-1.159100,0.219710,-0.952310,-0.50804,0.067726,1.12300,1.269000,2.18280,...,0.77773,-1.75710,1.144500,1.01170,-0.31249,0.40022,-1.059200,-0.454850,-0.34847,62.063
2,1,-0.438100,-0.733720,-0.089075,1.220100,-1.40850,0.971640,1.13360,-0.406290,-0.64220,...,-1.12920,-0.69191,-1.894400,-0.42760,0.33931,-0.72165,0.226230,0.268480,0.24492,115.990
3,1,-0.572800,-0.251010,-1.052400,0.377940,1.46930,-0.099663,2.48980,-0.041293,0.99387,...,0.24356,-1.07830,-1.690300,0.79401,0.32061,-0.35921,0.333220,1.396900,0.86721,101.340
4,1,-0.129100,1.506500,0.532350,-0.760570,-1.67030,-0.714710,1.87340,-0.842890,0.33552,...,-0.66104,0.94924,1.885600,-1.78200,0.76805,1.90530,1.018900,-0.854000,-0.17766,171.260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1,-1.213600,2.387900,-1.217300,0.535470,-1.48810,-0.909300,-0.28999,-0.248500,0.49530,...,-0.24503,-0.32660,0.018761,0.22156,-0.65720,-0.39293,0.078999,0.762460,-1.55510,-533.230
96,1,-0.546690,-0.756010,-1.358100,-1.363200,-0.86046,0.418790,0.56376,-0.309220,-0.95458,...,-0.28748,0.33814,-1.194600,-0.13763,0.95873,0.11038,-0.008607,0.248750,0.12928,-415.730
97,1,2.286900,1.505600,-1.845500,1.927000,-0.62383,-0.412170,-0.41538,1.095500,0.30687,...,-0.59265,-0.87901,-0.457040,0.17610,-1.75290,3.12710,-1.419700,0.073631,-1.25840,604.320
98,1,0.818020,1.714200,0.507840,0.010488,-0.42719,0.455540,1.51450,0.540690,-2.12760,...,-1.04040,-1.05690,-0.510540,-0.39106,-0.36509,-0.45803,-0.270980,-0.819550,-0.55253,148.450


In [10]:
def normalize_data(dataset, mode='train', train_max=None, train_min=None):
    '''normalizing the train and test dataset'''
    data = dataset.copy()
    if mode=='train':
        train_max={}
        train_min={}
        for col in data.columns[1:]:
            train_max[col] = data[col].max()
            train_min[col] = data[col].min()
            data[col] = (data[col] - train_min[col]) / (train_max[col] - train_min[col])
        return data, train_min, train_max
    
    elif mode == 'test':
        if train_min is None or train_max is None:
            raise Exception('Pass train_min and/or train_max.')
        for col in data.columns[1:]:
            data[col] = (data[col] - train_min[col]) / (train_max[col] - train_min[col])
        return data
    


def get_rmse(pred, y):
    '''
    Calculate root mean squared error.
    '''
    return np.sqrt(np.mean(np.square(pred - y))/len(y))

(b) Report the values of the parameters of the model.

In [11]:
train_x = dataset4[dataset4.columns[:-1]]
train_y = dataset4[dataset4.columns[-1]]
train_x, min_value, max_value = normalize_data(train_x, mode='train')

'''using Lagarangian Multiplier Method to calculate w'''
x = np.matrix(train_x)
y = np.matrix(train_y)
y.shape
w = np.matmul(np.matmul(x.T, np.linalg.inv(np.matmul(x, x.T))), y.T)

print(f"value of the parameters w: \n{list(w.T)}")
print()


value of the parameters w: 
[matrix([[-305.81691139,  148.22884648, -153.37592053,   91.61977841,
           72.43180942,   38.44187618,  -27.39588838,   36.71607837,
          155.10699927,  -63.09663909, -116.28043984,  292.5565367 ,
          -11.50145664,   77.81301568,   22.63923564, -150.3214022 ,
          -46.32828118, -301.56975777, -233.52689723, -194.91055108,
          -68.14276161,  255.54372129, -363.20915147,  -17.71197863,
          238.13007525,  103.09952567, -113.31799063, -182.29296638,
         -184.7659447 ,  -24.92898833,   34.53320905,  171.55188699,
         -172.07481029,  -89.24585402,  278.04465385,  231.0016185 ,
          -68.06533621,  206.08088802,   90.75634987,   68.49660694,
          165.96628001,  110.90359992,  170.79951893,  -21.79217687,
            3.57031383,   49.36520974,   60.40462952,  -29.00043873,
           77.37185068,  147.58605497,   -3.36119338, -135.80938899,
         -153.84174332,  314.83251536,  -76.49524208,    6.51450491,
     

(c) Analyze the performance of the model

In [12]:
y_pred = np.matmul(x, w)
print(f"Performance of the model ---> RMSE : {get_rmse(y_pred, y)}")

Performance of the model ---> RMSE : 614.5892503702254


-------------------------------------------------------------

# 2. Analyze Computer hardware data set using linear regression (download data from UCI web repository)

(a) Analyze the data with normalization and without normalization.

(b) Describe how you applied normalization techniques on training and testing data.

(c) Apply random subsampling and k fold cross validation.

(d) Assess the performance of the model.

(e) Report the values of the hyperparameters and the parameters of the 
model.

(f) Apply batch as well as online optimization algorithms and compare 
their performance in terms of time and MSE


In [13]:
data = pd.read_csv(r'computer+hardware/machine.data', names=['vendor_name', 'model_name', 'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP', 'ERP'])
data = data.drop(['vendor_name', 'model_name', 'ERP'], axis=1)

#saving the data to a csv file
data.to_csv('modified_Uci.csv', index=False, header=False)

filename = "modified_Uci"
dataset = create_dataset(filename)

#converting values of each column of the daataset to float
for col in dataset.columns[:-1]:
    dataset[col] = dataset[col].astype(float)  

dataset.head()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,y
0,1.0,125.0,256.0,6000.0,256.0,16.0,128.0,198
1,1.0,29.0,8000.0,32000.0,32.0,8.0,32.0,269
2,1.0,29.0,8000.0,32000.0,32.0,8.0,32.0,220
3,1.0,29.0,8000.0,32000.0,32.0,8.0,32.0,172
4,1.0,29.0,8000.0,16000.0,32.0,8.0,16.0,132


(a) Analyze the data with normalization and without normalization

In [14]:
#analyzing the data without normalization
train_x, train_y, test_x, test_y = holdout_splitTrainTest(dataset, 0.7)

#analyzing the data with normalization
train_x_normal, train_y_normal, test_x_normal, test_y_normal = holdout_splitTrainTest(dataset, 0.7)
train_x_normal, test_x_normal = normalise(train_x_normal, test_x_normal)

print(f"train data without normalization: \n{train_x}")
print("..................................................")
print(f"train data with normalization: \n{train_x_normal}")

train data without normalization: 
      x0     x1       x2       x3     x4    x5    x6
0    1.0  185.0   2000.0  16000.0   16.0   1.0   6.0
1    1.0  240.0    512.0   2000.0    8.0   1.0   5.0
2    1.0  140.0   2000.0  32000.0   32.0   1.0  54.0
3    1.0  133.0   1000.0  12000.0    9.0   3.0  12.0
4    1.0  400.0   2000.0   4000.0    0.0   1.0   1.0
..   ...    ...      ...      ...    ...   ...   ...
141  1.0  112.0   1000.0   1000.0    0.0   1.0   4.0
142  1.0   30.0  16000.0  32000.0  256.0  16.0  24.0
143  1.0   50.0   2000.0   8000.0    8.0   1.0   5.0
144  1.0   50.0   2000.0  16000.0    8.0   3.0   5.0
145  1.0   70.0   4000.0  12000.0    8.0   6.0   8.0

[146 rows x 7 columns]
..................................................
train data with normalization: 
      x0        x1        x2        x3        x4        x5        x6
0    1.0  0.028995  0.246988  0.249249  0.125000  0.019231  0.034091
1    1.0  0.078220  0.058735  0.186687  0.035156  0.057692  0.068182
2    1.0  0.527

(b) Describe how you applied normalization techniques on training and testing data.

(c) Apply random subsampling and k fold cross validation

In [15]:
def monteCarloCrossvalidation(k, dataset):
    '''monte carlo cross validation'''
    alpha_list = [0.01, 0.002, 0.03, 0.001, 0.674]
    rmse_list = []
    
    epsilon = 1e-6
    for i in range(k):
        rmse_list.clear()
        for alpha in alpha_list:  
            #performiong holdout split
            train_x, train_y, test_x, test_y = holdout_splitTrainTest(dataset, 0.7)
            
            #normalizing the train and test data
            train_x, test_x = normalise(train_x, test_x)

            #performing schoastic gradient descent on training data
            w, w_h, j_w = stochastic_gradient_descent(train_x, train_y, alpha, epsilon)

            #predicting the test data
            test_x = np.matrix(test_x)
            y_pred = np.matmul(test_x,w)
            rmse_error = rmse(test_y, y_pred)
            print(f"alpha : {alpha}, rmse_error : {rmse_error}")
            rmse_list.append(rmse_error)

        #plotting the rmse vs alpha graph
        plt.scatter(alpha_list, rmse_list, marker='x', color='r')
        plt.plot(alpha_list, rmse_list, color='b')
        plt.xlabel('alpha')
        plt.ylabel('rmse')
        plt.show()
        print("---------------------------------------------------")

    return 0

# k-fold cross validation
def kfold_cross_validation(k, dataset):
    '''k-fold cross validation'''
    alpha_list = [0.01, 0.002, 0.03, 0.001, 0.674]
    rmse_list = []
    # splitting the data into k folds   
    train_x, train_y, test_x, test_y = kfold_splitTrainTest(dataset, k)
    #normalizing the train and test data
    train_x, test_x = normalise(train_x, test_x)
    epsilon = 1e-6

    for alpha in alpha_list:  
        #performing schoastic gradient descent on training data
        w, w_h, j_w = stochastic_gradient_descent(train_x, train_y, alpha, epsilon)

        #predicting the test data
        test_x = np.matrix(test_x)
        y_pred = np.matmul(test_x,w)
        rmse_error = rmse(test_y, y_pred)
        print(f"alpha : {alpha}, rmse_error : {rmse_error}")
        rmse_list.append(rmse_error)

    #plotting the rmse vs alpha graph
    plt.scatter(alpha_list, rmse_list, marker='x', color='r')
    plt.plot(alpha_list, rmse_list, color='b')
    plt.xlabel('alpha')
    plt.ylabel('rmse')
    plt.show()
    print("---------------------------------------------------")

    return 0


In [16]:
# applying monte carlo cross validation on dataset
# monteCarloCrossvalidation(1, dataset)

In [17]:
# kfold_cross_validation(5, dataset)

(d) Assess the performance of the model.

In [18]:
w, w_h, j_w = stochastic_gradient_descent(train_x_normal, train_y_normal, 0.01, 1e-3)
y_pred = np.matmul(test_x_normal, w)
print(f"Performance of the model ---> RMSE : {get_rmse(y_pred, test_y_normal)}")

Performance of the model ---> RMSE : 9.125816244154345


(e) Report the values of the hyperparameters and the parameters of the model

In [19]:
print(f"The hyperparameters are : alpha = 0.01, epsilon = 1e-6")
print(f"The parameters are : \n{w.T}")

The hyperparameters are : alpha = 0.01, epsilon = 1e-6
The parameters are : 
[[-51.12118467  56.93876624 177.98500364 434.84994731 129.54877799
  -72.95888564 311.5070667 ]]


(f) Apply batch as well as online optimization algorithms and compare their perfor-
mance in terms of time and MSE.

In [20]:
w_b, w_b_h, j_w_b = batch_gradient_descent(train_x_normal, train_y_normal, 0.0001, 1e-9)
y_pred_b = np.matmul(test_x_normal, w_b)
print(f"Performance of the model ---> RMSE : {get_rmse(y_pred_b, test_y_normal)}")

Performance of the model ---> RMSE : 9.13308191126496
