In [1]:
# Change directory to VSCode workspace root so that relative path loads work correctly. Turn this addition off with the DataScience.changeDirOnImportExport setting
# ms-python.python added
import os
try:
    # Move one level up from the current working directory and show the result.
    os.chdir(os.path.join(os.getcwd(), '..'))
    print(os.getcwd())
except OSError as err:
    # Still best effort (the notebook keeps running), but only filesystem
    # errors are tolerated and the failure is reported instead of hidden;
    # a bare `except:` would also swallow KeyboardInterrupt and real bugs.
    print('could not change working directory:', err)


F:\7_Practice\DLcourse


This file presents a procedure that trains a linear
regression model to predict the published relative performance (PRP)
of a CPU, using 8 features from the Computer Hardware Dataset.

Packages are imported and the data are loaded with pandas before
any processing begins.
Next comes data encoding, in which the non-numerical
features are mapped to non-negative integers.
After that, all data are converted to NumPy arrays and randomly split into
a training set (70%) and a testing set (30%).
A gradient descent method is developed to train a parameter vector 
that minimizes the MSE of the training set.
The latest 5 recorded training errors are examined every 50 epochs, and the
learning rate is lowered when the training error has almost converged.
The trained model reaches a testing error of about 2634.
For comparison, the algebraic (closed-form) solution of the regression task
is also calculated; its testing error is likewise about 2634.

In [25]:
# import packages
import matplotlib as plt
import numpy as np
import pandas as pd
import sklearn
import sklearn.preprocessing as pre
from IPython.core.interactiveshell import InteractiveShell
from matplotlib.pyplot import plot
from sklearn.model_selection import train_test_split

# Display the value of every top-level expression in a cell, not only the last
# one (this is why cells with several bare expressions show several outputs).
InteractiveShell.ast_node_interactivity = 'all'


In [26]:
# Read the CPU-performance file into a DataFrame and preview its first rows.
#
# header=0 marks the file's first line as a header row; because explicit
# `names` are passed as well, that line is discarded and colNames is used.

colNames = [
    'vendor', 'name',                                   # string-valued columns
    'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX',   # numeric attributes
    'PRP', 'ERP',                                       # PRP is the regression target
]
data = pd.read_csv(
    "data/cpu_performance/machine.data",
    delimiter=',',
    header=0,
    names=colNames,
)
data.head()


Unnamed: 0,vendor,name,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
0,adviser,32/60,125,256,6000,256,16,128,198,199
1,amdahl,470v/7,29,8000,32000,32,8,32,269,253
2,amdahl,470v/7a,29,8000,32000,32,8,32,220,253
3,amdahl,470v/7b,29,8000,32000,32,8,32,172,253
4,amdahl,470v/7c,29,8000,16000,32,8,16,132,132


In [27]:
# Build the feature matrix X (first 8 columns) and the target vector y (PRP,
# the second-to-last column), label-encoding the two string columns on the way.

# .copy() makes X an independent frame, so encoding its columns below neither
# writes through to `data` nor triggers pandas' SettingWithCopyWarning.
X = data.iloc[:, 0:8].copy()
y = data.iloc[:, -2]

# Encode the 1st and 2nd columns BEFORE converting to a numpy array; otherwise
# the array is created with dtype 'object', which raises an error when the
# standard deviation is computed later.
#
# Assigning by column label replaces each column outright, so the column takes
# the integer dtype of the encoded values.  `X.iloc[:, i] = codes` may instead
# write the integers into the existing object-dtype column (newer pandas),
# which would again leave the whole matrix as dtype 'object'.
for col in X.columns[:2]:
    X[col] = pre.LabelEncoder().fit_transform(data[col])

X, y = np.array(X), np.array(y)

# Fail here, with a clear message, rather than later inside np.std.
assert np.issubdtype(X.dtype, np.number), 'non-numeric feature matrix: ' + str(X.dtype)
X.dtype

dtype('int64')

In [23]:
# This cell presents a bad example: encoding AFTER the conversion to numpy.
# np.array() on a frame that still contains string columns produces an array
# of dtype 'object', and writing integer codes into that array does not change
# its dtype -- so computing the std on it would raise an error.
#
# The result is kept under its own name (XBad) so that running this cell does
# not overwrite the correctly encoded X / y that the following cells rely on.

XBad = np.array(data.iloc[:, 0:8])
XBad[:, 0] = pre.LabelEncoder().fit_transform(data.iloc[:, 0])
XBad[:, 1] = pre.LabelEncoder().fit_transform(data.iloc[:, 1])
XBad.dtype # type 'object'


dtype('O')

In [28]:
# Split the data into training (70%) and testing (30%) sets, then standardize
# the features.  Only XTrain / XTest are rescaled; the targets yTrain / yTest
# are left in their original units.

XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size=0.3, random_state=10)

# Statistics come from the training set only and are reused for the test set,
# so no information from the test samples enters the scaling.
featMean, featStd = np.mean(XTrain, axis=0), np.std(XTrain, axis=0)

# A feature that is constant over the training set has std 0; dividing by it
# would yield NaN/inf.  Dividing by 1 instead simply leaves that column centred.
featStd = np.where(featStd == 0, 1.0, featStd)

XTrain = (XTrain - featMean) / featStd
XTest = (XTest - featMean) / featStd

In [29]:
# Sanity check: XTrain is (samples, features); featMean was reduced over
# axis 0, so it holds one value per feature.
XTrain.shape
featMean.shape

(146, 8)

(8,)

In [30]:

# Number of real features, taken from the per-feature mean vector so that it
# does not depend on whether the bias column has already been added.
numFeat = featMean.shape[0]

# initialize parameter: one weight per feature plus one for the bias
theta = np.ones(numFeat + 1)

# Concatenate X with a new leading column of ones for the bias.
# Only do so while the matrices still have exactly numFeat columns, so that
# running this cell a second time does not stack another bias column onto
# XTrain / XTest (which would no longer match the length of theta).
if XTrain.shape[1] == numFeat:
    XTrain = np.concatenate((np.ones((XTrain.shape[0], 1)), XTrain), axis=1)
    XTest = np.concatenate((np.ones((XTest.shape[0], 1)), XTest), axis=1)


In [31]:
# First training sample: the leading 1 is the bias entry, the rest are the
# standardized feature values.
XTrain[0]

array([ 1.        , -0.04491858, -1.40377137, -0.32089191, -0.23149947,
       -0.68421192, -0.4487996 , -0.28102187, -0.00932839])

In [32]:

# Training hyper-parameters
initRate = 0.001        # starting step size
learningRate = initRate
numEpoch = 1000         # upper bound on the number of passes over the data
checkEvery = 50         # record / print the loss every `checkEvery` epochs
lossWindow = 5          # how many of the latest recorded losses are inspected
flatTol = 0.001         # std/mean of the window below which the loss counts as flat
rateDecay = 0.9         # factor applied to the learning rate when the loss is flat
minRateRatio = 0.5      # stop once learningRate/initRate drops below this

# Train the linear model with full-batch gradient descent: every update uses
# all training samples (this is not stochastic / mini-batch descent).
numTrain = yTrain.shape[0]
lossList = []
for epoch in range(numEpoch):
    # forward pass and residuals
    hyp = XTrain @ theta
    resid = hyp - yTrain

    # Reported loss: half the mean squared error, evaluated before the update.
    MSE = 0.5 / numTrain * np.sum(resid ** 2)

    # Gradient of the summed (not averaged) half squared error; the missing
    # 1/numTrain factor is effectively absorbed into learningRate.
    grad = resid @ XTrain
    theta -= learningRate * grad

    if epoch % checkEvery == 0:
        print('Epoch', epoch, 'loss:', MSE)
        lossList.append(MSE)
        if len(lossList) >= lossWindow:
            # Look at the latest `lossWindow` recorded losses INCLUDING the one
            # just appended (a slice ending at -1 would drop the newest value
            # and inspect only four older ones).
            currentLoss = np.array(lossList[-lossWindow:])
            if currentLoss.std() / currentLoss.mean() < flatTol:
                learningRate *= rateDecay
                print('almost converged, lowering learning rate')
        if learningRate / initRate < minRateRatio:
            print('solution converged, exit training')
            break



Epoch 0 loss: 17493.51952520403
Epoch 50 loss: 1528.595097129908
Epoch 100 loss: 1527.5571636339162
Epoch 150 loss: 1527.5085781704904
Epoch 200 loss: 1527.504976701486
Epoch 250 loss: 1527.504701741263
almost converged, lowering learning rate
Epoch 300 loss: 1527.5046812194748
almost converged, lowering learning rate
Epoch 350 loss: 1527.5046792527944
almost converged, lowering learning rate
Epoch 400 loss: 1527.5046790157169
almost converged, lowering learning rate
Epoch 450 loss: 1527.5046789806172
almost converged, lowering learning rate
Epoch 500 loss: 1527.50467897437
almost converged, lowering learning rate
Epoch 550 loss: 1527.5046789730577
almost converged, lowering learning rate
solution converged, exit training


In [33]:
# calculate the testing error of theta and compare it with algebraic solution
def halfMSE(pred, target):
    """Return half the mean squared error between `pred` and `target`.

    This is the same quantity that the training loop reports as its loss:
    0.5 / len(target) * sum((pred - target) ** 2).
    """
    return 0.5 / target.shape[0] * np.sum((pred - target) ** 2)


hypTest = XTest @ theta
MSETest = halfMSE(hypTest, yTest)
print("test error using gradient descent: ", MSETest)

# Closed-form least-squares fit on the training set.  lstsq minimizes
# ||XTrain @ w - yTrain|| directly, which avoids forming and inverting
# XTrain.T @ XTrain explicitly (less round-off, and it still returns a
# solution when the columns are linearly dependent).
algSolu = np.linalg.lstsq(XTrain, yTrain, rcond=None)[0]
algTest = XTest @ algSolu
MSEalg = halfMSE(algTest, yTest)
print("test error using linear algebra: ", MSEalg)

test error using gradient descent:  2633.9965636696174
test error using linear algebra:  2633.9970018546173
