In [197]:
import numpy as np
import pandas as pd
import random
import os
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split

# Task 1
### Question 1

In [74]:
X = pd.read_csv('../Project1/KV_periodic/X.csv')
Y = pd.read_csv('../Project1/KV_periodic/Y.csv')
print("The shape of X.csv is:")
print(X.shape)
print("The shape of Y.csv is:")
print(Y.shape)

The shape of X.csv is:
(28962, 1752)
The shape of Y.csv is:
(28962, 3)


<font color=blue size=3> Since X and Y both has a column for timestamp, therefore, the number of samples is 28962, the number of features is 1751, the number of types of targets is 2. </font>

In [75]:
chosenFeatureID = random.sample(range(1752), 10) #choose 10 random featrues from 1751 features
print("The chosen features are:")
#print(chosenFeatureID)
chosenX = X.iloc[:, chosenFeatureID].copy() #the chosen features
for col in chosenX.columns:
    print(col)

The chosen features are:
35_RxPacktes
4_cpu17_.sys
1_eth0_rxmcst.s
0_cpu23_.iowait
1_atmptf.s
5_cpu22_.sys
5_atmptf.s
3_kbmemfree
1_igmbq6.s
2_temp4_degC


<font color=blue size=3> By looking up the manual page:</font> <br>
<font color=blue size=3> 'iowait' means Percentage of time that the CPU or CPUs were idle during which the system had an outstanding disk I/O request.</font> <br>
<font color=blue size=3> 'soft' means percentage of time spent by the CPU or CPUs to service software interrupts.</font> <br>
<font color=blue size=3> 'tps' means the number of transfers per second that were issued to the device.</font> <br>
<font color=blue size=3> 'usr' means Percentage of CPU utilization that occurred while executing at the user level (application). </font> <br>
<font color=blue size=3> ... </font> <br>


In [76]:
pd.options.display.float_format = "{:,.2f}".format #set no more than two digits after the decimal point
allStatistics = pd.DataFrame(chosenX.describe(percentiles = [0.25,0.90]))
statistics  = allStatistics.iloc[[1,2,7,3,4,6], :].copy()
statistics

Unnamed: 0,35_RxPacktes,4_cpu17_.sys,1_eth0_rxmcst.s,0_cpu23_.iowait,1_atmptf.s,5_cpu22_.sys,5_atmptf.s,3_kbmemfree,1_igmbq6.s,2_temp4_degC
mean,1462.52,0.44,0.02,0.0,0.04,0.7,0.01,1298772.9,0.01,21.43
std,477.64,1.18,0.15,0.17,1.53,1.12,1.6,670189.95,0.09,0.49
max,4791.0,21.78,1.0,23.23,134.0,31.68,266.0,3251292.0,1.0,23.12
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,240796.0,0.0,19.5
25%,1138.0,0.0,0.0,0.0,0.0,0.0,0.0,719798.0,0.0,21.0
90%,2015.0,1.01,0.0,0.0,0.0,2.0,0.0,2141656.0,0.0,22.0


<font color=blue size=3> With help of describe(), the required statistics are shown above.</font> <br>

### Question 2

In [77]:
#statisticsNum = np.around(statisticsNum, decimals=2) #round to two digits after the decimal point
numpyX = X.drop(['TimeStamp'], axis=1).copy() #remove timestamp
numpyX = numpyX.to_numpy()
#np.set_printoptions(precision=2) #set no more than two digits after the decimal point
numpyX

array([[2.230000e+00, 5.500000e-01, 4.200000e-01, ..., 2.960000e+02,
        1.216590e+05, 2.937120e+05],
       [3.480000e+00, 7.600000e-01, 1.510000e+00, ..., 3.487000e+03,
        1.603670e+06, 2.822663e+06],
       [5.400000e+00, 7.900000e-01, 5.360000e+00, ..., 2.755000e+03,
        1.272053e+06, 2.442247e+06],
       ...,
       [4.000000e-02, 8.000000e-02, 0.000000e+00, ..., 3.210000e+02,
        1.565630e+05, 2.891050e+05],
       [4.000000e-02, 8.000000e-02, 0.000000e+00, ..., 3.100000e+02,
        1.479080e+05, 2.842120e+05],
       [4.000000e-02, 4.000000e-02, 0.000000e+00, ..., 3.330000e+02,
        1.724420e+05, 2.755180e+05]])

##### (a) L2 Normalization

In [78]:
normedRowX= preprocessing.normalize(numpyX, norm='l2')
normedRowX

array([[7.69676654e-09, 1.89830565e-09, 1.44961522e-09, ...,
        1.02163359e-06, 4.19901758e-04, 1.01373663e-03],
       [1.19873115e-08, 2.61791861e-09, 5.20139093e-09, ...,
        1.20114240e-05, 5.52404940e-03, 9.72302896e-03],
       [1.86083603e-08, 2.72233419e-09, 1.84705206e-08, ...,
        9.49370974e-06, 4.38348528e-03, 8.41596520e-03],
       ...,
       [1.38095228e-10, 2.76190456e-10, 0.00000000e+00, ...,
        1.10821421e-06, 5.40515080e-04, 9.98100523e-04],
       [1.38095536e-10, 2.76191072e-10, 0.00000000e+00, ...,
        1.07024041e-06, 5.10635864e-04, 9.81210213e-04],
       [1.38094431e-10, 1.38094431e-10, 0.00000000e+00, ...,
        1.14963614e-06, 5.95331995e-04, 9.51187534e-04]])

In [79]:
#Verifycation of L2 norm
normCorrect = 1
for i in range(normedRowX.shape[0]):
    sums = 0
    for j in normedRowX[i]:
        sums += j**2
    sums = float("{:.4f}".format(sums))     
    if sums != 1:
        normCorrect = 0
if normCorrect:
    print("L2 norm in row is correct.")

L2 norm in row is correct.


In [80]:
normedColumnX = preprocessing.normalize(numpyX.T, norm='l2')
normedColumnX = normedColumnX.T
normedColumnX

array([[0.0210194 , 0.00907754, 0.00944102, ..., 0.00046718, 0.00039578,
        0.00050488],
       [0.03280157, 0.0125435 , 0.03394272, ..., 0.00550362, 0.00521708,
        0.00485209],
       [0.050899  , 0.01303864, 0.12048541, ..., 0.00434829, 0.00413826,
        0.00419817],
       ...,
       [0.00037703, 0.00132037, 0.        , ..., 0.00050664, 0.00050933,
        0.00049697],
       [0.00037703, 0.00132037, 0.        , ..., 0.00048928, 0.00048118,
        0.00048855],
       [0.00037703, 0.00066018, 0.        , ..., 0.00052558, 0.00056099,
        0.00047361]])

In [81]:
#Verifycation of L2 norm
normCorrect = 1
for j in range(normedColumnX.shape[1]):
    sums = 0
    for i in range(normedColumnX.shape[0]):
        sums += normedColumnX[i][j]**2
    sums = float("{:.4f}".format(sums))   
    if sums != 1:
        normCorrect = 0
if normCorrect:
    print("L2 norm in column is correct.")

L2 norm in column is correct.


##### (b) Restriction to Interval

In [82]:
def normalizeData(data):
    return (data - np.min(data)) / (np.max(data) - np.min(data))

In [83]:
resRowX = np.zeros(numpyX.shape)
for i in range(numpyX.shape[0]):
    resRowX[i] = normalizeData(numpyX[i])
resRowX

array([[5.84383254e-05, 5.84132563e-05, 5.84113164e-05, ...,
        6.28219869e-05, 1.87381125e-03, 4.44120111e-03],
       [6.58279802e-05, 6.57873925e-05, 6.57985839e-05, ...,
        1.17808967e-04, 2.39956946e-02, 4.21854738e-02],
       [9.93530577e-05, 9.92842697e-05, 9.93524608e-05, ...,
        1.40381108e-04, 1.90801620e-02, 3.65411665e-02],
       ...,
       [4.48981704e-07, 4.49579549e-07, 4.48383859e-07, ...,
        5.24609115e-06, 2.34045912e-03, 4.32144890e-03],
       [9.57149591e-07, 9.57747435e-07, 9.56551746e-07, ...,
        5.58984926e-06, 2.21160742e-03, 4.24882350e-03],
       [8.37580672e-07, 8.37580672e-07, 8.36982828e-07, ...,
        5.81404143e-06, 2.57817614e-03, 4.11876260e-03]])

In [84]:
#Verifycation of [0,1] scaling
resCorrect = 1
for i in range(resRowX.shape[0]):
    for j in resRowX[i]:   
        if j<0 or j>1:
            resCorrect = 0
if resCorrect:
    print("[0,1] scaling in row is correct.")

[0,1] scaling in row is correct.


In [85]:
resColumnX = np.zeros(numpyX.T.shape)
for i in range(numpyX.T.shape[0]):
    resColumnX[i] = normalizeData(numpyX.T[i])
resColumnX = resColumnX.T
resColumnX

array([[0.11506708, 0.18272425, 0.0320122 , ..., 0.00775097, 0.01653088,
        0.00782261],
       [0.17956656, 0.25249169, 0.11509146, ..., 0.40667583, 0.39170616,
        0.35135498],
       [0.27863777, 0.26245847, 0.40853659, ..., 0.3151644 , 0.30775637,
        0.29967932],
       ...,
       [0.00206398, 0.02657807, 0.        , ..., 0.01087636, 0.02536693,
        0.0071968 ],
       [0.00206398, 0.02657807, 0.        , ..., 0.00950119, 0.02317589,
        0.00653213],
       [0.00206398, 0.01328904, 0.        , ..., 0.01237655, 0.02938674,
        0.00535114]])

In [86]:
#Verifycation of [0,1] scaling
resCorrect = 1
for i in range(resRowX.shape[0]):
    for j in resRowX[i]:   
        if j<0 or j>1:
            resCorrect = 0
if resCorrect:
    print("[0,1] scaling in column is correct.")

[0,1] scaling in column is correct.


##### (c) Standardization

In [87]:
stdRowX = np.zeros(numpyX.shape)
for i in range(numpyX.shape[0]):
    stdRowX[i] = preprocessing.scale(numpyX[i])
stdRowX

array([[-0.14757565, -0.14757589, -0.14757591, ..., -0.14753276,
        -0.1298149 , -0.10469678],
       [-0.15495775, -0.15495815, -0.15495804, ..., -0.15444964,
         0.07895409,  0.25675721],
       [-0.1533002 , -0.15330087, -0.15330021, ..., -0.15289908,
         0.0322686 ,  0.2029788 ],
       ...,
       [-0.14485998, -0.14485997, -0.14485998, ..., -0.14481312,
        -0.12200608, -0.10265858],
       [-0.144821  , -0.144821  , -0.14482101, ..., -0.14477576,
        -0.12323057, -0.10333399],
       [-0.14486899, -0.14486899, -0.144869  , ..., -0.14482039,
        -0.1196973 , -0.1046511 ]])

In [88]:
#Verifycation of standardizatio
stdCorrect = 1
for i in stdRowX:
    mean = float("{:.4f}".format(i.mean()))
    std = float("{:.4f}".format(i.std()))
    if mean != 0 or std != 1:
        stdCorrect = 0
if stdCorrect:
    print("L2 standardizatio in row is correct.")

L2 standardizatio in row is correct.


In [89]:
stdColumnX = np.zeros(numpyX.T.shape)
for i in range(numpyX.T.shape[0]):
    stdColumnX[i] = preprocessing.scale(numpyX.T[i])
stdColumnX = stdColumnX.T
stdColumnX

array([[ 3.68670472,  1.83315127,  1.33082725, ..., -1.56242283,
        -1.55914158, -1.4338984 ],
       [ 6.15623665,  3.63558832,  5.82039329, ...,  0.14334655,
         0.05312454, -0.04070709],
       [ 9.9494377 ,  3.89307932, 21.6780348 , ..., -0.24794872,
        -0.30763854, -0.25027709],
       ...,
       [-0.63991522, -2.20087449, -0.39909728, ..., -1.54905893,
        -1.52116984, -1.43643639],
       [-0.63991522, -2.20087449, -0.39909728, ..., -1.55493905,
        -1.53058553, -1.43913192],
       [-0.63991522, -2.54419583, -0.39909728, ..., -1.54264425,
        -1.50389522, -1.44392142]])

In [90]:
#Verifycation of standardizatio
stdCorrect = 1
for i in stdColumnX.T:
    mean = float("{:.4f}".format(i.mean()))
    std = float("{:.4f}".format(i.std()))
    if mean != 0 or std != 1:
        stdCorrect = 0
if stdCorrect:
    print("L2 standardizatio in column is correct.")

L2 standardizatio in column is correct.


# Task 2
### Question 2.1

In [149]:
# prepare train data
numpyY = Y.drop(['TimeStamp'], axis=1).copy().to_numpy() #remove timestamp
trainX, testX, trainY, testY = train_test_split(numpyX, numpyY, train_size = 0.7, shuffle = True) #split the train and test data



In [175]:
def calError(real, pred):
    errors = np.zeros([1,2])
    m = pred.T.shape[1]
    for i in range(pred.T.shape[0]):
        average = real.T[i].mean()
        sums = 0
        for j in range(m):
            sums += abs(real.T[i][j] - pred.T[i][j])
        errors[0][i] = sums / m / average
    return errors

##### Linear regression

In [151]:
regLinear = LinearRegression()
regLinear.fit(trainX, trainY) #train model
predLinearY = regLinear.predict(testX) #predict Y

<font color=blue size=3> Linear regression's accuracy is:</font> <br>

In [176]:
calError(testY, predLinearY)

array([[2356849.12770209, 2687354.00372565]])

##### Random forest regression

In [180]:
regRF = RandomForestRegressor()
regRF.fit(trainX, trainY) #train model
predRFY = regRF.predict(testX) #predict Y



<font color=blue size=3> Random forest regression's accuracy is:</font> <br>

In [181]:
calError(testY, predRFY)

array([[0.02192736, 0.02354291]])

#####  Neural network regression

In [206]:
regNN = MLPRegressor(hidden_layer_sizes = [128, 64, 32], random_state=1, max_iter=500, activation = 'logistic', solver = 'adam', early_stopping = True)
regNN.fit(trainX, trainY)
predNNY = regNN.predict(testX) #predict Y

<font color=blue size=3> Neural network's accuracy is:</font> <br>

In [207]:
calError(testY, predNNY)

array([[0.04329211, 0.04552282]])

##### Naive method 

In [220]:
predNaiveY = np.zeros(testY.shape)
predNaiveY.T[0].fill(trainY.T[0].mean())
predNaiveY.T[1].fill(trainY.T[1].mean())
predNaiveY

array([[ 56.01466907, 112.039958  ],
       [ 56.01466907, 112.039958  ],
       [ 56.01466907, 112.039958  ],
       ...,
       [ 56.01466907, 112.039958  ],
       [ 56.01466907, 112.039958  ],
       [ 56.01466907, 112.039958  ]])

<font color=blue size=3> Naive method 's accuracy is:</font> <br>

In [222]:
calError(testY, predNaiveY)

array([[0.04329714, 0.04554263]])