In [27]:
%reload_ext autoreload
%autoreload 2

from lib.PreprocessingTK import *
import pandas
import numpy as np

In [28]:
# Column names applied to machine.data, in file order
columnNames = [
    "VendorName", "ModelName",
    "MYCT", "MMIN", "MMAX", "CACH", "CHMIN", "CHMAX",
    "PRP", "ERP",
]
data = pandas.read_csv("../data/Machine/machine.data", names=columnNames)

# Preview the first rows of the freshly loaded frame
print("Original Data Frame")
data.head()

Original Data Frame


Unnamed: 0,VendorName,ModelName,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
0,adviser,32/60,125,256,6000,256,16,128,198,199
1,amdahl,470v/7,29,8000,32000,32,8,32,269,253
2,amdahl,470v/7a,29,8000,32000,32,8,32,220,253
3,amdahl,470v/7b,29,8000,32000,32,8,32,172,253
4,amdahl,470v/7c,29,8000,16000,32,8,16,132,132


In [29]:
# Convert the nominal columns to categorical values using one-hot encoding.
# Both category lists are collected up front, before any column is rewritten in place.
vendorNames, modelNames = (np.unique(data[col]) for col in ("VendorName", "ModelName"))
for columnId, categories in (("VendorName", vendorNames), ("ModelName", modelNames)):
    convertNominal(data, columnId, categories, inplace=True)

# Preview the frame now that the nominal columns have been re-encoded
print("Data Frame after converting nominal values to categorical using one-hot encoding")
data.head()

Data Frame after converting nominal values to categorical using one-hot encoding


Unnamed: 0,VendorName,ModelName,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
0,0b000000000000000000000000000001,0b00000000000000000000000000000000000000000000...,125,256,6000,256,16,128,198,199
1,0b000000000000000000000000000010,0b00000000000000000000000000000000000000000000...,29,8000,32000,32,8,32,269,253
2,0b000000000000000000000000000010,0b00000000000000000000000000000000000000000000...,29,8000,32000,32,8,32,220,253
3,0b000000000000000000000000000010,0b00000000000000000000000000000000000000000000...,29,8000,32000,32,8,32,172,253
4,0b000000000000000000000000000010,0b00000000000000000000000000000000000000000000...,29,8000,16000,32,8,16,132,132


In [30]:
# Discretize the PRP column into 10 bins based on frequency, overwriting it in place.
# (An earlier draft of this cell used dMethod="equal-width" and printed the bin 0 / bin 1 counts.)
# NOTE(review): the saved head() output below shows a reordered index after this call, so
# discretize appears to reorder the rows of `data` as well - confirm before relying on row order.
prpBinning = {"dMethod": "frequency", "bins": 10}
discretize(data, "PRP", xargs=prpBinning, inplace=True)

# Preview the frame after binning
print("Data Frame after discretizing the PRP field into 10 bins of equal frequency (Showing PRP column.)")
data.head()

Data Frame after discretizing the PRP field into 10 bins of equal frequency (Showing PRP column.)


Unnamed: 0,VendorName,ModelName,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
99,0b000000000000100000000000000000,0b00000000000000000000000000000000000000000000...,480,96,512,0,1,1,6,15
181,0b000100000000000000000000000000,0b00000000000000000000000000000000000000000000...,240,512,1000,8,1,3,6,19
102,0b000000000000100000000000000000,0b00000000000000000000000000000000000000000000...,1100,512,1500,0,1,1,6,18
157,0b000000010000000000000000000000,0b00000000000000000100000000000000000000000000...,112,1000,1000,0,1,4,6,19
14,0b000000000000000000000000010000,0b00000000000000000000000000000000000000000000...,350,64,64,0,1,4,6,15


In [31]:
# Partition data into 5 folds with equally sized train and test sets (no validation set.)
folds = partition(data, 5, classificationColumnId=None, includeValidationSet=False, proportions=(0.5,0.5))
print("Partition data into 5 folds with equally sized train and test sets (no validation set.)")
# Index 0 of a fold is read as the training split and index 1 as the test split, which is
# how the demonstration loop below and the evaluation cell (fold[0] / fold[1]) use them.
for i, fold in enumerate(folds):
    print(f"Fold {i}, trainSize={len(fold[0])}, testSize={len(fold[1])}")

# Demonstration: Partition data into 10 folds with train, test, and validation sets with ratio (0.75, 0.15, 0.1)
folds_demo = partition(data, 10, classificationColumnId=None, includeValidationSet=True, proportions=(0.75,0.15, 0.1))
print("Demonstration: Partition data into 10 folds with train, test, and validation sets with ratio (0.75, 0.15, 0.1)")
# With a validation set requested, each fold carries a third element at index 2
for i, fold in enumerate(folds_demo):
    print(f"Fold {i}, trainSize={len(fold[0])}, testSize={len(fold[1])}, validationSize={len(fold[2])}")



Partition data into 5 folds with equally sized train and test sets (no validation set.)
Fold 0, testSize=21, trainSize=21
Fold 1, testSize=21, trainSize=21
Fold 2, testSize=21, trainSize=21
Fold 3, testSize=21, trainSize=21
Fold 4, testSize=20, trainSize=21
Demonstration: Partition data into 10 folds with train, test, and validation sets with ratio (0.75, 0.15, 0.1)
Fold 0, trainSize=15, testSize=3, validationSize=3
Fold 1, trainSize=15, testSize=3, validationSize=3
Fold 2, trainSize=15, testSize=3, validationSize=3
Fold 3, trainSize=15, testSize=3, validationSize=3
Fold 4, trainSize=15, testSize=3, validationSize=3
Fold 5, trainSize=15, testSize=3, validationSize=3
Fold 6, trainSize=15, testSize=3, validationSize=3
Fold 7, trainSize=15, testSize=3, validationSize=3
Fold 8, trainSize=15, testSize=3, validationSize=3
Fold 9, trainSize=15, testSize=3, validationSize=2


In [33]:
# Predict values within each fold, and compute error statistics:
# Mean Squared Error, Mean Absolute Error, Coefficient of Determination, and Pearson Correlation Coefficient
yCol = "ERP"  # target column; used for both the predictor and the ground-truth values

# Result column name -> `method` argument handed to evaluateError
metricMethods = {"MSE": "MSE", "MAE": "MAE", "R2": "R2", "Pearson": "pearson"}

evalRows = []
for fold in folds:
    trainingSet, testingSet = fold[0], fold[1]
    prediction = naivePredictor(trainingSet, testingSet, predictorColId=yCol, method="regression")
    # The single value returned by naivePredictor is repeated once per test row
    predicted_scores = [prediction] * len(testingSet)
    actual_scores = testingSet[yCol]
    # NOTE(review): the predictions are constant within a fold, so the Pearson column cannot
    # carry information (the saved output shows +/-0.0) - treat it as a placeholder.
    evalRows.append({
        name: evaluateError(predicted_scores, actual_scores, method=method)
        for name, method in metricMethods.items()
    })

# One row per fold, one column per metric
evalDf = pandas.DataFrame(evalRows)
evalDf.round(2).head()



Unnamed: 0,MSE,MAE,R2,Pearson
0,53.61,5.67,-0.64,-0.0
1,230.02,10.14,-0.21,-0.0
2,287.27,11.87,-0.15,0.0
3,3479.6,49.53,-1.87,-0.0
4,152266.81,277.48,-0.84,0.0
