In [7]:
%reload_ext autoreload
%autoreload 2
# Import Libraries
from lib.PreprocessingTK import *
import pandas
import numpy as np


In [8]:
# Read Data with Features
'''
   1. vendor name: 30
      (adviser, amdahl,apollo, basf, bti, burroughs, c.r.d, cambex, cdc, dec,
       dg, formation, four-phase, gould, honeywell, hp, ibm, ipl, magnuson,
       microdata, nas, ncr, nixdorf, perkin-elmer, prime, siemens, sperry,
       sratus, wang)
   2. Model Name: many unique symbols
   3. MYCT: machine cycle time in nanoseconds (integer)
   4. MMIN: minimum main memory in kilobytes (integer)
   5. MMAX: maximum main memory in kilobytes (integer)
   6. CACH: cache memory in kilobytes (integer)
   7. CHMIN: minimum channels in units (integer)
   8. CHMAX: maximum channels in units (integer)
   9. PRP: published relative performance (integer)
  10. ERP: estimated relative performance from the original article (integer)
  '''

featureNames = [
    "VendorName",
    "ModelName",
    "MYCT",
    "MMIN",
    "MMAX",
    "CACH",
    "CHMIN",
    "CHMAX",
    "PRP",
    "ERP"
]

data = pandas.read_csv("../../data/Machine/machine.data",
                       names=featureNames)

# Show original data frame
print("\nOriginal Data Frame")
data


Original Data Frame


Unnamed: 0,VendorName,ModelName,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
0,adviser,32/60,125,256,6000,256,16,128,198,199
1,amdahl,470v/7,29,8000,32000,32,8,32,269,253
2,amdahl,470v/7a,29,8000,32000,32,8,32,220,253
3,amdahl,470v/7b,29,8000,32000,32,8,32,172,253
4,amdahl,470v/7c,29,8000,16000,32,8,16,132,132
...,...,...,...,...,...,...,...,...,...,...
204,sperry,80/8,124,1000,8000,0,1,8,42,37
205,sperry,90/80-model-3,98,1000,8000,32,2,8,46,50
206,sratus,32,125,2000,8000,0,2,14,52,41
207,wang,vs-100,480,512,8000,32,0,0,67,47


In [9]:
# Convert nominal data to categorical using one-hot encoding
# Convert nominal data to categorical using one-hot encoding
nominalFeatures = ["VendorName", "ModelName"]
for nominalFeature in nominalFeatures:
    uniqueValues = np.unique(data[nominalFeature])
    convertNominal(data, nominalFeature, uniqueValues, inplace=True)
# Show updated data frame
print("Data Frame after converting nominal values to categorical using one-hot encoding")
data

Data Frame after converting nominal values to categorical using one-hot encoding


Unnamed: 0,VendorName,ModelName,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
0,0b000000000000000000000000000001,0b00000000000000000000000000000000000000000000...,125,256,6000,256,16,128,198,199
1,0b000000000000000000000000000010,0b00000000000000000000000000000000000000000000...,29,8000,32000,32,8,32,269,253
2,0b000000000000000000000000000010,0b00000000000000000000000000000000000000000000...,29,8000,32000,32,8,32,220,253
3,0b000000000000000000000000000010,0b00000000000000000000000000000000000000000000...,29,8000,32000,32,8,32,172,253
4,0b000000000000000000000000000010,0b00000000000000000000000000000000000000000000...,29,8000,16000,32,8,16,132,132
...,...,...,...,...,...,...,...,...,...,...
204,0b001000000000000000000000000000,0b00000000000000000000000000000000000000000000...,124,1000,8000,0,1,8,42,37
205,0b001000000000000000000000000000,0b00000000000000000000000000000000000000000000...,98,1000,8000,32,2,8,46,50
206,0b010000000000000000000000000000,0b00000000000000000000000000000000000000000000...,125,2000,8000,0,2,14,52,41
207,0b100000000000000000000000000000,0b01000000000000000000000000000000000000000000...,480,512,8000,32,0,0,67,47


In [10]:
# For demonstration purposes: Discretize PRP column into 10 bins based on frequency
print("\nFor demonstration purposes: Discretize PRP column into 10 bins based on frequency")
discretize(data, "PRP", xargs={"dMethod": "frequency", "bins": 10}, inplace=True)
# Show updated data frame
print("Data Frame after discretizing the PRP field into 10 bins of equal frequency (Showing PRP column.)")
data


For demonstration purposes: Discretize PRP column into 10 bins based on frequency
Data Frame after discretizing the PRP field into 10 bins of equal frequency (Showing PRP column.)


Unnamed: 0,VendorName,ModelName,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
99,0b000000000000100000000000000000,0b00000000000000000000000000000000000000000000...,480,96,512,0,1,1,6,15
181,0b000100000000000000000000000000,0b00000000000000000000000000000000000000000000...,240,512,1000,8,1,3,6,19
102,0b000000000000100000000000000000,0b00000000000000000000000000000000000000000000...,1100,512,1500,0,1,1,6,18
157,0b000000010000000000000000000000,0b00000000000000000100000000000000000000000000...,112,1000,1000,0,1,4,6,19
14,0b000000000000000000000000010000,0b00000000000000000000000000000000000000000000...,350,64,64,0,1,4,6,15
...,...,...,...,...,...,...,...,...,...,...
156,0b000000001000000000000000000000,0b00000000000000000000000000000000000000000000...,30,16000,32000,256,16,24,274,603
8,0b000000000000000000000000000010,0b00000000000000000000000000000000000000000000...,23,16000,64000,64,16,32,274,749
198,0b001000000000000000000000000000,0b00000000000000000000000000000000000000000000...,30,8000,64000,96,12,176,274,919
9,0b000000000000000000000000000010,0b00000000000000000000000000000000000000000000...,23,32000,64000,128,32,64,274,1238


In [11]:
# Partition data into folds
k = 5
proportions = (0.75, 0.25) # Train / Test proportions
print(f"\nPartition data into {k} folds with train, test, and (Optional) validation sets: Proportions are {str(proportions)})")
folds = partition(data, k, classificationColumnId=None, includeValidationSet=False, proportions=proportions)
for i in range(0, len(folds)):
    print(f"Fold {i}, testSize={len(folds[i][0])}, trainSize={len(folds[i][1])}")



Partition data into 5 folds with train, test, and (Optional) validation sets: Proportions are (0.75, 0.25))
Fold 0, testSize=31, trainSize=11
Fold 1, testSize=31, trainSize=11
Fold 2, testSize=31, trainSize=11
Fold 3, testSize=31, trainSize=11
Fold 4, testSize=30, trainSize=11


In [12]:
# Test our learner
className = "ERP"
foldEvaluations = []
for fold in folds:
    trainingSet = fold[0]
    testingSet = fold[1]
    prediction = naivePredictor(trainingSet, testingSet, classificationColId="ERP", method="regression")
    predicted_scores = [prediction for x in range(0,len(testingSet))]
    mse = evaluateError(predicted_scores, testingSet[className], method="MSE")
    mae = evaluateError(predicted_scores, testingSet[className], method="MAE")
    r2 = evaluateError(predicted_scores, testingSet[className], method="R2")
    pearson = evaluateError(predicted_scores, testingSet[className], method="pearson")
    foldEvaluation = {
        'MSE' : mse,
        'MAE' : mae,
        'R2': r2,
        'Pearson': pearson
    }
    foldEvaluations.append(foldEvaluation)

print("\nLearning Performance Evaluation")
evalDf = pandas.DataFrame(foldEvaluations)
# evalDf.index.name = 'Fold'
evalDf = evalDf.rename_axis(index=None, columns='Fold')
evalDf.round(2)



Learning Performance Evaluation


Fold,MSE,MAE,R2,Pearson
0,6763.91,75.41,-0.43,-0.0
1,2331.04,46.5,-12.8,-0.0
2,22458.52,129.57,-0.02,0.0
3,7211.24,77.58,-1.91,-0.0
4,9257.92,70.55,-0.01,0.0
