In [4]:
%reload_ext autoreload
%autoreload 2
# Import Libraries
from lib.PreprocessingTK import *
import pandas
import numpy as np

In [5]:
# Read Data with Features
'''
	Name		Data Type	Meas.	Description
	----		---------	-----	-----------
	Sex		nominal			M, F, and I (infant)
	Length		continuous	mm	Longest shell measurement
	Diameter	continuous	mm	perpendicular to length
	Height		continuous	mm	with meat in shell
	Whole weight	continuous	grams	whole abalone
	Shucked weight	continuous	grams	weight of meat
	Viscera weight	continuous	grams	gut weight (after bleeding)
	Shell weight	continuous	grams	after being dried
	Rings		integer			+1.5 gives the age in years
'''

featureNames = [
    "Sex",
    "Length",
    "Diameter",
    "Height",
    "Whole weight",
    "Shucked weight",
    "Viscera weight",
    "Shell weight",
    "Rings"
]
data = pandas.read_csv("../../data/Abalone/abalone.data",
                       names=featureNames)
# Show original data frame
print("\nOriginal Data Frame")
data


Original Data Frame


Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [6]:

# Convert nominal data to categorical using one-hot encoding
nominalFeatures = ["Sex"]
for nominalFeature in nominalFeatures:
    uniqueValues = np.unique(data[nominalFeature])
    convertNominal(data, nominalFeature, uniqueValues, inplace=True)
# Show updated data frame
print("\nData Frame after converting nominal values to categorical using one-hot encoding")
data


Data Frame after converting nominal values to categorical using one-hot encoding


Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0b100,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,0b100,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,0b001,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,0b100,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,0b010,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,0b001,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,0b100,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,0b100,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,0b001,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [7]:
# Partition data into folds
k = 5
proportions = (0.75, 0.25) # Train / Test proportions
print(f"\nPartition data into {k} folds with train, test, and (Optional) validation sets: Proportions are {str(proportions)})")
folds = partition(data, k, classificationColumnId=None, includeValidationSet=False, proportions=proportions)
for i in range(0, len(folds)):
    print(f"Fold {i}, testSize={len(folds[i][0])}, trainSize={len(folds[i][1])}")


Partition data into 5 folds with train, test, and (Optional) validation sets: Proportions are (0.75, 0.25))
Fold 0, testSize=627, trainSize=209
Fold 1, testSize=627, trainSize=209
Fold 2, testSize=627, trainSize=209
Fold 3, testSize=627, trainSize=209
Fold 4, testSize=624, trainSize=209


In [8]:
# Test our learner
className = "Rings"
foldEvaluations = []
for fold in folds:
    trainingSet = fold[0]
    testingSet = fold[1]
    # Make a prediction
    prediction = naivePredictor(trainingSet, testingSet, classificationColId=className, method="regression")
    predicted_scores = [prediction for x in range(0,len(testingSet))]
    # Compose a performance evaluation, based on multiple metrics
    mse = evaluateError(predicted_scores, testingSet[className], method="MSE")
    mae = evaluateError(predicted_scores, testingSet[className], method="MAE")
    r2 = evaluateError(predicted_scores, testingSet[className], method="R2")
    pearson = evaluateError(predicted_scores, testingSet[className], method="pearson")
    foldEvaluation = {
        'MSE' : mse,
        'MAE' : mae,
        'R2': r2,
        'Pearson': pearson
    }
    foldEvaluations.append(foldEvaluation)

print("\nLearning Performance Evaluation")
evalDf = pandas.DataFrame(foldEvaluations)
# evalDf.index.name = 'Fold'
evalDf = evalDf.rename_axis(index=None, columns='Fold')
evalDf.round(2)



Learning Performance Evaluation


Fold,MSE,MAE,R2,Pearson
0,11.27,2.5,-0.0,0.0
1,9.4,2.28,-0.0,-0.0
2,7.86,2.14,-0.0,0.0
3,9.05,2.29,-0.0,0.0
4,12.41,2.52,-0.01,-0.0
