In [13]:
%reload_ext autoreload
%autoreload 2
# Import Libraries
from lib.PreprocessingTK import *
import pandas
import numpy as np

In [14]:
# Read Data with Features
'''
   1. X - x-axis spatial coordinate within the Montesinho park map: 1 to 9
   2. Y - y-axis spatial coordinate within the Montesinho park map: 2 to 9
   3. month - month of the year: "jan" to "dec"
   4. day - day of the week: "mon" to "sun"
   5. FFMC - FFMC index from the FWI system: 18.7 to 96.20
   6. DMC - DMC index from the FWI system: 1.1 to 291.3
   7. DC - DC index from the FWI system: 7.9 to 860.6
   8. ISI - ISI index from the FWI system: 0.0 to 56.10
   9. temp - temperature in Celsius degrees: 2.2 to 33.30
   10. RH - relative humidity in %: 15.0 to 100
   11. wind - wind speed in km/h: 0.40 to 9.40
   12. rain - outside rain in mm/m2 : 0.0 to 6.4
   13. area - the burned area of the forest (in ha): 0.00 to 1090.84
   (this output variable is very skewed towards 0.0, thus it may make
    sense to model with the logarithm transform).
    '''

featureNames = [
    "X",
    "Y",
    "Month",
    "Day",
    "FFMC",
    "DMC",
    "DC",
    "ISI",
    "temp",
    "RH",
    "Wind",
    "Rain",
    "Area"
]

data = pandas.read_csv("../../data/ForestFires/forestfires.data",
                       names=featureNames,
                       skiprows=[0])
# Show original data frame
print("Original Data Frame")
data

Original Data Frame


Unnamed: 0,X,Y,Month,Day,FFMC,DMC,DC,ISI,temp,RH,Wind,Rain,Area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,aug,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44
513,2,4,aug,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29
514,7,4,aug,sun,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16
515,1,4,aug,sat,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.00


In [15]:
# Convert nominal data to categorical using one-hot encoding
nominalFeatures = ["Day", "Month"]
for nominalFeature in nominalFeatures:
    uniqueValues = np.unique(data[nominalFeature])
    convertNominal(data, nominalFeature, uniqueValues, inplace=True)
# Show updated data frame
print("\nData Frame after converting nominal values to categorical using one-hot encoding")
data


Data Frame after converting nominal values to categorical using one-hot encoding


Unnamed: 0,X,Y,Month,Day,FFMC,DMC,DC,ISI,temp,RH,Wind,Rain,Area
0,7,5,0b000010000000,0b0000001,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00
1,7,4,0b010000000000,0b0100000,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00
2,7,4,0b010000000000,0b0000100,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00
3,8,6,0b000010000000,0b0000001,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00
4,8,6,0b000010000000,0b0001000,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,0b000000000010,0b0001000,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44
513,2,4,0b000000000010,0b0001000,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29
514,7,4,0b000000000010,0b0001000,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16
515,1,4,0b000000000010,0b0000100,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.00


In [16]:
# Suggestion: Apply a log transformation to the area column.
# Experiment: Compute skewedness metric to determine skewedness before/after log transformation
def computePearsonModeSkewedness(arr):
    arrMean = np.mean(arr)
    arrMedian = np.median(arr)
    arrStddev = np.std(arr)
    return ((arrMean - arrMedian) / arrStddev)

beforeSkewedness = computePearsonModeSkewedness(list(data["Area"]))
print(f"\nSkewedness before applying log transformation: {beforeSkewedness}")
# Apply the log transformation to the area column.
# data["Area"] = data["Area"].map(lambda x: x+2) # Add one to prevent math domain errors (e.g. log2(0))
# data["Area"] = data["Area"].map(lambda x: math.log(x,2))
afterSkewedness = computePearsonModeSkewedness(list(data["Area"]))
print(f"Skewedness after applying log transformation: {afterSkewedness}")

# Show updated data frame
print("Data Frame after applying a logarithm transformation")
data


Skewedness before applying log transformation: 0.19384294531649837
Skewedness after applying log transformation: 0.19384294531649837
Data Frame after applying a logarithm transformation


Unnamed: 0,X,Y,Month,Day,FFMC,DMC,DC,ISI,temp,RH,Wind,Rain,Area
0,7,5,0b000010000000,0b0000001,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00
1,7,4,0b010000000000,0b0100000,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00
2,7,4,0b010000000000,0b0000100,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00
3,8,6,0b000010000000,0b0000001,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00
4,8,6,0b000010000000,0b0001000,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,0b000000000010,0b0001000,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44
513,2,4,0b000000000010,0b0001000,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29
514,7,4,0b000000000010,0b0001000,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16
515,1,4,0b000000000010,0b0000100,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.00


In [17]:
# Partition data into folds
k = 5
print(f"\nPartition data into {k} folds with train, test, and (Optional) validation sets.")
folds = partition(data, k, classificationColumnId=None)
for i in range(0, len(folds)):
    print(f"Fold {i}, size={len(folds[i])}")


Partition data into 5 folds with train, test, and (Optional) validation sets.
Fold 0, size=104
Fold 1, size=104
Fold 2, size=104
Fold 3, size=104
Fold 4, size=101


In [18]:
# Test our learner
className = "Area"
foldEvaluations = []
for i in range(0,k):
    testingSet = folds.pop(i)
    trainingSet = pandas.concat(folds, ignore_index=True)
    folds.insert(i, testingSet)
    # Make a prediction
    prediction = naivePredictor(trainingSet, testingSet, classificationColId=className, method="regression")
    predicted_scores = [prediction for x in range(0,len(testingSet))]
    # Compose a performance evaluation, based on multiple metrics
    mse = evaluateError(predicted_scores, testingSet[className], method="MSE")
    mae = evaluateError(predicted_scores, testingSet[className], method="MAE")
    r2 = evaluateError(predicted_scores, testingSet[className], method="R2")
    pearson = evaluateError(predicted_scores, testingSet[className], method="pearson")
    foldEvaluation = {
        'MSE' : mse,
        'MAE' : mae,
        'R2': r2,
        'Pearson': pearson
    }
    foldEvaluations.append(foldEvaluation)

print("\nLearning Performance Evaluation")
evalDf = pandas.DataFrame(foldEvaluations)
# evalDf.index.name = 'Fold'
evalDf = evalDf.rename_axis(index=None, columns='Fold')
evalDf.round(2)


Learning Performance Evaluation


Fold,MSE,MAE,R2,Pearson
0,12242.09,26.1,-0.01,0.0
1,291.23,13.08,-0.18,0.0
2,5647.38,20.77,-0.0,0.0
3,832.78,16.18,-0.04,-0.0
4,1216.44,16.94,-0.01,-0.0


In [None]:
import numpy as np
avgMSE = np.mean(evalDf["MSE"])

print(f"Average Mean Squared Error: {avgMSE}")