## random forest

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import LeaveOneOut
from sklearn import metrics    # contains all the effectivness idexes 
import pandas as pd    # for the dataset
from matplotlib import pyplot as plt
from statistics import mean
import numpy as np

#### getting the dataset

In [2]:
labels = ['class', 'spec_number', 'eccentr', 'asp_ratio', 'elong', 'solidity', 'stoch_conv', 'iso_factor', 'max_ind_depth', 'lobedness', 'av_intensity', 'av_contr', 'smooth', 'third_mom', 'unif', 'entropy']

# note that the path is relative not absolute
df = pd.read_csv(r'./leaf/leaf.csv', header = None, names = labels)

# shuffling the dataframe
df = df.sample(frac=1).reset_index()
df = df.iloc[:, 1:17]   # needed to eliminate the old indexes column

display(df)

Unnamed: 0,class,spec_number,eccentr,asp_ratio,elong,solidity,stoch_conv,iso_factor,max_ind_depth,lobedness,av_intensity,av_contr,smooth,third_mom,unif,entropy
0,3,4,0.70668,1.2510,0.38111,0.94226,0.99825,0.692500,0.019432,0.068724,0.031587,0.115020,0.013056,0.005311,0.000086,0.72247
1,27,4,0.72719,1.4779,0.32980,0.99388,1.00000,0.842300,0.002967,0.001602,0.026340,0.081903,0.006663,0.001785,0.000194,0.98050
2,13,6,0.62033,1.3105,0.26312,0.98535,1.00000,0.824780,0.004990,0.004531,0.070042,0.146550,0.021025,0.005802,0.000530,1.70500
3,9,4,0.58938,1.3232,0.40145,0.89009,0.98421,0.439770,0.047907,0.417700,0.114880,0.192110,0.035592,0.009184,0.001542,2.10040
4,15,1,0.41320,1.0384,0.48465,0.78118,0.87018,0.304780,0.080722,1.185900,0.047303,0.126190,0.015674,0.005899,0.000190,1.48900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,5,2,0.88075,1.7360,0.58345,0.83383,0.91754,0.415510,0.040582,0.299730,0.035786,0.100470,0.009993,0.002759,0.000258,1.09520
336,34,11,0.99871,19.0380,0.94834,0.85100,0.90702,0.086183,0.073048,0.971170,0.007817,0.048089,0.002307,0.000753,0.000013,0.34029
337,13,7,0.64536,1.3743,0.28162,0.98646,1.00000,0.844940,0.005427,0.005360,0.026853,0.085949,0.007333,0.002033,0.000128,0.94296
338,24,11,0.52828,1.2496,0.22380,0.98317,1.00000,0.824950,0.010372,0.019580,0.047482,0.102740,0.010444,0.001979,0.001198,1.34100


In [3]:
x = df.iloc[:, 1:16]
y = df.iloc[:, 0]

#### evaluation of the technique using cross validation

In [6]:
rf = RandomForestClassifier(n_estimators=500, criterion='gini', max_features='sqrt')

In [8]:
# k-fold cross validation

k = 5

effect_cv = cross_validate(rf, x, y, cv=k, scoring=('accuracy', 'roc_auc_ovo', 'roc_auc_ovr'))

print(mean(effect_cv['test_accuracy']))
print(mean(effect_cv['test_roc_auc_ovo']))
print(mean(effect_cv['test_roc_auc_ovr']))

0.7852941176470588
0.9845689655172414
0.9844648494126106


In [10]:
# leave-one-out cross validation

effect_loocv = cross_validate(rf, x, y, cv=LeaveOneOut(), scoring='accuracy')

print(mean(effect_loocv['test_score']))

0.7705882352941177
