In [424]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelBinarizer 
from skopt import BayesSearchCV
from skopt.space import Categorical, Real
from tpot import TPOTClassifier

from scipy.io import arff

In [425]:
# Load the data set
raisins = arff.loadarff('./Raisin_dataset/Raisin_Dataset.arff')
raisins = pd.DataFrame(raisins[0])

In [426]:
#Split into X, y
X = raisins.drop('Class', axis=1)
y = raisins['Class'].apply(lambda x: x.decode('utf8')) #Gridsearchcv wouldnt work with arff encoded labels

In [427]:
# Split the data set into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [428]:
# Create an SVC model
svm = SVC()

# Dictionary of parameters for GridSearchCV
parameters = {
  'kernel':['linear', 'rbf', 'sigmoid'],
  'C': [1, 10, 100]
}

In [None]:
# Create a GridSearchCV model
grid = GridSearchCV(svm, parameters)

# Fit the GridSearchCV model to the training data
grid.fit(X_train, y_train)

# Print the model and hyperparameters obtained by GridSearchCV
print(grid.best_estimator_)

In [431]:
# Print a table summarizing the results of GridSearchCV
df = pd.concat([pd.DataFrame(grid.cv_results_['params']), pd.DataFrame(grid.cv_results_['mean_test_score'], columns=['Score'])], axis=1)

print(df)

cv_table = df.pivot(index='kernel', columns='C')
 
print(cv_table)

     C   kernel     Score
0    1   linear  0.857778
1    1      rbf  0.826667
2    1  sigmoid  0.220741
3   10   linear  0.850370
4   10      rbf  0.837037
5   10  sigmoid  0.223704
6  100   linear  0.859259
7  100      rbf  0.848889
8  100  sigmoid  0.223704
            Score                    
C             1         10        100
kernel                               
linear   0.857778  0.850370  0.859259
rbf      0.826667  0.837037  0.848889
sigmoid  0.220741  0.223704  0.223704


In [433]:
#check on test data
print(grid.score(X_test, y_test))

0.8888888888888888


## Baysean optimisation

In [434]:
#create a dictionary to specify prior distributions for each hyperparameter.
search_spaces = {'kernel': Categorical(['linear', 'rbf', 'sigmoid']), 'C': Real(1, 100, prior='uniform')}


#create BayesSearchCV model
bayes = BayesSearchCV(svm, search_spaces, n_iter=10)

#fit the model on the training data
bayes.fit(X_train, y_train)

print(bayes.best_estimator_)

In [437]:
#get score on test data
print(bayes.score(X_test, y_test))

0.8711111111111111


## Tpot

In [438]:
#create a TPOT classifier
tpot = TPOTClassifier(generations=2, population_size=20)

In [439]:
#fit tpot to training data
tpot.fit(X_train, y_train)

In [443]:
#calculate accuracy score of tpot classifier

#score method not working with these labels so do it the long way
print(tpot.predict(X_test))

['Kecimen' 'Besni' 'Besni' 'Kecimen' 'Besni' 'Kecimen' 'Kecimen' 'Kecimen'
 'Besni' 'Kecimen' 'Kecimen' 'Besni' 'Besni' 'Besni' 'Besni' 'Kecimen'
 'Besni' 'Kecimen' 'Kecimen' 'Besni' 'Besni' 'Besni' 'Kecimen' 'Besni'
 'Kecimen' 'Besni' 'Kecimen' 'Besni' 'Besni' 'Kecimen' 'Kecimen' 'Kecimen'
 'Kecimen' 'Besni' 'Besni' 'Besni' 'Kecimen' 'Kecimen' 'Besni' 'Kecimen'
 'Kecimen' 'Besni' 'Kecimen' 'Kecimen' 'Kecimen' 'Besni' 'Besni' 'Besni'
 'Besni' 'Kecimen' 'Besni' 'Besni' 'Besni' 'Besni' 'Kecimen' 'Kecimen'
 'Kecimen' 'Kecimen' 'Besni' 'Besni' 'Besni' 'Kecimen' 'Kecimen' 'Besni'
 'Besni' 'Besni' 'Besni' 'Kecimen' 'Kecimen' 'Kecimen' 'Besni' 'Besni'
 'Kecimen' 'Besni' 'Kecimen' 'Besni' 'Kecimen' 'Kecimen' 'Besni' 'Kecimen'
 'Besni' 'Besni' 'Besni' 'Kecimen' 'Kecimen' 'Kecimen' 'Kecimen' 'Kecimen'
 'Besni' 'Besni' 'Besni' 'Besni' 'Kecimen' 'Kecimen' 'Kecimen' 'Besni'
 'Kecimen' 'Besni' 'Kecimen' 'Kecimen' 'Besni' 'Besni' 'Kecimen' 'Kecimen'
 'Kecimen' 'Besni' 'Kecimen' 'Kecimen' 'Kecimen' 'K

In [454]:
#get pct accuracy from the amount of matching labels with y_test
count = 0
preds = tpot.predict(X_test)
for i in range(len(preds)):
    if preds[i] == y_test.to_list()[i]:
        count += 1

pct_correct = count/len(y_test)*100

print(pct_correct)

85.33333333333334


In [455]:
#export pipeline
tpot.export("tpot_pipeline.py")