# Load necessary libs

In [36]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection as ms
from sklearn import dummy, metrics, svm
from scipy.io import arff
import pandas as pd

import warnings
warnings.simplefilter("ignore")

%pylab inline

Populating the interactive namespace from numpy and matplotlib


# Load datasets

In [37]:
#Full datasets
segmentChallenge = pd.DataFrame(arff.loadarff('segment-challenge.arff')[0])
segmentTest = pd.DataFrame(arff.loadarff('segment-test.arff')[0])
diabetes = pd.DataFrame(arff.loadarff('diabetes.arff')[0])
glass = pd.DataFrame(arff.loadarff('glass.arff')[0])

#Classes to classify
segmentChallengeClass = segmentChallenge['class'].astype('str')
segmentTestClass = segmentTest['class'].astype('str')
diabetesClass = diabetes['class'].astype('str')
glassClass = glass['Type'].astype('str')

#Rest data
segmentChallengeData = segmentChallenge.drop(columns = ['class'])
segmentTestData = segmentTest.drop(columns = ['class'])
diabetesData = diabetes.drop(columns = ['class'])
glassData = glass.drop(columns = ['Type'])

#Split data for train and test 
diabetesTrainData,diabetesTestData,diabetesTrainClass,diabetesTestClass = ms.train_test_split(diabetesData, diabetesClass, test_size = 0.3)
glassTrainData, glassTestData, glassTrainClass, glassTestClass  = ms.train_test_split(glassData,glassClass, test_size = 0.3)

# Set classifiers

In [38]:
#n_jobs = 4 is for two real computer kernels 
votingClassifiers = [('lr', LogisticRegressionCV(cv=10, n_jobs=4)),
                     ('rf', RandomForestClassifier(n_jobs=4, n_estimators=100, random_state=0)),
                     ('gs', GaussianNB())]

classifiers = {
    'Baseline'               : dummy.DummyClassifier(),
    'LogisticRegressionCV'   : LogisticRegressionCV(cv = 10, n_jobs = 4),
    'SVM'                    : svm.SVC(gamma='scale'),
    'DecisionTreeClassifier' : DecisionTreeClassifier(random_state = 0),
    'NaiveBayes'             : GaussianNB(),
    'RandomForestClassifier' : RandomForestClassifier(n_jobs = 4),
    'AdaBoostClassifier'     : AdaBoostClassifier(n_estimators = 50),
    'VotingClassifier'       : VotingClassifier(estimators = votingClassifiers, voting='soft')
}

# Set params for classifiers

In [39]:
params = {
    'Baseline': [{}],
    
    'LogisticRegressionCV': 
    [{ 
        'solver': ['newton-cg', 'lbfgs', 'sag']
    }],
    
    'SVM': 
    [{
        'max_iter': [100],
        'kernel'  : ['linear'],
        'C'       : [1, 2, 4, 16]
    }],
    
    'DecisionTreeClassifier': 
    [{  
        'criterion'        : ['gini', 'entropy'],
        'splitter'         : ['best', 'random'],
        'min_samples_split': [2, 4, 8],
        'min_samples_leaf' : [1, 2, 3]
    }],
    
    'NaiveBayes': 
    [{
        'var_smoothing': [1e-9, 1e-8, 1e-7]
    }],
    
    'RandomForestClassifier': 
    [{
        'criterion'       :['entropy', 'gini'],
        'n_estimators'    :[10, 100],
        'min_samples_leaf':[2, 4],
        'bootstrap'       :[True, False],
        'verbose'         :[0, 1]
    }],
    
    'AdaBoostClassifier': 
    [{
        'n_estimators'    : [50, 100],
        'learning_rate'   : [1, 0.75],
        'algorithm'       : ['SAMME', 'SAMME.R'],
        'random_state'    : [0, 1]
    }],
    
    'VotingClassifier': 
    [{
        'voting': ['soft', 'hard'],
        'flatten_transform':[True, False]
    }]
}

# Start

In [54]:
scorer = metrics.make_scorer(metrics.f1_score)

#print('SEGMENT CHALLENGE')
#for classifierName, classifierFoo in Classifiers.items():
#        paramSet = params[classifierName]
#        grid = ms.GridSearchCV(classifierFoo, paramSet, refit=True, scoring=scorer, cv=ms.StratifiedKFold(n_splits=10))
#        result = grid.fit(segmentChallengeData, segmentChallengeClass)
#        predictions = result.predict(segmentTestData)
#        f1 = metrics.f1_score(segmentTestClass, predictions, average="weighted")
#        print("{} ---> {:.3f}".format(classifierName,f1));

print('SEGMENT CHALLENGE')
for classifierName, classifierFoo in Classifiers.items():
        paramSet = params[classifierName]
        grid = ms.GridSearchCV(classifierFoo, paramSet, refit=True, scoring=scorer, cv=ms.StratifiedKFold(n_splits=10))
        result = grid.fit(segmentChallengeData, segmentChallengeClass)
        predictions = result.predict(segmentTestData)
        f1 = metrics.f1_score(segmentTestClass, predictions, average="weighted")
        print("{} ---> {:.3f}".format(classifierName,f1));

#print('DIABETES')
#for classifierName, classifierFoo in Classifiers.items():
#        paramSet = params[classifierName]
#        grid = ms.GridSearchCV(classifierFoo, paramSet, refit=True, scoring=scorer, cv=ms.StratifiedKFold(n_splits=10))
#        result = grid.fit(diabetesTrainData, diabetesTrainClass)
#        predictions = result.predict(diabetesTestData)
#        f1 = metrics.f1_score(diabetesTestClass, predictions, average="weighted")
#        print("{} ---> {:.3f}".format(classifierName,f1)); 
#print('GLASS')
#for classifierName, classifierFoo in Classifiers.items():
#        paramSet = params[classifierName]
#        grid = ms.GridSearchCV(classifierFoo, paramSet, refit=True, scoring=scorer, cv=ms.StratifiedKFold(n_splits=10))
#        result = grid.fit(glassTrainData, glassTrainClass)
#        predictions = result.predict(glassTestData)
#        f1 = metrics.f1_score(glassTestClass, predictions, average="weighted")
#        print("{} ---> {:.3f}".format(classifierName,f1));

SEGMENT CHALLENGE


ValueError: Target is multiclass but average='binary'. Please choose another average setting.