In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import joblib
# from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd
import os
from collections import defaultdict
import cv2
from sklearn.utils import shuffle
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans





In [2]:
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier


In [3]:
# Target size handed to cv2.resize inside load_images: one tenth of
# 4608 x 2592 using integer division, i.e. (460, 259).
# NOTE(review): presumably (width, height) of the original photos -- confirm.
resize_size = (4608//10, 2592//10)

def load_images(directory):
    """Walk ``directory`` and load every readable image as a resized grayscale array.

    Hidden folders (names starting with '.') are never descended into, and
    hidden files as well as 'desktop.ini' are ignored. Folders and files are
    visited in sorted order so the returned lists do not depend on the order
    in which the filesystem enumerates entries.

    Parameters
    ----------
    directory : str
        Root folder to scan. A trailing path separator is optional.

    Returns
    -------
    (list_target_names, list_images) : tuple of two lists
        ``list_target_names[k]`` is the basename of the folder that contained
        ``list_images[k]``. Each image has been resized to the module-level
        ``resize_size``. Files that cv2.imread cannot read (it returns None)
        are silently skipped.
    """
    list_target_names = []
    list_images = []

    for path, subdirs, files in os.walk(directory):
        # Prune hidden folders in place: os.walk (top-down) then skips them and
        # everything below them, regardless of how `directory` was spelled.
        subdirs[:] = sorted(s for s in subdirs if not s.startswith('.'))

        # Ignore '.directory'-style hidden files and desktop.ini.
        files = sorted(f for f in files
                       if not f.startswith('.') and f != 'desktop.ini')
        print(path, len(files))

        label = os.path.basename(path)
        for name in files:
            image = cv2.imread(os.path.join(path, name), cv2.IMREAD_REDUCED_GRAYSCALE_2)
            if image is None:
                # Not an image cv2 could read; leave it out of the dataset.
                continue
            image = cv2.resize(image, resize_size, interpolation=cv2.INTER_AREA)
            list_target_names.append(label)
            list_images.append(image)

    return list_target_names, list_images

In [4]:
def load_data(directory, random_state=0):
    """Load the image dataset under ``directory`` and split it into train/test sets.

    Side effect: rebinds the module-level global ``d`` to a
    ``defaultdict(int)`` mapping every non-hidden entry of ``directory``
    (sorted by name) to a consecutive integer starting at 0.

    Parameters
    ----------
    directory : str
        Root folder of the dataset; passed to ``os.listdir`` and ``load_images``.
    random_state : int or None, default 0
        Seed used for both the pre-split shuffle and ``train_test_split``.
        The default keeps the split seed at 0 and makes the shuffle
        repeatable as well; pass None for a different ordering on every call.

    Returns
    -------
    Xtrain, Xtest, ytrain, ytest
        Feature arrays with one flattened image per row, and the matching
        label arrays as returned by ``load_images`` (folder names, not the
        integers stored in ``d``). 20% of the samples go to the test split.

    Raises
    ------
    FileNotFoundError
        If ``directory`` does not exist (raised by ``os.listdir``).
    ValueError
        If no readable image was found under ``directory``.
    """
    global d

    # Name -> index lookup for the visible top-level entries.
    d = defaultdict(int)
    visible_entries = [x for x in sorted(os.listdir(directory))
                       if not x.startswith('.')]
    for index, entry in enumerate(visible_entries):
        d[entry] = index

    target_names, images = load_images(directory)
    if not images:
        # Fail early with a clear message instead of an opaque shape-unpacking
        # error further down.
        raise ValueError(
            "No readable images found under {!r}".format(directory))

    # Seeded shuffle: an unseeded one would make the split differ between
    # runs even though train_test_split itself is seeded.
    target_names_shuffled, images_shuffled = shuffle(
        np.array(target_names), np.array(images), random_state=random_state)

    # Flatten every image to a single feature row.
    n_samples = images_shuffled.shape[0]
    images_shuffled = images_shuffled.reshape(n_samples, -1)

    Xtrain, Xtest, ytrain, ytest = train_test_split(
        images_shuffled, target_names_shuffled,
        random_state=random_state, test_size=0.2)

    return Xtrain, Xtest, ytrain, ytest

In [5]:
# NOTE(review): `iris_df` is not referenced by any other cell visible in this
# notebook (the analysis below uses the image dataset) -- confirm whether this
# load is still needed.
iris_df = load_iris()

In [6]:
# Dataset root, relative to the kernel's working directory. The recorded output
# of this cell is a FileNotFoundError for this path, so the folder must be
# present next to the notebook (or the path adjusted) before running.
base_directory = './Dataset_0-5/'
# Train/test feature matrices and label arrays used by every cell below.
X_train, X_test, Y_train, Y_test = load_data(base_directory)



FileNotFoundError: [WinError 3] The system cannot find the path specified: './Dataset_0-5/'

## Training with default/fixed parameters and naive pipeline

In [None]:
def _reduced_pipeline(pca_step, estimator_step, estimator):
    """Return a Pipeline: StandardScaler -> 2-component PCA -> ``estimator``.

    ``pca_step`` and ``estimator_step`` are the step names given to the PCA
    and final stages; the scaling stage is always named 'scalar1'.
    """
    return Pipeline([
        ('scalar1', StandardScaler()),
        (pca_step, PCA(n_components=2)),
        (estimator_step, estimator),
    ])


# One pipeline per candidate model; all share the same preprocessing and
# differ only in the final estimator (left at its defaults unless shown).
pipeline_svc = _reduced_pipeline('pca1', 'svc_classifier', svm.SVC())

pipeline_rf = _reduced_pipeline('pca2', 'rf_classifier', RandomForestClassifier())

pipeline_sgd = _reduced_pipeline('pca3', 'sgd_classifier', SGDClassifier())

pipeline_bayes = _reduced_pipeline('pca4', 'bayes_classifier', GaussianNB())

pipeline_KNN = _reduced_pipeline(
    'pca5', 'KNN_classifier', KNeighborsClassifier(n_neighbors=5))

# KMeans comes from sklearn.cluster; it is wired in like the classifiers above.
pipeline_KMeans = _reduced_pipeline(
    'pca6', 'KMeans_classifier',
    KMeans(n_clusters=6, random_state=0, n_init='auto'))

In [None]:
# Candidate pipelines; their position in this list is the key used in
# `pipe_dict` below.
pipelines = [
    pipeline_svc,
    pipeline_rf,
    pipeline_sgd,
    pipeline_bayes,
    pipeline_KNN,
    pipeline_KMeans,
]

# Running "best so far" state, updated by the model-selection cell.
best_accuracy = 0.0
best_clf = -1  # position of the best pipeline in `pipelines`; -1 = none yet
best_pipeline = ""

# Display name for each position in `pipelines`.
pipe_dict = dict(enumerate([
    'SVC',
    'Random Forest',
    'SGD',
    'Bayes',
    'KNN',
    'KMeans',
]))


In [None]:
# Fit every candidate pipeline on the training split. The same (X_train, Y_train)
# pair is passed to all of them, including the KMeans pipeline.
# NOTE(review): presumably KMeans ignores the labels -- confirm.
for p in pipelines:
    p.fit(X_train, Y_train)

In [None]:
# Print the value of `score` on the held-out split for every fitted pipeline.
# NOTE(review): the recorded output shows a large negative number for KMeans,
# so the "accuracy" label does not describe that entry.
for i, p in enumerate(pipelines):
    pipeline_label = pipe_dict[i]
    held_out_score = p.score(X_test, Y_test)
    print("{} Test accuracy: {}".format(pipeline_label, held_out_score))

SVC Test accuracy: 0.19452054794520549
Random Forest Test accuracy: 0.2136986301369863
SGD Test accuracy: 0.14794520547945206
Bayes Test accuracy: 0.2054794520547945
KNN Test accuracy: 0.1726027397260274
KMeans Test accuracy: -3153550.816000039


In [None]:
# Pick the pipeline with the highest held-out score.
#
# The running-best state is reset here so that re-running this cell (for
# example after refitting) cannot keep a stale winner from an earlier run.
best_accuracy = 0.0
best_clf = -1  # position in `pipelines`; -1 means nothing beat the threshold
best_pipeline = ""

for i, model in enumerate(pipelines):
    # Score once per model; scoring predicts on the whole test split.
    test_score = model.score(X_test, Y_test)
    if test_score > best_accuracy:
        best_accuracy = test_score
        best_pipeline = model
        best_clf = i

if best_clf in pipe_dict:
    print('Classifier with best accuracy: {}'.format(pipe_dict[best_clf]))
else:
    # No pipeline scored above the 0.0 starting threshold; report that instead
    # of failing on a lookup of the -1 placeholder.
    print('No classifier scored above {}'.format(best_accuracy))

Classifier with best accuracy: Random Forest


## Grid Search and Hyperparameter Tuning

In [None]:
# Switch for the grid-search cells below: each of them is wrapped in
# `if TUNING:` and therefore does nothing while this is False.
TUNING = False

In [None]:
if TUNING:
    clf_1 = svm.SVC()
    param_grid_1 = {'C': [0.1, 1],#, 10, 100, 1000],
                'gamma': [1, 0.1],# 0.01,  0.001,  0.0001],
                'kernel': ['rbf']}#, 'linear']}
    grid = GridSearchCV(clf_1, param_grid_1, refit=True, verbose=3)#, n_jobs= 1)

    # fitting the model for grid search
    grid.fit(X_train, Y_train)



In [None]:
if TUNING:

    clf_2 = RandomForestClassifier()

    param_grid_2 = {
        'n_estimators': [25, 50, 100],
        'max_features': ['sqrt', 'log2', None],
        'max_depth': [3, 6, 9],
        'max_leaf_nodes': [3, 6, 9],
    }
    grid = GridSearchCV(clf_2, param_grid_2, refit=True,
                        verbose=3)  # , n_jobs= 1)

    # fitting the model for grid search
    grid.fit(X_train, Y_train)


In [None]:
if TUNING:
        
    df = pd.DataFrame(grid.cv_results_)
    df.to_csv('svc.csv')
    print(grid.best_estimator_)
    print(grid.best_params_)

    df = pd.DataFrame(grid.cv_results_)
    df.to_csv('random_forest.csv')
    print(grid.best_estimator_)
    print(grid.best_params_)