In [73]:
import os
from datetime import datetime as dt

import cv2
import joblib
import numpy as np
from mahotas.features import haralick
from skimage.measure import moments_central, moments_normalized
from skimage.measure import moments_hu
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVC

In [74]:
class Identifier:
    """Train and evaluate nodule / non-nodule classifiers on hand-crafted image features.

    Intended call order::

        save_extracted_features(path)   # once; caches features/*.npy
        load_features()
        split_dataset()
        normalize()
        training(kernel)                # writes classifiers/<kernel>_<timestamp>.plk
        calculate_metrics()             # writes scores/results.csv

    All paths (``features/``, ``classifiers/``, ``scores/``) are relative to
    the current working directory; output directories are created on demand.
    """

    def __init__(self):
        # Full feature matrix / label vector as read by load_features().
        self.features = None
        self.labels = None
        # Train/test partitions filled in by split_dataset().
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        # Scaler fitted by normalize(); kept so new samples can be transformed
        # the same way at prediction time.
        self.scaler = None
        self.cross_val = StratifiedKFold(n_splits=5)
        self.optimized_classifier = None

    @staticmethod
    def load_images_from_folder(folder):
        """Return every readable image in ``folder`` as a grayscale array.

        Files that ``cv2.imread`` cannot decode (it returns ``None``) are
        skipped. Entries are visited in sorted order so the result does not
        depend on the file system's listing order.
        """
        images = list()
        for filename in sorted(os.listdir(folder)):
            img = cv2.imread(os.path.join(folder, filename), cv2.IMREAD_GRAYSCALE)
            if img is not None:
                images.append(img)
        return images

    @staticmethod
    def extract_features(image):
        """Return one 1-D feature vector: 7 Hu moments followed by Haralick features.

        ``moments_hu`` expects *normalized central moments*, not pixels, so the
        image is first reduced with ``moments_central`` and
        ``moments_normalized``.

        NOTE: feature files cached from a version that fed raw pixels to
        ``moments_hu`` are not comparable with these values and must be
        regenerated with ``save_extracted_features``.
        """
        normalized_moments = moments_normalized(moments_central(image))
        return np.r_[moments_hu(normalized_moments), haralick(image).flatten()]

    def save_extracted_features(self, bboxes_path):
        """Extract features for every image under ``bboxes_path`` and cache them.

        Expects two sub-folders, ``nodules`` (label 1) and ``non-nodules``
        (label 0). Writes ``features/features.npy`` and ``features/labels.npy``.

        :param bboxes_path: directory containing the two class sub-folders;
            a trailing slash is optional.
        """
        positive_instance = self.load_images_from_folder(os.path.join(bboxes_path, 'nodules'))
        negative_instance = self.load_images_from_folder(os.path.join(bboxes_path, 'non-nodules'))

        # Build a single list so one empty class cannot produce arrays of
        # mismatched rank; positives first, matching the label order below.
        feature_rows = [self.extract_features(image)
                        for image in positive_instance + negative_instance]
        features = np.array(feature_rows)
        labels = np.r_[np.ones(len(positive_instance)), np.zeros(len(negative_instance))]

        os.makedirs('features', exist_ok=True)
        np.save('features/features.npy', features)
        np.save('features/labels.npy', labels)

    def load_features(self):
        """Load the cached feature matrix and labels into the instance."""
        self.features = np.load('features/features.npy')
        self.labels = np.load('features/labels.npy')

    def split_dataset(self, random_state=None):
        """Create a stratified 80/20 train/test split of the loaded features.

        :param random_state: optional seed forwarded to ``train_test_split``;
            the default ``None`` gives a different split on every call.
        :raises RuntimeError: if ``load_features`` has not been called.
        """
        if self.features is None or self.labels is None:
            raise RuntimeError('No features loaded; call load_features() first.')

        X_train, X_test, y_train, y_test = train_test_split(self.features,
                                                            self.labels,
                                                            stratify=self.labels,
                                                            test_size=0.2,
                                                            random_state=random_state)
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

    def normalize(self):
        """Robust-scale both partitions using statistics from the training set only.

        The fitted scaler is stored in ``self.scaler``.

        :raises RuntimeError: if ``split_dataset`` has not been called.
        """
        if self.X_train is None or self.X_test is None:
            raise RuntimeError('No train/test split; call split_dataset() first.')

        scaler = RobustScaler()
        self.X_train = scaler.fit_transform(self.X_train)
        self.X_test = scaler.transform(self.X_test)
        self.scaler = scaler

    def training(self, kernel, scoring='f1_weighted', random_state=None):
        """Grid-search a classifier on the training split and persist the best one.

        ``kernel == 'linear'`` uses a hinge-loss ``SGDClassifier`` (a linear
        SVM); any other value is passed to ``SVC`` as its kernel. The best
        estimator is dumped to ``classifiers/<kernel>_<timestamp>.plk``.

        :param kernel: ``'linear'`` or an ``SVC`` kernel name.
        :param scoring: scorer name used by ``GridSearchCV``.
        :param random_state: optional seed for the SGD classifier's shuffling.
        """
        if kernel != 'linear':
            default_classifier = SVC(class_weight='balanced',
                                     decision_function_shape='ovo', cache_size=4000)

            # Built-in float instead of the removed np.float alias.
            default_params = {'C': np.reciprocal(np.arange(1, 10).astype(float)),
                              'kernel': [kernel], 'gamma': ['scale']}
            # Only sweep hyper-parameters the chosen kernel actually reads;
            # sweeping ignored ones multiplies the fit count for no gain.
            if kernel in ('poly', 'sigmoid'):
                default_params['coef0'] = np.arange(0, 10, 0.1)
            if kernel == 'poly':
                default_params['degree'] = range(1, 10)
        else:
            n = self.X_train.shape[0]
            # penalty='elasticnet' is required for l1_ratio to have any effect;
            # max_iter must be an integer (about 10**6 sample visits in total).
            default_classifier = SGDClassifier(loss='hinge',
                                               penalty='elasticnet',
                                               class_weight='balanced',
                                               max_iter=int(np.ceil(10**6 / n)),
                                               shuffle=True,
                                               random_state=random_state)

            # linspace keeps both end points exactly inside [0, 1].
            default_params = {'alpha': 10.0**-np.arange(1, 7),
                              'l1_ratio': np.linspace(0.0, 1.0, 1001)}

        grid_search = GridSearchCV(default_classifier,
                                   param_grid=default_params,
                                   cv=self.cross_val, scoring=scoring,
                                   verbose=3, n_jobs=4)

        grid_search.fit(self.X_train, self.y_train)
        print('Best score: {}'.format(grid_search.best_score_))
        print('Best parameters: {}'.format(grid_search.best_params_))

        # NOTE: ':' in the timestamp is not a valid file-name character on
        # Windows; the format is kept because calculate_metrics() parses it.
        now = dt.now().strftime('%Y-%m-%d_%H:%M:%S')
        os.makedirs('classifiers', exist_ok=True)
        joblib.dump(grid_search.best_estimator_, 'classifiers/{}_{}.plk'.format(kernel, now))

    def load_optimized_classifier(self, classifier_path):
        """Load a persisted classifier into ``self.optimized_classifier`` and print its parameters.

        SECURITY: ``joblib.load`` unpickles the file and can execute arbitrary
        code; only load files produced by a trusted source.
        """
        self.optimized_classifier = joblib.load(classifier_path)
        print('Parameters: {}'.format(self.optimized_classifier.get_params()))

    def get_metrics(self, classifier):
        """Cross-validate ``classifier`` on the training split.

        :returns: ``(balanced_accuracy_mean, balanced_accuracy_std,
            f1_weighted_mean, f1_weighted_std)`` across the CV folds.
        """
        scores = cross_validate(classifier, self.X_train, self.y_train,
                                cv=self.cross_val, n_jobs=-1,
                                scoring=['balanced_accuracy', 'f1_weighted'])
        return (scores['test_balanced_accuracy'].mean(),
                scores['test_balanced_accuracy'].std(),
                scores['test_f1_weighted'].mean(),
                scores['test_f1_weighted'].std())

    def calculate_metrics(self):
        """Score every ``*.plk`` classifier in ``./classifiers`` and write ``scores/results.csv``.

        File names are expected to look like ``<kernel>_<day>_<hour>.plk``;
        names with fewer parts get empty ``day`` / ``hour`` cells so every row
        has the same number of columns as the header.
        """
        extension = '.plk'
        file_names = sorted(entry for entry in os.listdir('./classifiers')
                            if entry.endswith(extension))

        headers = ['file_name', 'day', 'hour', 'balanced_accuracy', 'std', 'f1_weighted', 'std']

        os.makedirs('scores', exist_ok=True)
        with open('scores/results.csv', 'w') as file:
            file.write(','.join(headers) + '\n')
            for file_name in file_names:
                # Load one model at a time instead of holding all of them in memory.
                classifier = joblib.load(os.path.join('./classifiers', file_name))
                data = [str(metric) for metric in self.get_metrics(classifier)]

                name_fields = file_name[:-len(extension)].split('_', 2)
                name_fields += [''] * (3 - len(name_fields))
                file.write(','.join(name_fields + data) + '\n')

In [75]:
# Pipeline object shared by every cell below; nothing is loaded yet.
# NOTE(review): the name looks like a typo for IDENTIFIER; kept because all
# later cells refer to it by this spelling.
IDENTIIER = Identifier()

In [76]:
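# One-time step: extract features from 'bbox_dataset/' and cache them as
# features/features.npy and features/labels.npy. Uncomment and run only when
# the cached files are missing or need to be regenerated.
# IDENTIIER.save_extracted_features('bbox_dataset/')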

In [77]:
# Read the cached features/features.npy and features/labels.npy into the object.
IDENTIIER.load_features()

In [78]:
# Stratified split with test_size=0.2; no seed is passed here, so the
# partition differs between runs.
IDENTIIER.split_dataset()

In [79]:
# Fit RobustScaler on the training partition only, then transform both partitions.
IDENTIIER.normalize()

In [86]:
# Cross-validate every saved classifier in ./classifiers on the training
# partition and write the summary to scores/results.csv.
# NOTE(review): this cell's execution count (86) is higher than the training
# cell below (85), i.e. the cells were run out of order; on a fresh
# Restart & Run All this runs before any new classifier is trained.
IDENTIIER.calculate_metrics()

In [85]:
# Grid-search an SVC with a polynomial kernel (default scoring 'f1_weighted');
# the best estimator is dumped to classifiers/poly_<timestamp>.plk.
IDENTIIER.training('poly')

Fitting 5 folds for each of 8100 candidates, totalling 40500 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  56 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 2808 tasks      | elapsed:    6.5s
[Parallel(n_jobs=4)]: Done 7928 tasks      | elapsed:   18.0s
[Parallel(n_jobs=4)]: Done 15096 tasks      | elapsed:   34.7s
[Parallel(n_jobs=4)]: Done 24312 tasks      | elapsed:   56.2s
[Parallel(n_jobs=4)]: Done 35576 tasks      | elapsed:  1.4min
Best score: 0.782930402930403
Best parameters: {'C': 0.5, 'coef0': 1.8, 'degree': 2, 'gamma': 'scale', 'kernel': 'poly'}
[Parallel(n_jobs=4)]: Done 40500 out of 40500 | elapsed:  1.6min finished


In [None]:
# NOTE(review): training() saves files as '<kernel>_<timestamp>.plk', so this
# exact path is not produced by the cells above — confirm the file exists.
IDENTIIER.load_optimized_classifier('classifiers/linear.plk')