In [21]:
import os

import cv2
import joblib
import numpy as np

from mahotas.features import haralick
from skimage.measure import moments_central, moments_normalized
from skimage.measure import moments_hu

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVC

In [22]:
class Identifier:
    """Nodule / non-nodule patch classification pipeline.

    Bundles feature extraction (Hu moments + Haralick texture features),
    persistence of the feature matrix, a stratified train/test split, robust
    scaling and a grid-searched SVM (kernel ``SVC``, or a linear SVM trained
    with ``SGDClassifier`` when ``kernel == 'linear'``).

    Typical call order::

        save_extracted_features(path)   # once, writes features/*.npy
        load_features()
        split_dataset()
        normalize()
        training(kernel)
    """

    def __init__(self):
        self.features = None
        self.labels = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        # Scaler fitted by normalize(); kept so unseen samples can be scaled the same way.
        self.scaler = None
        self.cross_val = StratifiedKFold(n_splits=5)
        self.optimized_classifier = None

    @staticmethod
    def load_images_from_folder(folder):
        """Read every decodable image in ``folder`` as a grayscale array.

        Files that OpenCV cannot decode (``cv2.imread`` returns ``None``) are
        skipped. File names are visited in sorted order so the resulting list,
        and therefore the row order of the feature matrix, is reproducible
        across runs and platforms (``os.listdir`` order is arbitrary).
        """
        images = []
        for filename in sorted(os.listdir(folder)):
            img = cv2.imread(os.path.join(folder, filename), cv2.IMREAD_GRAYSCALE)
            if img is not None:
                images.append(img)
        return images

    @staticmethod
    def extract_features(image):
        """Return the 1-D feature vector of one grayscale image.

        The vector is the Hu moments followed by the flattened Haralick
        texture features.

        NOTE: ``moments_hu`` takes the *normalized central moments* of the
        image, not the pixel array itself, so they are computed explicitly
        here. Feature files written with the raw-image call hold different
        Hu values and must be regenerated with ``save_extracted_features``.
        """
        normalized_moments = moments_normalized(moments_central(image))
        return np.r_[moments_hu(normalized_moments), haralick(image).flatten()]

    def save_extracted_features(self, bboxes_path):
        """Extract features for both classes and save them under ``features/``.

        Parameters
        ----------
        bboxes_path : str
            Directory containing the ``nodules`` (label 1) and ``non-nodules``
            (label 0) image sub-folders. A trailing separator is optional.

        Raises
        ------
        ValueError
            If either sub-folder yields no readable image.
        """
        positive_instance = self.load_images_from_folder(os.path.join(bboxes_path, 'nodules'))
        negative_instance = self.load_images_from_folder(os.path.join(bboxes_path, 'non-nodules'))

        # An empty class would make the row-wise stacking below fail with an
        # unrelated shape error, so report the real cause instead.
        if not positive_instance or not negative_instance:
            raise ValueError(
                'No readable images for at least one class under {!r}'.format(bboxes_path))

        positive_features = np.array([self.extract_features(img) for img in positive_instance])
        negative_features = np.array([self.extract_features(img) for img in negative_instance])

        features = np.r_[positive_features, negative_features]
        labels = np.r_[np.ones(len(positive_instance)), np.zeros(len(negative_instance))]

        os.makedirs('features', exist_ok=True)
        np.save('features/features.npy', features)
        np.save('features/labels.npy', labels)

    def load_features(self):
        """Load the feature matrix and label vector written by ``save_extracted_features``."""
        self.features = np.load('features/features.npy')
        self.labels = np.load('features/labels.npy')

    def split_dataset(self, test_size=0.2, random_state=None):
        """Create a stratified train/test split of the loaded features.

        Parameters
        ----------
        test_size : float, default 0.2
            Fraction of the samples held out for testing.
        random_state : int or None, default None
            Seed forwarded to ``train_test_split``; pass an int for a
            reproducible split. ``None`` keeps the unseeded behaviour.
        """
        X_train, X_test, y_train, y_test = train_test_split(self.features,
                                                            self.labels,
                                                            stratify=self.labels,
                                                            test_size=test_size,
                                                            random_state=random_state)
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

    def normalize(self):
        """Scale the split with a ``RobustScaler`` fitted on the training set only.

        The fitted scaler is stored in ``self.scaler``. The method overwrites
        ``X_train`` / ``X_test`` in place, so call it once per split.
        """
        self.scaler = RobustScaler()
        self.X_train = self.scaler.fit_transform(self.X_train)
        self.X_test = self.scaler.transform(self.X_test)

    @staticmethod
    def _svc_param_grid(kernel):
        """Build the SVC grid, including only hyper-parameters ``kernel`` actually uses."""
        # Builtin float: the np.float alias was removed from NumPy.
        grid = {'C': np.reciprocal(np.arange(1, 10).astype(float)),
                'kernel': [kernel], 'gamma': ['scale']}
        # coef0 only matters for 'poly' and 'sigmoid', degree only for 'poly';
        # gridding them for other kernels multiplies the fits without changing any model.
        if kernel in ('poly', 'sigmoid'):
            grid['coef0'] = np.arange(0, 10, 0.1)
        if kernel == 'poly':
            grid['degree'] = range(1, 10)
        return grid

    @staticmethod
    def _sgd_param_grid():
        """Build the grid for the linear (SGD) SVM."""
        # linspace gives exactly 1001 values with exact 0.0 / 1.0 endpoints,
        # which a float-step arange does not guarantee.
        return {'alpha': 10.0 ** -np.arange(1, 7),
                'l1_ratio': np.linspace(0.0, 1.0, 1001)}

    @staticmethod
    def _sgd_max_iter(n_samples):
        """Return the epoch count giving about 10**6 SGD updates, as an int."""
        return int(np.ceil(10 ** 6 / n_samples))

    def training(self, kernel, scoring='f1_weighted'):
        """Grid-search an SVM on the training split and save the best estimator.

        Parameters
        ----------
        kernel : str
            ``'linear'`` trains a hinge-loss ``SGDClassifier``; any other value
            is passed to ``SVC`` as its kernel.
        scoring : str, default 'f1_weighted'
            Scoring name handed to ``GridSearchCV``.

        The best estimator is stored in ``self.optimized_classifier`` and
        dumped to ``classifiers/<kernel>.plk``.
        """
        if kernel != 'linear':
            default_classifier = SVC(class_weight='balanced',
                                     decision_function_shape='ovo', cache_size=4000)
            default_params = self._svc_param_grid(kernel)
        else:
            # l1_ratio is only used with the elastic-net penalty; with the
            # default 'l2' penalty the l1_ratio grid would have no effect.
            # l1_ratio=0.0 is in the grid, so the pure-L2 model is still searched.
            default_classifier = SGDClassifier(loss='hinge',
                                               penalty='elasticnet',
                                               class_weight='balanced',
                                               max_iter=self._sgd_max_iter(self.X_train.shape[0]),
                                               shuffle=True)
            default_params = self._sgd_param_grid()

        grid_search = GridSearchCV(default_classifier,
                                   param_grid=default_params,
                                   cv=self.cross_val, scoring=scoring,
                                   verbose=3, n_jobs=4)

        grid_search.fit(self.X_train, self.y_train)
        print('Best score: {}'.format(grid_search.best_score_))
        print('Best parameters: {}'.format(grid_search.best_params_))

        self.optimized_classifier = grid_search.best_estimator_

        os.makedirs('classifiers', exist_ok=True)
        # The '.plk' extension is kept as-is: existing files and callers use it.
        joblib.dump(grid_search.best_estimator_, 'classifiers/{}.plk'.format(kernel))

    def load_optimized_classifier(self, classifier_path):
        """Load a classifier saved by ``training`` and print its parameters.

        SECURITY: ``joblib.load`` unpickles the file and can execute arbitrary
        code; only load classifier files from a trusted source.
        """
        self.optimized_classifier = joblib.load(classifier_path)
        print('Parameters: {}'.format(self.optimized_classifier.get_params()))

In [23]:
# Pipeline object shared by all following cells (holds features, split and classifier).
IDENTIIER = Identifier()

In [24]:
# One-off step, kept disabled: uncomment to re-extract the features from
# 'bbox_dataset/' and overwrite features/features.npy and features/labels.npy.
# IDENTIIER.save_extracted_features('bbox_dataset/')

In [25]:
# Read the cached feature matrix and labels from features/*.npy.
IDENTIIER.load_features()

In [26]:
# Stratified 80/20 train/test split of the loaded features.
IDENTIIER.split_dataset()

In [27]:
# RobustScaler fitted on the training split, then applied to both splits.
# Overwrites X_train / X_test, so re-run split_dataset() before running this cell again.
IDENTIIER.normalize()

In [30]:
# Grid-search an SVC with a polynomial kernel (default scoring: f1_weighted);
# the best estimator is written to classifiers/poly.plk.
IDENTIIER.training('poly')

Fitting 5 folds for each of 8100 candidates, totalling 40500 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  40 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 2408 tasks      | elapsed:    4.0s
[Parallel(n_jobs=4)]: Done 7528 tasks      | elapsed:   13.3s
[Parallel(n_jobs=4)]: Done 14696 tasks      | elapsed:   26.4s
[Parallel(n_jobs=4)]: Done 23912 tasks      | elapsed:   40.1s
[Parallel(n_jobs=4)]: Done 35176 tasks      | elapsed:   57.6s


Best score: 0.7821123321123321
Best parameters: {'C': 0.5, 'coef0': 0.9, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}


[Parallel(n_jobs=4)]: Done 40493 out of 40500 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=4)]: Done 40500 out of 40500 | elapsed:  1.1min finished


In [29]:
# Load the saved linear classifier and print its parameters.
# NOTE(review): this cell's execution count (29) is lower than the training cell's (30),
# and no training('linear') call appears in this notebook, so on a fresh kernel this
# only works if classifiers/linear.plk already exists on disk.
IDENTIIER.load_optimized_classifier('classifiers/linear.plk')

Parameters: {'alpha': 0.001, 'average': False, 'class_weight': 'balanced', 'early_stopping': False, 'epsilon': 0.1, 'eta0': 0.0, 'fit_intercept': True, 'l1_ratio': 0.732, 'learning_rate': 'optimal', 'loss': 'hinge', 'max_iter': 24391.0, 'n_iter_no_change': 5, 'n_jobs': None, 'penalty': 'l2', 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'tol': 0.001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
