# Start with local import and frame customization

In [None]:
%%javascript
// Replace the output area's scroll check with a function that always answers
// false, whatever the number of output lines.
// Presumably this keeps long outputs (figures, reports) fully expanded instead
// of being wrapped in a scroll box - confirm against the notebook front-end in use.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [None]:
import sys
from IPython.display import HTML
# Make the local `toolbox` package importable.
# The location can be overridden with the CLASSIFICATION_ROOT environment
# variable; the historical absolute path stays the default so existing setups
# keep working. The membership check keeps re-runs of this cell from stacking
# duplicate entries in sys.path.
import os
_classification_root = os.environ.get('CLASSIFICATION_ROOT', '/home/rcendre/classification')
if _classification_root not in sys.path:
    sys.path.append(_classification_root)

# Imports

In [None]:
import os
import sys
import itertools
import webbrowser
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from pandas.io.formats.style import Styler
from scipy.stats import randint as randint
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import davies_bouldin_score
from toolbox.classification.common import Folds, IO, Tools
from toolbox.classification.parameters import ORL, Settings
from toolbox.models.builtin import Applications
from toolbox.transforms.common import PredictorTransform
from toolbox.transforms.labels import OrderedEncoder
from toolbox.transforms.signals import DWTTransform, FilterTransform, ScaleTransform, RatioTransform, FittingTransform
from toolbox.views.common import Views, ViewsTools
from toolbox.views.signals import SignalsViews

# Parameters

In [None]:
# Advanced parameters
# Passed as the third argument of Folds.build_group_folds;
# presumably the number of validation folds - TODO confirm.
validation = 4
# Keys handed to Views.statistics for the dataset overview.
statistics = ORL.get_statistics_keys()
# Default ORL settings object, passed to the plotting helpers.
settings = Settings.get_default_orl()

# Inputs

In [None]:
# Integer wavelength axis 440..959 (stop is exclusive, step defaults to 1).
wavelength = np.arange(440, 960)
# Load the ORL spectra for that wavelength axis.
inputs = ORL.get_spectra(wavelength)
# The 'Pathological' column is used as the classification label.
inputs['Label'] = inputs['Pathological']
# Encoder fitted on the two label names in this explicit order.
label_order = ['Sain', 'Pathological']
label_encoder = OrderedEncoder().fit(label_order)

In [None]:
# Integer-encode the 'Reference' values; the encoded column is used as the
# 'group' key when the folds are built.
group_encoder = LabelEncoder().fit(inputs['Reference'])

In [None]:
# Encode the labels into 'LabelEncode' and the references into 'GroupEncode'.
Tools.transform(inputs, {'datum': 'Label'}, label_encoder, 'LabelEncode')
Tools.transform(inputs, {'datum': 'Reference'}, group_encoder, 'GroupEncode')
# Build the folds from the encoded label and group columns.
# Presumably rows sharing a 'Reference' end up in the same fold - TODO confirm.
Folds.build_group_folds(inputs, {'datum': 'Datum', 'label_encode': 'LabelEncode', 'group': 'GroupEncode'}, validation)

# Statistics

In [None]:
# Presumably sets the figure size used by the following views - TODO confirm.
ViewsTools.plot_size((20, 8))
# Dataset overview for the selected statistics keys; ';' hides the returned value.
Views.statistics(inputs, statistics);

# Mean Analysis

In [None]:
# Mean and deviation view of the raw spectra ('Datum'), split by 'Label'.
raw_signal_keys = {'datum': 'Datum', 'label': 'Label', 'wavelength': 'Wavelength'}
SignalsViews.mean_and_deviation(inputs, raw_signal_keys, settings);

In [None]:
# Mean and deviation view of the raw spectra, drawn once per practitioner.
for practitioner in ('V1', 'V2'):
    practitioner_rows = inputs[inputs['Practitioner'] == practitioner]
    SignalsViews.mean_and_deviation(practitioner_rows, {'datum': 'Datum', 'label': 'Label', 'wavelength': 'Wavelength'}, settings, practitioner);

# Preprocessing

In [None]:
# Step 1: filter the raw spectra into 'Average'.
# Presumably a 5-sample moving average - TODO confirm against FilterTransform.
smoothing = FilterTransform(5, 'avg')
Tools.transform(inputs, {'datum': 'Datum'}, smoothing, 'Average')
# Step 2: scale the filtered spectra into 'Scale', the column read by the later sections.
scaling = ScaleTransform('mean')
Tools.transform(inputs, {'datum': 'Average'}, scaling, 'Scale')

# Preprocess - Mean Analysis

In [None]:
# Mean and deviation view of the preprocessed spectra ('Scale'), split by 'Label'.
scaled_signal_keys = {'datum': 'Scale', 'label': 'Label', 'wavelength': 'Wavelength'}
SignalsViews.mean_and_deviation(inputs, scaled_signal_keys, settings);

In [None]:
# Mean and deviation view of the preprocessed spectra, drawn once per practitioner.
for practitioner in ('V1', 'V2'):
    practitioner_rows = inputs[inputs['Practitioner'] == practitioner]
    SignalsViews.mean_and_deviation(practitioner_rows, {'datum': 'Scale', 'label': 'Label', 'wavelength': 'Wavelength'}, settings, practitioner);

# Ratios Evaluation

## Model

In [None]:
# Linear SVM with balanced class weights and probability estimates enabled.
svm_options = dict(kernel='linear', class_weight='balanced', probability=True)
model = SVC(**svm_options)
# Five log-spaced candidates for C: 0.01, 0.1, 1, 10, 100.
grid = {'C': np.geomspace(0.01, 100, 5).tolist()}

## Literature ratios
The 540 over 575 nm and 545 over 575 nm ratios seem relevant

In [None]:
# Wavelength pairs taken from the literature, computed on the preprocessed spectra.
literature_ratios = [(540, 575), (545, 575)]
literature_transform = RatioTransform(ratios=literature_ratios, wavelength=wavelength)
Tools.transform(inputs, {'datum': 'Scale'}, literature_transform, 'LRatios')

In [None]:
# Evaluate the shared SVM on the literature ratios; the report cell reads the
# outcome back through the 'LRatios_SVM' key.
lratios_keys = {'datum': 'LRatios', 'label_encode': 'LabelEncode'}
Tools.evaluate(inputs, lratios_keys, model, 'LRatios_SVM', grid=grid)

In [None]:
# Build the report for the 'LRatios_SVM' evaluation, then render it as an HTML table.
lratios_report = Views.report(inputs, {'label_encode': 'LabelEncode', 'eval': 'LRatios_SVM'}, label_encoder)
lratios_html = ViewsTools.dataframe_renderer(lratios_report,
                                             title='Test - 540/575 and 545/575 Ratios classification performance')
HTML(lratios_html)

## Anova

In [None]:
# Anova views computed on the raw spectra ('Datum').
# NOTE(review): 'label_encode' points at the textual 'Label' column here, while
# every other cell maps it to 'LabelEncode' - confirm this is intended.
anova_keys = {'datum': 'Datum', 'wavelength': 'Wavelength', 'label_encode': 'Label'}
# Each call receives its own copy of the key mapping.
SignalsViews.analysis(inputs, dict(anova_keys), mode='Anova');
SignalsViews.analysis_relation(inputs, dict(anova_keys), scale='log', mode='Anova');

## Evaluate

In [None]:
# Alternative wavelength pairs (presumably picked from the Anova views - TODO confirm),
# computed on the preprocessed spectra.
selected_ratios = [(490, 525), (490, 560), (490, 590), (600, 620)]
selected_transform = RatioTransform(ratios=selected_ratios, wavelength=wavelength)
Tools.transform(inputs, {'datum': 'Scale'}, selected_transform, 'ORatios')

In [None]:
# Evaluate the shared SVM on the 'ORatios' features; the report and ROC cells
# read the outcome back through the 'ORatios_SVM' key.
oratios_keys = {'datum': 'ORatios', 'label_encode': 'LabelEncode'}
Tools.evaluate(inputs, oratios_keys, model, 'ORatios_SVM', grid=grid)

In [None]:
# Build the report for the 'ORatios_SVM' evaluation, then render it as an HTML table.
oratios_report = Views.report(inputs, {'label_encode': 'LabelEncode', 'eval': 'ORatios_SVM'}, label_encoder)
oratios_html = ViewsTools.dataframe_renderer(oratios_report,
                                             title='Test - 490/(525,560,590) and 600/620 Ratios classification performance')
HTML(oratios_html)

In [None]:
# ROC curves for the 'ORatios_SVM' evaluation.
oratios_view = ViewsTools.data_as(inputs, 'ORatios_SVM')
oratios_roc_keys = {'label_encode': 'LabelEncode', 'eval': 'ORatios_SVM'}
Views.receiver_operator_curves(oratios_view, label_encoder, oratios_roc_keys, settings);

# PCA Evaluation

## Quick Analysis

In [None]:
# Fit a PCA with all components on the preprocessed spectra to see how many
# components are needed to reach a given share of the variance.
whole_pca = PCA().fit(np.array(inputs['Scale'].tolist()))
# Accumulate the exact ratios: rounding every ratio before summing lets the
# rounding errors pile up, which can keep the curve below the highest
# threshold and make argmax fall back to 0.
cumul = np.cumsum(whole_pca.explained_variance_ratio_) * 100
# Zero-based index of the first component whose cumulative variance exceeds
# each threshold.
over_95 = np.argmax(cumul > 95)
over_99 = np.argmax(cumul > 99)
over_995 = np.argmax(cumul > 99.5)
# cumul[k] is the variance explained by k + 1 components, so the curve and the
# markers are drawn against 1-based component counts to match the x label.
component_counts = np.arange(1, len(cumul) + 1)
plt.plot(component_counts, cumul)
for first_index, threshold in ((over_95, 95), (over_99, 99), (over_995, 99.5)):
    needed = first_index + 1
    plt.plot([0, needed, needed], [threshold, threshold, 0])
plt.xlim(left=0)
plt.ylim(bottom=80)
plt.ylabel('% Variance Explained')
plt.xlabel('# of Features')
# Trailing ';' hides the Text repr of the title.
plt.title('PCA Analysis');

## Model

In [None]:
def _pca_svm(explained_variance):
    """Return a PCA -> linear SVM pipeline keeping the given share of variance.

    `explained_variance` is forwarded to PCA as `n_components`; the classifier
    step is a linear SVC with balanced class weights and probability estimates.
    """
    return Pipeline([('pca', PCA(n_components=explained_variance)),
                     ('clf', SVC(kernel='linear', class_weight='balanced', probability=True))])


# One pipeline per variance threshold: 95 %, 99 % and 99.5 %.
pca95 = _pca_svm(0.95)
pca99 = _pca_svm(0.99)
pca995 = _pca_svm(0.995)
# Five log-spaced candidates for the C of the 'clf' step: 0.01, 0.1, 1, 10, 100.
grid_pca = {'clf__C': np.geomspace(0.01, 100, 5).tolist()}

## Evaluate

In [None]:
# Evaluate each PCA + SVM pipeline on the preprocessed spectra; every
# evaluation key is paired with the pipeline of the matching variance threshold.
Tools.evaluate(inputs, {'datum': 'Scale', 'label_encode': 'LabelEncode'}, pca95, 'PCA95_SVM', grid=grid_pca)
Tools.evaluate(inputs, {'datum': 'Scale', 'label_encode': 'LabelEncode'}, pca99, 'PCA99_SVM', grid=grid_pca)
# The 99.5 % evaluation must use pca995; it previously reused pca99, so its
# report duplicated the 99 % results.
Tools.evaluate(inputs, {'datum': 'Scale', 'label_encode': 'LabelEncode'}, pca995, 'PCA995_SVM', grid=grid_pca)

In [None]:
# Build the report for the 'PCA95_SVM' evaluation, then render it as an HTML table.
pca95_report = Views.report(inputs, {'label_encode': 'LabelEncode', 'eval': 'PCA95_SVM'}, label_encoder)
pca95_html = ViewsTools.dataframe_renderer(pca95_report, title='Test - PCA 95% classification performance')
HTML(pca95_html)

In [None]:
# Build the report for the 'PCA99_SVM' evaluation, then render it as an HTML table.
pca99_report = Views.report(inputs, {'label_encode': 'LabelEncode', 'eval': 'PCA99_SVM'}, label_encoder)
pca99_html = ViewsTools.dataframe_renderer(pca99_report, title='Test - PCA 99% classification performance')
HTML(pca99_html)

In [None]:
# Build the report for the 'PCA995_SVM' evaluation, then render it as an HTML table.
pca995_report = Views.report(inputs, {'label_encode': 'LabelEncode', 'eval': 'PCA995_SVM'}, label_encoder)
pca995_html = ViewsTools.dataframe_renderer(pca995_report, title='Test - PCA 99.5% classification performance')
HTML(pca995_html)


In [None]:
# ROC curves for the 'PCA99_SVM' evaluation.
pca99_view = ViewsTools.data_as(inputs, 'PCA99_SVM')
pca99_roc_keys = {'label_encode': 'LabelEncode', 'eval': 'PCA99_SVM'}
Views.receiver_operator_curves(pca99_view, label_encoder, pca99_roc_keys, settings);

# DWT and Bags Evaluation

## Transform

In [None]:
# Wavelet transform of the preprocessed spectra into 'DWT'.
# Presumably a Daubechies-6 decomposition over segments of 80 samples - TODO confirm.
wavelet_transform = DWTTransform(mode='db6', segment=80)
Tools.transform(inputs, {'datum': 'Scale'}, wavelet_transform, 'DWT')

## Model

In [None]:
# KMeans (92 clusters, at most 50 iterations) feeding a linear SVM.
# random_state pins the centroid initialisation so the clustering-derived
# features, and therefore the reported scores, are the same on every
# Restart & Run All.
pipe_ahmed = Pipeline([('kmeans', KMeans(n_clusters=92, max_iter=50, random_state=0)),
                       ('clf', SVC(kernel='linear', class_weight='balanced', probability=True))])
# Five log-spaced candidates for the C of the 'clf' step: 0.01, 0.1, 1, 10, 100.
grid_ahmed = {'clf__C': np.geomspace(0.01, 100, 5).tolist()}

In [None]:
# Evaluate the KMeans + SVM pipeline on the 'DWT' features; the report and ROC
# cells read the outcome back through the 'DWT_SVM' key.
dwt_keys = {'datum': 'DWT', 'label_encode': 'LabelEncode'}
Tools.evaluate(inputs, dwt_keys, pipe_ahmed, 'DWT_SVM', grid=grid_ahmed)

In [None]:
# Build the report for the 'DWT_SVM' evaluation, then render it as an HTML table.
dwt_report = Views.report(inputs, {'label_encode': 'LabelEncode', 'eval': 'DWT_SVM'}, label_encoder)
dwt_html = ViewsTools.dataframe_renderer(dwt_report, title='Test - DWT+BOW classification performance')
HTML(dwt_html)

In [None]:
# ROC curves for the 'DWT_SVM' evaluation.
dwt_view = ViewsTools.data_as(inputs, 'DWT_SVM')
dwt_roc_keys = {'label_encode': 'LabelEncode', 'eval': 'DWT_SVM'}
Views.receiver_operator_curves(dwt_view, label_encoder, dwt_roc_keys, settings);

# Distribution Evaluation

## Analysis

In [None]:
# Histogram views of the raw spectra per label: default mode first, then mode='std'.
SignalsViews.histogram(inputs, dict(datum='Datum', label='Label'), settings);
SignalsViews.histogram(inputs, dict(datum='Datum', label='Label'), settings, mode='std');

## Transform

In [None]:
# Stack the preprocessed spectra into a single array and fit the transform on it.
scaled_spectra = np.array(inputs['Scale'].tolist())
fit = FittingTransform().fit(scaled_spectra)
# Apply the fitted transform to every spectrum; the result goes to 'Fit'.
Tools.transform(inputs, {'datum': 'Scale'}, fit, 'Fit')

## Evaluate

In [None]:
# Evaluate the shared SVM on the 'Fit' features; the report cells read the
# outcome back through the 'Fit_SVM' key.
fit_keys = {'datum': 'Fit', 'label_encode': 'LabelEncode'}
Tools.evaluate(inputs, fit_keys, model, 'Fit_SVM', grid=grid)

In [None]:
# Report for the 'Fit_SVM' evaluation, built from the default data_as view.
fit_test_view = ViewsTools.data_as(inputs, 'Fit_SVM')
Views.report(fit_test_view, {'label_encode': 'LabelEncode', 'eval': 'Fit_SVM'}, label_encoder)

In [None]:
# Report for the 'Fit_SVM' evaluation, built from the as_train=True view.
fit_train_view = ViewsTools.data_as(inputs, 'Fit_SVM', as_train=True)
Views.report(fit_train_view, {'label_encode': 'LabelEncode', 'eval': 'Fit_SVM'}, label_encoder)

In [None]:
import scipy.stats as st
# Density histogram of a single preprocessed spectrum (the one at label 7 of
# the 'Scale' column), with automatic binning.
sample_index = 7
hist, bins = np.histogram(inputs['Scale'][sample_index], bins='auto', density=True)
# Bars cover 70 % of a bin so neighbouring bars stay visually separated.
bin_step = bins[1] - bins[0]
width = 0.7 * bin_step
# Bar positions are the midpoints of consecutive bin edges.
center = (bins[:-1] + bins[1:]) / 2
plt.bar(center, hist, width=width, align='center')

In [None]:
# Preview only the first rows: displaying the whole frame (one full spectrum
# per cell) bloats the saved notebook without adding information.
inputs.head()