# Data mining assignment

In [40]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
import numpy as np
import collections
import random
# Visualization
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix

## Exceptions

In [2]:
class UnknownClassifier(Exception):
    """Error signalling that a requested classifier name is not recognised."""

    def __init__(self):
        # The message is fixed, so the constructor takes no arguments.
        Exception.__init__(self, 'Unknown classifier name')

## Dataset split

In [3]:
def split_dataset(N, ratio):
    """Randomly partition the indexes ``0 .. N-1`` into two disjoint lists.

    Parameters
    ----------
    N : int
        Number of samples; the indexes being split are ``range(N)``.
    ratio : float
        Fraction (between 0 and 1, inclusive) of the indexes that goes into
        the second returned list. Its size is ``int(N * ratio)``, i.e. the
        product rounded down.

    Returns
    -------
    tuple of (list of int, list of int)
        ``(remaining, selected)``: ``selected`` holds ``int(N * ratio)``
        randomly chosen indexes and ``remaining`` holds all the others.
        Together they contain every index exactly once.

    Raises
    ------
    ValueError
        If ``ratio`` is outside ``[0, 1]``. Such a value used to be accepted
        silently and produced meaningless splits (a negative ratio, for
        instance, made the slices count from the end of the list).
    """
    if not 0 <= ratio <= 1:
        raise ValueError('ratio must be between 0 and 1, got {!r}'.format(ratio))

    indexes = list(range(N))
    # Standard-library shuffle instead of the previous hand-rolled "3N random
    # swaps", which did not sample permutations uniformly.
    random.shuffle(indexes)

    limit = int(N * ratio)
    return indexes[limit:], indexes[:limit]

## Classifiers creation

The classifiers that we will consider are:
- Decision Tree ([link](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html))
- SVC ([link](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC))
- Gaussian process classifier ([link](https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessClassifier.html))
- MLP ([link](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html))

These classifiers have been taken from this [list](https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html).

In [23]:
def create_dt(parameters):
    """Build a ``DecisionTreeClassifier`` from a (possibly partial) mapping.

    Parameters
    ----------
    parameters : mapping
        Hyper-parameter overrides keyed by argument name. Any key listed in
        ``defaults`` below replaces the fallback value; keys that are not
        listed there are ignored.

    Returns
    -------
    DecisionTreeClassifier
        An unfitted estimator configured with the merged settings.
    """
    # Fallback value for every constructor argument this factory forwards.
    defaults = {
        'criterion': 'gini',
        'splitter': 'best',
        'max_depth': None,
        'min_samples_split': 2,
        'min_samples_leaf': 1,
        'min_weight_fraction_leaf': 0.0,
        'max_features': None,
        'random_state': None,
        'max_leaf_nodes': None,
        'min_impurity_decrease': 0.0,
        'class_weight': None,
        'ccp_alpha': 0.0,
    }
    settings = {
        key: parameters[key] if key in parameters else fallback
        for key, fallback in defaults.items()
    }
    return DecisionTreeClassifier(**settings)

In [37]:
def create_svc(parameters):
    """Build an ``SVC`` from a (possibly partial) mapping of hyper-parameters.

    Parameters
    ----------
    parameters : mapping
        Hyper-parameter overrides keyed by argument name. Any key listed in
        ``defaults`` below replaces the fallback value; keys that are not
        listed there are ignored.

    Returns
    -------
    SVC
        An unfitted estimator configured with the merged settings.
    """
    # Fallback value for every constructor argument this factory forwards.
    defaults = {
        'C': 1.0,
        'kernel': 'rbf',
        'degree': 3,
        'gamma': 'scale',
        'coef0': 0.0,
        'shrinking': True,
        'probability': False,
        'tol': 1e-3,
        'cache_size': 200,
        'class_weight': None,
        'verbose': False,
        'max_iter': -1,
        'decision_function_shape': 'ovr',
        'break_ties': False,
        'random_state': None,
    }
    settings = {
        key: parameters[key] if key in parameters else fallback
        for key, fallback in defaults.items()
    }
    return SVC(**settings)

In [29]:
def create_gp(parameters):
    """Build a ``GaussianProcessClassifier`` from a (possibly partial) mapping.

    Parameters
    ----------
    parameters : mapping
        Hyper-parameter overrides keyed by argument name. Any key listed in
        ``defaults`` below replaces the fallback value; keys that are not
        listed there are ignored.

    Returns
    -------
    GaussianProcessClassifier
        An unfitted estimator configured with the merged settings.
    """
    # Fallback value for every constructor argument this factory forwards.
    defaults = {
        'kernel': None,
        'optimizer': 'fmin_l_bfgs_b',
        'n_restarts_optimizer': 0,
        'max_iter_predict': 100,
        'warm_start': False,
        'copy_X_train': True,
        'random_state': None,
        'multi_class': 'one_vs_rest',
        'n_jobs': None,
    }
    settings = {
        key: parameters[key] if key in parameters else fallback
        for key, fallback in defaults.items()
    }
    return GaussianProcessClassifier(**settings)

In [44]:
def create_mlp(parameters):
    """Build an ``MLPClassifier`` from a (possibly partial) mapping.

    Parameters
    ----------
    parameters : mapping
        Hyper-parameter overrides keyed by argument name. Any key listed in
        ``defaults`` below replaces the fallback value; keys that are not
        listed there are ignored.

    Returns
    -------
    MLPClassifier
        An unfitted estimator configured with the merged settings.
    """
    # Fallback value for every constructor argument this factory forwards.
    defaults = {
        'hidden_layer_sizes': (100,),
        'activation': 'relu',
        'solver': 'adam',
        'alpha': 0.0001,
        'batch_size': 'auto',
        'learning_rate': 'constant',
        'learning_rate_init': 0.001,
        'power_t': 0.5,
        'max_iter': 200,
        'shuffle': True,
        'random_state': None,
        'tol': 1e-4,
        'verbose': False,
        'warm_start': False,
        'momentum': 0.9,
        'nesterovs_momentum': True,
        'early_stopping': False,
        'validation_fraction': 0.1,
        'beta_1': 0.9,
        'beta_2': 0.999,
        'epsilon': 1e-8,
        'n_iter_no_change': 10,
        'max_fun': 15000,
    }
    settings = {
        key: parameters[key] if key in parameters else fallback
        for key, fallback in defaults.items()
    }
    return MLPClassifier(**settings)

In [45]:
def create_classifier(name, parameters):
    """Instantiate the classifier identified by ``name``.

    Parameters
    ----------
    name : str
        One of ``'DecisionTree'``, ``'SVC'``, ``'GaussianProcess'`` or
        ``'MLP'``.
    parameters : mapping
        Hyper-parameter overrides, passed unchanged to the matching factory.

    Returns
    -------
    object
        Whatever the selected factory returns.

    Raises
    ------
    UnknownClassifier
        If ``name`` matches none of the supported names.
    """
    factories = (
        ('DecisionTree', create_dt),
        ('SVC', create_svc),
        ('GaussianProcess', create_gp),
        ('MLP', create_mlp),
    )
    # Linear scan with ``==`` keeps the comparison semantics of an if/elif
    # chain (no hashing of ``name`` is required).
    for known_name, factory in factories:
        if name == known_name:
            return factory(parameters)
    raise UnknownClassifier()

## 1 Fold and Confusion matrixes

In [None]:
def create_cm_1_fold(classifier, dataset, features, labels):
    """Run the leave-one-out training loop meant to feed a confusion matrix.

    For every position ``i`` in ``range(len(dataset))`` the classifier is
    fitted on all samples except the ``i``-th one.

    Parameters
    ----------
    classifier : object
        Estimator exposing ``fit(X, y)``.
    dataset : sized collection
        Only its length is used, to decide how many folds are run.
    features : array-like
        Sample features, converted with ``np.asarray`` and indexed by
        position along the first axis.
        NOTE(review): assumes one row per element of ``dataset``, in the same
        order — confirm against the caller.
    labels : array-like
        Target values, indexed the same way as ``features``.

    Returns
    -------
    None
        NOTE(review): the function is unfinished. The held-out sample is never
        predicted, the four counters below are never updated and nothing is
        returned. TODO: complete the evaluation step.
    """
    # Counters for the confusion matrix; not updated yet (see the note above).
    true_positive, true_negative, false_positive, false_negative = 0, 0, 0, 0

    # The loop used to reference an undefined global ``N``; the number of
    # folds is the number of samples in ``dataset``.
    n_samples = len(dataset)
    all_indexes = np.arange(n_samples)
    sample_features = np.asarray(features)
    sample_labels = np.asarray(labels)

    for i in range(n_samples):
        # Every position except the held-out one.
        indexes = np.delete(all_indexes, i)
        # Train on the selected rows together with their targets. The previous
        # call passed the whole ``dataset`` and no targets, leaving ``indexes``
        # unused.
        model = classifier.fit(sample_features[indexes], sample_labels[indexes])

## Main functions

In [50]:
def train(
        dataset,
        class_attribute,
        test_set_ratio,
        positive_values,
        target_metrics,
        select_threshold,
        max_number,
        classifier_list
    ):
    """Create the requested classifiers and collect one result per classifier.

    Parameters
    ----------
    dataset : sized collection
        Only ``len(dataset)`` is used so far, to split sample indexes.
    class_attribute, positive_values, target_metrics, select_threshold, max_number
        Accepted but not used yet by this function.
    test_set_ratio : float
        Forwarded to ``split_dataset`` as its ``ratio`` argument.
    classifier_list : iterable
        Each item is indexed as ``item[0]`` (classifier name) and ``item[1]``
        (hyper-parameter mapping) and handed to ``create_classifier``.

    Returns
    -------
    list
        The values returned by ``create_cm_1_fold``, one per classifier, in
        the order of ``classifier_list``.

    NOTE(review): ``create_cm_1_fold`` is declared in this notebook with four
    required parameters (classifier, dataset, features, labels), so the
    two-argument call below looks like it raises ``TypeError`` until the
    FEATURES/LABELS extraction is implemented — confirm.
    """
    ds_train, ds_test = split_dataset(len(dataset), test_set_ratio)
    # TODO: extract col_names from the dataset and build FEATURES and LABELS.

    # All classifiers are created first, then evaluated in the same order.
    classifiers = [
        create_classifier(classifier_spec[0], classifier_spec[1])
        for classifier_spec in classifier_list
    ]
    return [create_cm_1_fold(classifier, ds_train) for classifier in classifiers]

In [47]:
def predict(instance, classifiers):
    """Placeholder for the prediction step; not implemented yet.

    Both ``instance`` and ``classifiers`` are currently ignored and the
    function always returns ``None``.
    """
    return None

In [48]:
# Trial run of train(): the dataset is a stand-in range of 100 integers and
# every option that train() does not use yet is passed as None.
classifiers = train(
    dataset=range(0, 100),
    class_attribute=None,
    test_set_ratio=0.3,
    positive_values=None,
    target_metrics=None,
    select_threshold=None,
    max_number=None,
    # Each entry is [classifier name, hyper-parameter overrides].
    classifier_list=[
        ['DecisionTree', {'max_depth': 5, 'criterion': 'entropy'}],
        ['DecisionTree', {'min_samples_split': 25, 'max_features': 10, 'criterion': 'gini'}],
        ['SVC', {'kernel': 'rbf', 'degree': 9}],
        ['GaussianProcess', {}],
        ['MLP', {}]
    ],
)

In [49]:
# Display the value that train() returned in the previous cell.
# NOTE(review): the saved output below lists estimator objects, whereas train()
# as currently written returns its `confusion_matrixes` list, so the output
# looks stale. Re-run the notebook from a fresh kernel to confirm.
classifiers

[DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                        max_depth=5, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=None, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=None, max_features=10, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=25,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=None, splitter='best'),
 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=9, gamma='scal