In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
def log_progress(sequence, every=None, size=None, name='Items'):
    """Iterate over ``sequence`` while showing a progress widget in the notebook.

    A text label and an ``IntProgress`` bar are displayed inside a ``VBox``;
    the items themselves are yielded unchanged. This is a generator, so
    nothing (including the widget) is created until iteration starts.

    Args:
        sequence: Iterable to consume. If it has no ``len()`` and ``size`` is
            not given, the total is unknown and the label shows ``?``.
        every: Refresh the widget every ``every`` items (the first item always
            refreshes it). Defaults to 1 for up to 200 items, otherwise to
            ``size / 200`` (about every 0.5%). Must be set when the total is
            unknown. Values below 1 are treated as 1.
        size: Total number of items; when given it is used instead of
            ``len(sequence)``.
        name: Prefix used in the label text.

    Yields:
        Each item of ``sequence``, in order.

    Raises:
        AssertionError: On first iteration, if the total is unknown and
            ``every`` was not provided.
    """
    # Imported lazily so the module can be loaded without the widget stack.
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            # No len(): treat as an iterator of unknown length.
            is_iterator = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        every = 1 if size <= 200 else int(size / 200)     # every 0.5%

    # `every` is used as a modulus below; 0 would raise ZeroDivisionError on
    # the second item, so clamp non-positive/fractional steps to "every item".
    if every < 1:
        every = 1

    if is_iterator:
        # Unknown total: show a full bar in the 'info' style instead of a ratio.
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except BaseException:
        # Anything raised while iterating -- including an exception thrown in
        # at the ``yield`` (e.g. when the generator is closed early) -- turns
        # the bar red and is then re-raised unchanged.
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        # An empty sequence leaves index at 0, which is shown as '?'.
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

# Evaluación
Esta función se usará para calcular el F1 (macro) y la accuracy de un modelo.

In [3]:
def eval(model, X, y_true):
    """Score ``model`` on ``X`` against the reference labels ``y_true``.

    Predictions come from ``model.predict(X)``. Scoring uses the module-level
    ``metrics`` name, which must already be bound when this is called.

    Note: this function shadows the builtin ``eval`` in the notebook namespace.

    Returns:
        dict: ``{'acc': <accuracy_score>, 'f1': <f1_score with average='macro'>}``.
    """
    predictions = model.predict(X)
    return {
        'acc': metrics.accuracy_score(y_true, predictions),
        'f1': metrics.f1_score(y_true, predictions, average='macro'),
    }

In [28]:
from sklearn.model_selection import ParameterGrid
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics

from tass import InterTASSReader
from classifier import SentimentClassifier


import numpy as np

def grid_search(classifier,
                train_path="InterTASS/ES/intertass-ES-train-tagged.xml",
                dev_path="InterTASS/ES/intertass-ES-development-tagged.xml",
                param_grid=None):
    """Fit one pipeline per hyper-parameter combination and score each one.

    For every combination produced by ``ParameterGrid(param_grid)`` the
    pipeline of a ``SentimentClassifier`` is re-configured, fitted on the
    training data and scored on the development data with ``eval``.

    Args:
        classifier: Value passed as ``clf`` to ``SentimentClassifier``
            (this notebook calls it with ``'svm'`` and ``'maxent'``).
        train_path: File handed to ``InterTASSReader`` for the training data.
        dev_path: File handed to ``InterTASSReader`` for the development data.
        param_grid: Grid accepted by ``ParameterGrid``. When ``None``, the
            default C / penalty / dual grid defined below is used.

    Returns:
        list[dict]: One dict per combination, holding the scores returned by
        ``eval`` merged with the parameters of that combination.
    """
    if param_grid is None:
        # NOTE: 2**0 and 1 are the same value, so C=1 is evaluated twice per
        # penalty/dual setting. Kept as-is so results match earlier runs.
        c_values = [2**-4, 2**-3, 2**-2, 2**-1, 2**0, 2**1, 2**2, 2**3, 2**4,
                    0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
        param_grid = [{
            'clf__C': c_values,
            'clf__penalty': ['l2'],
            'clf__dual': [True, False],
        }, {
            'clf__C': c_values,
            'clf__penalty': ['l1'],
            'clf__dual': [False],
        }]

    # The grid keys use the 'clf__' prefix, so parameters are set on the
    # classifier's underlying pipeline object rather than on the wrapper.
    pipeline = SentimentClassifier(clf=classifier)._pipeline

    dev_reader = InterTASSReader(dev_path)
    X_dev = list(dev_reader.X())
    y_dev = list(dev_reader.y())

    train_reader = InterTASSReader(train_path)
    X_train = list(train_reader.X())
    y_train = list(train_reader.y())

    results = []
    for params in log_progress(list(ParameterGrid(param_grid)), every=1):
        pipeline.set_params(**params)
        pipeline.fit(X_train, y_train)
        scores = eval(pipeline, X_dev, y_dev)
        results.append({**scores, **params})
    return results


In [29]:
import pandas as pd

# Run the grid search for each classifier and wrap its results in a DataFrame.
results_svm = grid_search('svm')
results_svm_df = pd.DataFrame(results_svm)

results_maxent = grid_search('maxent')
results_maxent_df = pd.DataFrame(results_maxent)

VBox(children=(HTML(value=''), IntProgress(value=0, max=51)))

  'precision', 'predicted', average, warn_for)


VBox(children=(HTML(value=''), IntProgress(value=0, max=51)))



# Resultados para LinearSVC ordenados por f1

In [30]:
# First ten 'svm' rows after sorting by the f1 column, highest first.
display(results_svm_df.sort_values(by='f1', ascending=False).head(10))

Unnamed: 0,acc,clf__C,clf__dual,clf__penalty,f1
11,0.539526,2.0,False,l2,0.423606
10,0.539526,2.0,True,l2,0.423606
6,0.549407,0.5,True,l2,0.423329
7,0.549407,0.5,False,l2,0.423329
13,0.539526,4.0,False,l2,0.423267
8,0.541502,1.0,True,l2,0.421447
27,0.541502,1.0,False,l2,0.421447
9,0.541502,1.0,False,l2,0.421447
26,0.541502,1.0,True,l2,0.421447
12,0.535573,4.0,True,l2,0.420413


# Resultados para LogisticRegression ordenados por f1

In [31]:
# First ten 'maxent' rows after sorting by the f1 column, highest first.
display(results_maxent_df.sort_values(by='f1', ascending=False).head(10))

Unnamed: 0,acc,clf__C,clf__dual,clf__penalty,f1
33,0.543478,1000.0,False,l2,0.416334
32,0.541502,1000.0,True,l2,0.415212
31,0.547431,100.0,False,l2,0.414783
30,0.547431,100.0,True,l2,0.414783
16,0.547431,16.0,True,l2,0.410773
17,0.547431,16.0,False,l2,0.410773
28,0.547431,10.0,True,l2,0.407613
29,0.547431,10.0,False,l2,0.407613
15,0.541502,8.0,False,l2,0.399743
14,0.541502,8.0,True,l2,0.399743


# Resultados para LinearSVC ordenados por acc

In [32]:
# First ten 'svm' rows after sorting by the acc column, highest first.
display(results_svm_df.sort_values(by='acc', ascending=False).head(10))

Unnamed: 0,acc,clf__C,clf__dual,clf__penalty,f1
22,0.567194,0.01,True,l2,0.348826
23,0.567194,0.01,False,l2,0.348826
2,0.549407,0.125,True,l2,0.391638
3,0.549407,0.125,False,l2,0.391638
6,0.549407,0.5,True,l2,0.423329
7,0.549407,0.5,False,l2,0.423329
25,0.545455,0.1,False,l2,0.378712
35,0.545455,0.125,False,l1,0.351216
24,0.545455,0.1,True,l2,0.378712
46,0.541502,0.1,False,l1,0.340596


# Resultados para LogisticRegression ordenados por acc

In [33]:
# First ten 'maxent' rows after sorting by the acc column, highest first.
display(results_maxent_df.sort_values(by='acc', ascending=False).head(10))

Unnamed: 0,acc,clf__C,clf__dual,clf__penalty,f1
25,0.563241,0.1,False,l2,0.340767
2,0.563241,0.125,True,l2,0.352221
3,0.563241,0.125,False,l2,0.352221
24,0.563241,0.1,True,l2,0.340767
4,0.557312,0.25,True,l2,0.354104
5,0.557312,0.25,False,l2,0.354104
16,0.547431,16.0,True,l2,0.410773
31,0.547431,100.0,False,l2,0.414783
30,0.547431,100.0,True,l2,0.414783
29,0.547431,10.0,False,l2,0.407613


# Resultados para LinearSVC ordenados por acc, f1

In [34]:
# First ten 'svm' rows after sorting by acc, then f1 (both descending).
display(results_svm_df.sort_values(by=['acc', 'f1'], ascending=False).head(10))

Unnamed: 0,acc,clf__C,clf__dual,clf__penalty,f1
22,0.567194,0.01,True,l2,0.348826
23,0.567194,0.01,False,l2,0.348826
6,0.549407,0.5,True,l2,0.423329
7,0.549407,0.5,False,l2,0.423329
2,0.549407,0.125,True,l2,0.391638
3,0.549407,0.125,False,l2,0.391638
24,0.545455,0.1,True,l2,0.378712
25,0.545455,0.1,False,l2,0.378712
35,0.545455,0.125,False,l1,0.351216
8,0.541502,1.0,True,l2,0.421447


# Resultados para LogisticRegression ordenados por acc, f1

In [35]:
# First ten 'maxent' rows after sorting by acc, then f1 (both descending).
display(results_maxent_df.sort_values(by=['acc', 'f1'], ascending=False).head(10))

Unnamed: 0,acc,clf__C,clf__dual,clf__penalty,f1
2,0.563241,0.125,True,l2,0.352221
3,0.563241,0.125,False,l2,0.352221
24,0.563241,0.1,True,l2,0.340767
25,0.563241,0.1,False,l2,0.340767
4,0.557312,0.25,True,l2,0.354104
5,0.557312,0.25,False,l2,0.354104
30,0.547431,100.0,True,l2,0.414783
31,0.547431,100.0,False,l2,0.414783
16,0.547431,16.0,True,l2,0.410773
17,0.547431,16.0,False,l2,0.410773
