In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Eventually, for Anaconda warnings.
# Can be commented out.
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Load basic libraries
import seaborn; seaborn.set()
from itertools import chain
import pickle, copy, random
random.seed(100)
import numpy as np
import scipy.stats

In [3]:
# load data and dump it
from loader import load_is_asve_dataset
data = load_is_asve_dataset()

pickle.dump(data,open("data.p","wb"))

In [3]:
data = pickle.load(open("data.p","rb"))

In [4]:
# check data
print(len(data))
print(len([x for x,y in data.items() if y["y"]==0]))
print(len([x for x,y in data.items() if y["y"]==1]))

67954
46961
20993


In [5]:
data['58e4ce01fe7683152b59e56e']

{'asve': 'IT:ASVe:0645',
 'components': [(1, 'A.S.V.,', 'archive'),
  (2, 'Provveditori sopra Feudi,', 'archivalreference'),
  (3, 'busta 203.', 'box')],
 'disamb_id': '5917363bb9123b1cd37b1e11',
 'ref_type': 'primary',
 'surface': 'A.S.V., Provveditori sopra Feudi, busta 203.',
 'y': 1}

In [6]:
# sample out a sample of negative references
final_data = {x:y for x,y in data.items() if y["y"]==1}
negatives = [x for x,y in data.items() if y["y"]==0]
negatives = random.sample(negatives, len(final_data))
final_data.update({x:y for x,y in data.items() if x in negatives})

In [7]:
from supporting_functions import cleanup
data_docs = [cleanup(x["surface"]) for x in final_data.values()]
data_targets = [x["y"] for x in final_data.values()]

In [8]:
data_docs[11]

'prow art. b.'

In [9]:
data_targets[11]

1

In [10]:
# train test
from sklearn.model_selection import train_test_split
# TRAIN/TEST
X_train, X_test, y_train, y_test = train_test_split(data_docs, data_targets, test_size=0.25)

In [11]:
# baseline model
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('clf', MultinomialNB())])

In [12]:
text_clf = text_clf.fit(X_train, y_train)

In [13]:
base_predicted = text_clf.predict(X_test)
probs = text_clf.predict_proba(X_test)
np.mean(base_predicted == y_test)

0.89987615509193097

In [19]:
text_clf.predict(['ASVe, Savi all Sanit'])[0]

1

In [16]:
# only keep high confidence predictions, to maximize precision
threshold = 0.9
predicted = list()
for p,prob in zip(base_predicted,probs):
    print(prob)
    print(p)
    if prob[1] < threshold:
        predicted.append(0)
    else:
        predicted.append(1)

[ 0.54602565  0.45397435]
0
[ 0.8323675  0.1676325]
0
[ 0.01833097  0.98166903]
1
[ 0.01959221  0.98040779]
1
[ 0.88422422  0.11577578]
0
[ 0.0021587  0.9978413]
1
[ 0.01679972  0.98320028]
1
[ 0.02811094  0.97188906]
1
[ 0.95190025  0.04809975]
0
[ 0.52394147  0.47605853]
0
[ 0.99868045  0.00131955]
0
[ 0.01491293  0.98508707]
1
[ 0.50154022  0.49845978]
0
[ 0.56250037  0.43749963]
0
[ 0.27277984  0.72722016]
1
[  8.77764307e-04   9.99122236e-01]
1
[ 0.82483733  0.17516267]
0
[ 0.86149354  0.13850646]
0
[ 0.0317808  0.9682192]
1
[ 0.06065165  0.93934835]
1
[ 0.08339564  0.91660436]
1
[ 0.12860685  0.87139315]
1
[ 0.89518164  0.10481836]
0
[ 0.10936198  0.89063802]
1
[ 0.04966057  0.95033943]
1
[ 0.00203255  0.99796745]
1
[ 0.85434435  0.14565565]
0
[ 0.002426  0.997574]
1
[ 0.6850816  0.3149184]
0
[ 0.11984123  0.88015877]
1
[ 0.10518834  0.89481166]
1
[ 0.35398403  0.64601597]
1
[ 0.15337231  0.84662769]
1
[ 0.81127284  0.18872716]
0
[ 0.91257384  0.08742616]
0
[ 0.81916006  0.180839

In [15]:
from sklearn import metrics
print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

          0       0.76      0.98      0.86      5295
          1       0.97      0.69      0.81      5202

avg / total       0.87      0.84      0.83     10497



In [16]:
# Grid search
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
                'tfidf__use_idf': (True, False),
                'clf__alpha': (1, 0.5, 1e-1, 1e-2)}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-3)
gs_clf = gs_clf.fit(X_train,y_train)

In [17]:
print(gs_clf.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

0.885325034139
clf__alpha: 0.1
tfidf__use_idf: True
vect__ngram_range: (1, 5)


In [18]:
# final baseline model
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,5))),
                        ('tfidf', TfidfTransformer(use_idf=True)),
                        ('clf', MultinomialNB(alpha=0.1))])


In [19]:
text_clf = text_clf.fit(X_train, y_train)


In [20]:
base_predicted = text_clf.predict(X_test)
probs = text_clf.predict_proba(X_test)
np.mean(base_predicted == y_test)

0.8903496237020101

In [25]:
# only keep high confidence predictions, to maximize precision
threshold = 0.95
predicted = list()
for p,prob in zip(base_predicted,probs):
    if prob[1] < threshold:
        predicted.append(0)
    else:
        predicted.append(1)

In [26]:
from sklearn import metrics
print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

          0       0.88      0.97      0.92      5295
          1       0.97      0.87      0.92      5202

avg / total       0.92      0.92      0.92     10497



In [27]:
# train final
clf_final = Pipeline([('vect', CountVectorizer(ngram_range=(1,5))),
                        ('tfidf', TfidfTransformer(use_idf=True)),
                        ('clf', MultinomialNB(alpha=0.1))])
clf_final = clf_final.fit(data_docs, data_targets)

In [28]:
# persist model
from sklearn.externals import joblib
joblib.dump(clf_final, 'models/is_asve.pkl')

['models/is_asve.pkl']

In [111]:
from sklearn import metrics
print(metrics.classification_report(y_test, base_predicted))

                              precision    recall  f1-score   support

                IT:ASVe:0005       0.00      0.00      0.00         6
            IT:ASVe:0005.001       0.00      0.00      0.00         1
                IT:ASVe:0010       1.00      0.21      0.34        39
            IT:ASVe:0010.001       0.70      0.85      0.76        71
            IT:ASVe:0010.011       0.00      0.00      0.00         1
                IT:ASVe:0015       0.00      0.00      0.00         8
            IT:ASVe:0015.001       0.18      0.50      0.27         4
        IT:ASVe:0015.001.001       0.00      0.00      0.00         6
        IT:ASVe:0015.001.002       0.00      0.00      0.00         6
        IT:ASVe:0015.001.003       0.00      0.00      0.00         2
            IT:ASVe:0015.002       1.00      0.38      0.55        21
        IT:ASVe:0015.002.002       0.00      0.00      0.00         1
    IT:ASVe:0015.002.005.001       0.00      0.00      0.00         1
        IT:ASVe:001