In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Eventually, for Anaconda warnings.
# Can be commented out.
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [30]:
# Load basic libraries
import seaborn
seaborn.set()
from itertools import chain
from collections import OrderedDict
import pickle, copy, random
import numpy as np
import scipy.stats

# Seed both generators. scikit-learn uses NumPy's global RNG whenever a
# splitter or estimator is created without random_state, so seeding only the
# stdlib `random` module does not make the results below repeatable.
random.seed(100)
np.random.seed(100)

In [3]:
# Load the pickled reference dictionary. The context manager closes the file
# handle even if unpickling fails.
# NOTE(security): pickle.load can execute arbitrary code -- only open a
# data.p that comes from a trusted source.
with open("data.p", "rb") as data_file:
    data = pickle.load(data_file)

In [4]:
# Sanity check: total number of records, then the number labelled y == 0
# and the number labelled y == 1.
print(len(data))
for label in (0, 1):
    print(sum(1 for record in data.values() if record["y"] == label))

67954
46961
20993


In [5]:
# Look up a single record by its key to inspect which fields a reference carries.
data['58e4ce01fe7683152b59e56e']

{'asve': 'IT:ASVe:0645',
 'components': [(1, 'A.S.V.,', 'archive'),
  (2, 'Provveditori sopra Feudi,', 'archivalreference'),
  (3, 'busta 203.', 'box')],
 'disamb_id': '5917363bb9123b1cd37b1e11',
 'ref_type': 'primary',
 'surface': 'A.S.V., Provveditori sopra Feudi, busta 203.',
 'y': 1}

In [6]:
from supporting_functions import cleanup

# Keep only the positive records (y == 1): the cleaned surface string becomes
# the document, the 'asve' identifier becomes its target. Both lists walk
# data.values() with the same filter, so they stay index-aligned.
data_docs = [
    cleanup(record["surface"])
    for record in data.values()
    if record["y"] == 1
]
data_targets = [
    record["asve"]
    for record in data.values()
    if record["y"] == 1
]

In [7]:
# Spot-check one cleaned surface string (arbitrary index 11).
data_docs[11]

'schulemburg b. febbraio'

In [8]:
# Target identifier stored at the same index as data_docs[11].
data_targets[11]

'IT:ASVe:0275'

In [28]:
# cleanup targets
# Count how often each target identifier occurs. Targets that turn out to be
# too rare are later discarded or, where possible, consolidated into a
# higher-level identifier.
# Counter makes a single pass over the list; the previous comprehension
# re-scanned the whole list once per element (quadratic).
from collections import Counter
freq_dict = dict(Counter(data_targets))

In [33]:
# Re-order the counts from the least to the most frequent target.
freq_dict = OrderedDict(
    sorted(freq_dict.items(), key=lambda pair: pair[1])
)

In [34]:
# Minimum number of occurrences a target needs to stay a class of its own.
threshold = 5
# Rare top-level identifiers (no "." in them) that are dropped altogether.
discard_list = []
# Maps every identifier to the label used for training; identity by default.
consolidate_dict = {target: target for target in freq_dict}

for target, count in freq_dict.items():
    if count >= threshold:
        continue  # frequent enough, keep as is
    parts = target.split(".")
    if len(parts) == 1:
        # No parent level to fall back on.
        discard_list.append(target)
        continue
    # Walk up the hierarchy, nearest ancestor first, and adopt the first
    # ancestor that is itself frequent enough.
    for depth in range(len(parts) - 1, 0, -1):
        ancestor = ".".join(parts[:depth])
        if freq_dict.get(ancestor, 0) >= threshold:
            consolidate_dict[target] = ancestor
            break
    # NOTE(review): a rare dotted identifier with no sufficiently frequent
    # ancestor is neither discarded nor consolidated -- it stays mapped to
    # itself and remains a rare class. Confirm whether that is intended.

In [41]:
# consolidate
# Rebuild documents and targets from the positive records, dropping those
# whose target was discarded and mapping the rest through consolidate_dict.
# A set gives constant-time membership tests instead of scanning the list for
# every record; filtering once keeps documents and targets aligned.
discarded_targets = set(discard_list)
kept_records = [x for x in data.values()
                if x["y"] == 1 and x["asve"] not in discarded_targets]
data_docs = [cleanup(x["surface"]) for x in kept_records]
data_targets = [consolidate_dict[x["asve"]] for x in kept_records]

In [44]:
# train test
from sklearn.model_selection import train_test_split
# TRAIN/TEST
# Hold out 25% of the data for evaluation. random_state pins the shuffle so
# the split -- and every score computed on it -- is the same on each re-run.
X_train, X_test, y_train, y_test = train_test_split(
    data_docs, data_targets, test_size=0.25, random_state=100)

In [16]:
# baseline model
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Bag-of-words counts -> tf-idf weighting -> classifier.
# Alternatives previously tried in the 'clf' slot (disabled):
#   MultinomialNB()
#   SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(n_estimators=100,
                                   criterion='gini',
                                   max_depth=None,
                                   n_jobs=1,
                                   random_state=100)),
])

In [11]:
# Fit the current text_clf pipeline on the training split.
text_clf = text_clf.fit(X_train, y_train)

In [12]:
# Predicted labels and per-class probabilities for the held-out split.
base_predicted = text_clf.predict(X_test)
probs = text_clf.predict_proba(X_test)
# Accuracy: share of held-out documents whose predicted label matches.
(base_predicted == np.asarray(y_test)).mean()

0.89655172413793105

In [21]:
# Grid search
from sklearn.model_selection import GridSearchCV

# Pipeline and hyper-parameter grid for the linear SVM.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LinearSVC())])
parameters = {'vect__ngram_range': [(1, 1), (1, 3), (1, 5)],
              'tfidf__use_idf': (True, False),
              'clf__dual': (True, False),
              'clf__C': (1, 0.1, 0.01, 0.001)}

# Alternative search over a random forest (disabled). Kept as real comments
# instead of a bare string literal, and with 'clf__criterion' written as a
# one-element tuple: ('gini') is just the string 'gini', not a sequence of
# candidate values.
# text_clf = Pipeline([('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
#                      ('clf', RandomForestClassifier())])
# parameters = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
#               'tfidf__use_idf': (True, False),
#               'clf__n_estimators': (10, 50, 100, 500),
#               'clf__criterion': ('gini',),
#               'clf__max_depth': (None, 5, 10, 50)}

# Exhaustive cross-validated search on the training split, two workers.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=2)
gs_clf = gs_clf.fit(X_train, y_train)

In [22]:
# Best score found by the search, followed by the winning value of every
# searched parameter in alphabetical order.
print(gs_clf.best_score_)
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

0.913490853659
clf__C: 1
clf__dual: True
tfidf__use_idf: True
vect__ngram_range: (1, 3)


In [45]:
# final baseline model, using the settings the grid search reported as best
# (1- to 3-grams, idf weighting on, LinearSVC with C=1 and dual=True).
# Disabled alternative for the 'clf' step: MultinomialNB(alpha=0.01)
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', LinearSVC(C=1, dual=True)),
])

In [46]:
# Fit the current text_clf pipeline on the training split.
text_clf = text_clf.fit(X_train, y_train)

In [47]:
# Predict the held-out split and report accuracy (share of exact label matches).
base_predicted = text_clf.predict(X_test)
(base_predicted == np.asarray(y_test)).mean()

0.93145937620331154

In [48]:
from sklearn import metrics

# Per-class precision, recall, f1-score and support on the held-out split.
print(metrics.classification_report(y_true=y_test, y_pred=base_predicted))

                              precision    recall  f1-score   support

                IT:ASVe:0005       1.00      0.80      0.89         5
                IT:ASVe:0010       0.94      0.94      0.94        34
            IT:ASVe:0010.001       0.95      0.89      0.92        45
                IT:ASVe:0015       0.60      0.38      0.46         8
            IT:ASVe:0015.001       0.76      1.00      0.87        13
        IT:ASVe:0015.001.001       0.83      0.62      0.71         8
        IT:ASVe:0015.001.002       0.80      1.00      0.89         4
        IT:ASVe:0015.001.003       1.00      1.00      1.00         3
            IT:ASVe:0015.002       0.96      0.88      0.92        26
            IT:ASVe:0015.003       0.70      1.00      0.82         7
            IT:ASVe:0015.004       1.00      1.00      1.00        13
        IT:ASVe:0015.004.001       1.00      0.50      0.67         4
            IT:ASVe:0020.001       1.00      1.00      1.00         1
                IT:

In [49]:
# train final
# Same configuration as the evaluated baseline, but fitted on every available
# document (no held-out split) before the model is persisted.
clf_final = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', LinearSVC(C=1, dual=True)),
]).fit(data_docs, data_targets)

In [50]:
# persist model
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone joblib package and fall back to the vendored
# copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# Writes the fitted pipeline to disk; the 'models' directory must already exist.
joblib.dump(clf_final, 'models/asve_ids.pkl')

['models/asve_ids.pkl']

In [53]:
# Smoke test on an arbitrary string; the model returns a target identifier
# for it, so a prediction alone does not mean the input is a real reference.
clf_final.predict(["ciao mamma"])[0]

'IT:ASVe:0040.005.020'