# Setup
---

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to the project's helper
# modules are picked up without restarting the kernel.
# NOTE(review): mode 2 also tries to reload third-party packages (see the
# "autoreload of torch failed" traceback further down in this notebook).
%load_ext autoreload
%autoreload 2

In [13]:
import warnings
import pandas as pd
import matplotlib.pyplot as plt

from preprocess import preprocess_all
from util import read_bills, read_stopwords, split_ttf
from variants import variant_i, variant_ii, variant_iii, variant_iv

from fasttext import prepare_files

In [5]:
warnings.filterwarnings('ignore')

In [6]:
# Read the raw bills from the lab1 data directory. The path is relative, so
# it resolves against the kernel's working directory.
bills = read_bills('../lab1/data')

In [7]:
# Turn the raw bills into the working DataFrame used by every later section.
# Files whose header cannot be resolved are reported in the cell output.
# NOTE(review): confirm whether such files are dropped or kept in `df`.
df = preprocess_all(bills)

Could not resolve header for 1996_400.txt


In [8]:
# Class-balance check on the target label; kept as a bare last expression so
# the summary is rendered as the cell's output.
df['is_amendment'].describe()

count     1178
unique       2
top       True
freq       610
Name: is_amendment, dtype: object

In [9]:
# Split the full frame into three parts, unpacked in the order
# train, test, validation.
# NOTE(review): no seed is passed here - confirm split_ttf is deterministic,
# otherwise the scores below change on every re-run.
train, test, validation = split_ttf(df)

In [10]:
# Report how many rows ended up in each split (one line per split).
print('\n'.join([
    'train      {}'.format(len(train)),
    'test       {}'.format(len(test)),
    'validation {}'.format(len(validation)),
]))

train      706
test       236
validation 236


In [11]:
# Pair every preprocessing variant with the short label that is later used
# in output file names and printed results.
variants = list(zip(
    ('i', 'ii', 'iii', 'iv'),
    (variant_i, variant_ii, variant_iii, variant_iv),
))

In [14]:
# For every variant: transform the three splits, remember the transformed
# frames under the variant's label, and export each of them to a CSV file in
# the 'fast' directory for the classifiers trained later in the notebook.
variant_dfs = {}

for name, variant in variants:
    # Evaluated left to right: train, then validation, then test.
    v_train, v_validation, v_test = variant(train), variant(validation), variant(test)

    # Stored in (train, validation, test) order - note this differs from the
    # (train, test, validation) order in which the raw split was unpacked.
    variant_dfs[name] = (v_train, v_validation, v_test)

    prepare_files(v_train, 'fast/{}_train.csv'.format(name))
    prepare_files(v_validation, 'fast/{}_validation.csv'.format(name))
    prepare_files(v_test, 'fast/{}_test.csv'.format(name))

# SVM + TF-IDF
---

## Analysis

In [238]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from preprocess import extract_tokens
from util import build_vocabulary
from svm import teach_svm, evaluate, svm_show_scores
from vis import plot_metrics_for

In [15]:
# Stop-word list loaded from a file next to the notebook; it is handed to the
# SVM training and scoring helpers below.
# NOTE(review): the "-pl" suffix suggests a Polish list - confirm.
stopwords = read_stopwords('./stopwords-pl.txt')

In [267]:
# Build a vocabulary from the full frame with drop_threshold=1000, exempting
# the two listed tokens from dropping.
# NOTE(review): this value is reassigned in the scoring cell (drop_threshold=0)
# and is not passed to teach_svm, so nothing visible in this notebook reads
# it - confirm whether this cell is still needed.
# NOTE(review): the leading double quote inside '"ust.' / '"art.' looks
# deliberate (tokens that open a quoted passage) - verify against the tokenizer.
vocabulary = build_vocabulary(df, drop_threshold=1000, drop_exceptions=['"ust.', '"art.'])

In [16]:
# Sanity check of the training split: row count, uniqueness of file names and
# texts, and the label distribution (rendered as the cell's output).
train.describe()

Unnamed: 0,bill_file,text,is_amendment
count,706,706,706
unique,706,706,2
top,1997_684.txt,Art. 1.\n\nW ustawie z dnia 4 marca 1994 r. o ...,True
freq,1,1,364


In [294]:
# Train the SVM on the train split and score it on the validation split.
# The helper prints the best score together with the tol / C / kernel / drop
# values that produced it and returns the classifier, its metric and the
# collected metrics (the latter feed plot_metrics_for below).
# NOTE(review): the full frame `df` (which includes the test rows) is passed
# in as well - presumably for vocabulary building; confirm this does not leak
# test information into training.
# NOTE(review): the recorded best validation score (~0.49) is close to chance
# for a two-class label - worth investigating before trusting the results.
clf, clf_metric, metrics = teach_svm(train, validation, df, stopwords)

Score 0.4915254237288136 for tol=1e-11, c=0.1, kernel=rbf, drop=0


In [299]:
# Rebuild the vocabulary with drop_threshold=0.
# NOTE(review): presumably chosen to match the "drop=0" reported for the best
# score during training - confirm, and consider deriving it from the training
# result instead of hard-coding it.
vocabulary = build_vocabulary(df, drop_threshold=0, drop_exceptions=['"ust.', '"art.'])
# Print precision, recall and F1 of the trained SVM on the test split.
svm_show_scores(clf, test, vocabulary, stopwords)

Precision: 0.3034329215742603
Recall:    0.5508474576271186
F1 score:  0.3913124015930351


In [None]:
# Plot the collected SVM metrics against each tuned hyper-parameter,
# restricted to the RBF kernel.
# NOTE(review): this cell reads the notebook-global `metrics`; it must be run
# while that name still refers to the SVM training result. The saved notebook
# shows the cell as never executed.
plot_metrics_for('C', metrics, kernels=['rbf'])
plot_metrics_for('tol', metrics, kernels=['rbf'])
plot_metrics_for('drop_threshold', metrics, kernels=['rbf'])

## Results

In [131]:
from variants import *

# fastText
---

In [231]:
import fastText

from fasttext import prepare_files, teach_fasttext, fasttext_show_scores

In [235]:
# Train one fastText classifier per preprocessing variant, using the CSV
# files exported for that variant, and keep the (classifier, metrics) pair
# under the variant's label.
fasttext_results = {}
for name in variant_dfs:
    print('Training classifier: {}'.format(name))
    # Deliberately NOT named `clf` / `metrics`: those notebook-global names
    # hold the SVM classifier and its grid-search metrics, which the SVM
    # scoring and plotting cells still need after this cell has run.
    ft_clf, ft_metrics = teach_fasttext(
        'fast/{}_train.csv'.format(name),
        'fast/{}_validation.csv'.format(name))
    fasttext_results[name] = (ft_clf, ft_metrics)
    print('------------------------------------')

Training classifier: i
Score 0.5847457627118644 for lr=0.1, wordNgrams=1
Score 0.5889830508474576 for lr=0.1, wordNgrams=2
Score 0.826271186440678 for lr=0.6, wordNgrams=1
Score 0.8601694915254238 for lr=1.1, wordNgrams=1
Score 0.8728813559322034 for lr=1.6, wordNgrams=1
Score 0.8771186440677965 for lr=2.1, wordNgrams=2
Score 0.8813559322033898 for lr=2.1, wordNgrams=3
Score 0.885593220338983 for lr=2.6, wordNgrams=2
Score 0.8898305084745762 for lr=3.1, wordNgrams=3
------------------------------------
Training classifier: ii
Score 0.559322033898305 for lr=0.1, wordNgrams=1
Score 0.5635593220338984 for lr=0.1, wordNgrams=3
Score 0.75 for lr=0.6, wordNgrams=1
Score 0.7584745762711863 for lr=1.1, wordNgrams=2
Score 0.7796610169491526 for lr=1.6, wordNgrams=2
------------------------------------
Training classifier: iii
Score 0.5847457627118644 for lr=0.1, wordNgrams=1
Score 0.5889830508474576 for lr=0.1, wordNgrams=2
Score 0.614406779661017 for lr=0.1, wordNgrams=3
Score 0.70338983050847

In [237]:
# Print test-set precision / recall / F1 for every trained fastText variant.
for name, ft_result in fasttext_results.items():
    print("Variant {}".format(name))
    # Each result is a (classifier, metrics) pair; only the classifier is
    # needed here. Avoid binding `_`: assigning to it in a notebook stops
    # IPython from updating its last-output variable.
    ft_clf = ft_result[0]
    # variant_dfs stores (train, validation, test); take the test frame
    # without rebinding the notebook-global `test` split, which the SVM
    # scoring cell relies on.
    v_test = variant_dfs[name][2]
    fasttext_show_scores(ft_clf, v_test)
    print('-----------------------------')

Variant i
Precision: 0.8940040293514223
Recall:    0.8940677966101694
F1 score:  0.8940197076870566
-----------------------------
Variant ii
Precision: 0.7623735727549287
Recall:    0.7627118644067796
F1 score:  0.7624876008107991
-----------------------------
Variant iii
Precision: 0.7976569569789907
Recall:    0.7966101694915254
F1 score:  0.7969044256120528
-----------------------------
Variant iv
Precision: 0.6158098972486198
Recall:    0.6186440677966102
F1 score:  0.6156156530408775
-----------------------------


# Flair 
---

In [15]:
from flair.data_fetcher import NLPTaskDataFetcher
from flair.trainers import ModelTrainer
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.embeddings import StackedEmbeddings, CharLMEmbeddings, TokenEmbeddings

from pathlib import Path

ModuleNotFoundError: No module named 'flair.data_fetcher'; 'flair' is not a package

In [301]:
# Load the variant "i" CSV exports from the 'fast' directory as a flair
# classification corpus (train / dev / test files named explicitly).
corpus = NLPTaskDataFetcher.load_classification_corpus(
    Path('./fast'),
    train_file='i_train.csv',
    dev_file='i_validation.csv',
    test_file='i_test.csv',
)

# Token-level embeddings fed into the document embedding; a single
# 'pl' word-embedding model is used.
word_embeddings = [WordEmbeddings('pl')]

2019-05-13 15:17:17,469 Reading data from fast
2019-05-13 15:17:17,471 Train: fast/i_train.csv
2019-05-13 15:17:17,472 Dev: fast/i_validation.csv
2019-05-13 15:17:17,473 Test: fast/i_test.csv


In [None]:
# Build the document-level RNN embedding on top of the word embeddings, with
# word re-projection enabled (dimension 256) and an RNN hidden size of 512.
# NOTE(review): the classifier cell below needs `document_embeddings`; the
# saved notebook shows this cell as never executed, which is why that cell's
# recorded output is a NameError. Run this cell first.
document_embeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256)

[autoreload of torch failed: Traceback (most recent call last):
  File "/usr/local/Cellar/ipython/7.2.0/libexec/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 244, in check
    superreload(m, reload, self.old_objects)
  File "/usr/local/Cellar/ipython/7.2.0/libexec/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 376, in superreload
    module = reload(module)
  File "/usr/local/Cellar/python/3.7.3/Frameworks/Python.framework/Versions/3.7/lib/python3.7/imp.py", line 314, in reload
    return importlib.reload(module)
  File "/usr/local/Cellar/python/3.7.3/Frameworks/Python.framework/Versions/3.7/lib/python3.7/importlib/__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 630, in _exec
  File "<frozen importlib._bootstrap_external>", line 728, in exec_module
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "/usr/local/lib/python3.7/site-packages/torch/

In [305]:
# Single-label text classifier over the document embeddings; the label
# dictionary is derived from the corpus itself.
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=corpus.make_label_dictionary(),
    multi_label=False)

# Train for at most 10 epochs.
# NOTE(review): './fast' is also the directory the corpus CSVs are read from,
# so training output presumably lands next to the input data - confirm this is
# intended or point the trainer at a separate output directory.
trainer = ModelTrainer(classifier, corpus)
trainer.train('./fast', max_epochs=10)

NameError: name 'document_embeddings' is not defined