In [1]:
import functools
import glob
import heapq
import itertools
import json
import operator
import os
import pickle
import re
from collections import Counter, namedtuple

import loglikelihood
import numpy as np
import requests
from bs4 import BeautifulSoup

In [2]:
def create_dir_if_doesnt_exist(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)


YEAR = 2009
PATH = 'F:\\saos\\data\\json\\judgments-*'
RESULTS_DIR = 'results'
HELPERS_DIR = 'helpers'

create_dir_if_doesnt_exist(RESULTS_DIR)
create_dir_if_doesnt_exist(HELPERS_DIR)

FILENAMES_FOR_YEAR = os.path.join(HELPERS_DIR, f'judgments_from_{YEAR}.json')
UNIGRAMS_COUNTS_FILENAME = os.path.join(HELPERS_DIR, 'unigrams_counts.pkl')
BIGRAMS_COUNTS_FILENAME = os.path.join(HELPERS_DIR, 'bigrams_counts.pkl')
NUMBER_OF_RESULTS = 30
TAGGER_SERVER_URL = 'http://localhost:9200'

In [3]:
TaggingResult = namedtuple('TaggingResult', 'word tag')


def generate_unigrams_from_response(response):
    for line in response.split('\n'):
        if not line.startswith('\t'):
            continue
        _, word, tags, _ = line.split('\t')
        tag = tags.split(':')[0]
        yield TaggingResult(word.lower(), tag)


def generate_bigrams_from_response(response):
    yield from pairwise(generate_unigrams_from_response(response))


def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = itertools.tee(iterable)
    next(b, None)
    return zip(a, b)

In [4]:
year_pattern = str(YEAR) + '-\d{2}-\d{2}'


def get_filenames_with_judgments_for_year(judgment_filenames):
    with open(FILENAMES_FOR_YEAR, 'w') as file:
        filenames_for_year = filter(contains_judgments_from_year, judgment_filenames)
        json.dump(list(filenames_for_year), file)


def contains_judgments_from_year(filename):
    return any(re.match(year_pattern, item['judgmentDate']) for item in get_judgments(filename))


def get_judgments(filename):
    with open(filename, encoding='utf-8') as file:
        content = json.load(file)
    return (item for item in content['items'] if re.match(year_pattern, item['judgmentDate']))


def clear_text(text: str):
    return re.sub('-\n', '', BeautifulSoup(text).get_text())

In [5]:
def generate_bigrams(texts):
    for index, judgment_text in enumerate(texts):
        print(index)
        resp = requests.post(TAGGER_SERVER_URL, data=judgment_text.encode('utf-8'))
        yield from generate_bigrams_from_response(resp.text)


def generate_unigrams(texts):
    for index, judgment_text in enumerate(texts):
        print(index)
        resp = requests.post(TAGGER_SERVER_URL, data=judgment_text.encode('utf-8'))
        yield from generate_unigrams_from_response(resp.text)

In [6]:
def is_noun(tagging_result):
    return tagging_result.tag in ('subst', 'depr')  # , 'num', 'numcol')


def is_adjective(tagging_result):
    return tagging_result.tag in ('adj', 'adja', 'adjp', 'adjc')

In [7]:
def log_likelihood_ratio(bigram, *, word_counter, bigrams_counter, words_count, bigrams_count):
    first_word, second_word = bigram
    k11 = bigrams_counter[bigram] / bigrams_count
    k12 = word_counter[second_word] / words_count - k11
    k21 = word_counter[first_word] / words_count - k11
    k22 = 1 - (k12 + k21 - k11)
    matrix = np.matrix([
        [k11, k12],
        [k21, k22]
    ])
    return loglikelihood.llr(matrix)

In [8]:
all_judgment_filenames = glob.glob(PATH)

if not os.path.isfile(FILENAMES_FOR_YEAR):
    get_filenames_with_judgments_for_year(all_judgment_filenames)

with open(FILENAMES_FOR_YEAR) as file:
    filenames_with_judgments_for_year = json.load(file)
    judgments = itertools.chain.from_iterable(map(get_judgments, filenames_with_judgments_for_year))

judgment_texts = (clear_text(judgment['textContent']) for judgment in judgments)

In [9]:
if not os.path.isfile(UNIGRAMS_COUNTS_FILENAME):
    counter = Counter(generate_unigrams(judgment_texts))
    with open(UNIGRAMS_COUNTS_FILENAME, 'wb') as file:
        pickle.dump(counter, file)

with open(UNIGRAMS_COUNTS_FILENAME, 'rb') as file:
    unigrams_counter: Counter = pickle.load(file)

In [10]:
if not os.path.isfile(BIGRAMS_COUNTS_FILENAME):
    counter = Counter(generate_bigrams(judgment_texts))
    with open(BIGRAMS_COUNTS_FILENAME, 'wb') as file:
        pickle.dump(counter, file)

with open(BIGRAMS_COUNTS_FILENAME, 'rb') as file:
    bigrams_counter: Counter = pickle.load(file)

In [11]:
unigrams_count = sum(unigrams_counter.values())

In [12]:
bigrams_count = sum(bigrams_counter.values())

In [13]:
filtered_bigrams = {bigram for bigram in bigrams_counter
                    if is_noun(bigram[0]) and (is_noun(bigram[1]) or is_adjective(bigram[1]))}

In [14]:
func = functools.partial(log_likelihood_ratio, word_counter=unigrams_counter, bigrams_counter=bigrams_counter,
                         words_count=unigrams_count, bigrams_count=bigrams_count)
filtered_bigrams_with_llr = {key: func(key) for key in filtered_bigrams}
best_results = heapq.nlargest(NUMBER_OF_RESULTS, filtered_bigrams_with_llr.items(), key=operator.itemgetter(1))
for bigram, llr_value in best_results:
    print(bigram, llr_value)

(TaggingResult(word='zamówienie', tag='subst'), TaggingResult(word='publiczny', tag='adj')) 0.12023906530528296
(TaggingResult(word='sąd', tag='subst'), TaggingResult(word='wysoki', tag='adj')) 0.10748892403330568
(TaggingResult(word='sąd', tag='subst'), TaggingResult(word='okręgowy', tag='adj')) 0.10044403384405268
(TaggingResult(word='trybunał', tag='subst'), TaggingResult(word='konstytucyjny', tag='adj')) 0.09992386282213396
(TaggingResult(word='skarga', tag='subst'), TaggingResult(word='kasacyjny', tag='adj')) 0.08758262299041475
(TaggingResult(word='sąd', tag='subst'), TaggingResult(word='apelacyjny', tag='adj')) 0.08488378690037288
(TaggingResult(word='ubezpieczenie', tag='subst'), TaggingResult(word='społeczny', tag='adj')) 0.07424202479837948
(TaggingResult(word='zero', tag='subst'), TaggingResult(word='grosz', tag='subst')) 0.06968564432890756
(TaggingResult(word='przedmiot', tag='subst'), TaggingResult(word='zamówienie', tag='subst')) 0.0671478616649279
(TaggingResult(word='s

In [15]:
for bigram, llr_value in best_results:
    first, second = bigram
    print((first.word, second.word), llr_value)

('zamówienie', 'publiczny') 0.12023906530528296
('sąd', 'wysoki') 0.10748892403330568
('sąd', 'okręgowy') 0.10044403384405268
('trybunał', 'konstytucyjny') 0.09992386282213396
('skarga', 'kasacyjny') 0.08758262299041475
('sąd', 'apelacyjny') 0.08488378690037288
('ubezpieczenie', 'społeczny') 0.07424202479837948
('zero', 'grosz') 0.06968564432890756
('przedmiot', 'zamówienie') 0.0671478616649279
('skarga', 'konstytucyjny') 0.06605923689692522
('sąd', 'rejonowy') 0.0644495373232521
('urząd', 'zamówienie') 0.0643901805074235
('skarb', 'państwo') 0.062092661498156586
('ustawa', 'pzp') 0.060571773544677646
('warunek', 'udział') 0.059437307091861444
('działalność', 'gospodarczy') 0.059301122291177503
('prawo', 'zamówienie') 0.05758634784275847
('dzień', '29') 0.05731238532615411
('wyrok', 'sąd') 0.05708039643880093
('specyfikacja', 'istotny') 0.05697121094800172
('posiedzenie', 'niejawny') 0.054178614753456866
('izba', 'odwoławczy') 0.05304385145838533
('rzeczpospolita', 'polski') 0.05265629