In [13]:
import glob
import heapq
import itertools
import json
import math
import operator
import os
import re
from collections import Counter
from functools import partial

import loglikelihood
import nltk
import numpy as np
from bs4 import BeautifulSoup

In [14]:
def create_dir_if_doesnt_exist(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)


YEAR = 2009
PATH = 'F:\\saos\\data\\json\\judgments-*'
HELPERS_DIR = 'helpers'

create_dir_if_doesnt_exist(HELPERS_DIR)

FILENAMES_FOR_YEAR = os.path.join(HELPERS_DIR, f'judgments_from_{YEAR}.json')
WORD_COUNTS_FILENAME = os.path.join(HELPERS_DIR, 'counts.json')
BIGRAMS_COUNTS_FILENAME = os.path.join(HELPERS_DIR, 'bigrams_counts.json')
NUMBER_OF_RESULTS = 30

In [15]:
year_pattern = str(YEAR) + '-\d{2}-\d{2}'

def get_filenames_with_judgments_for_year(judgment_filenames):
    with open(FILENAMES_FOR_YEAR, 'w') as file:
        filenames_for_year = filter(contains_judgments_from_year, judgment_filenames)
        json.dump(list(filenames_for_year), file)


def contains_judgments_from_year(filename):
    return any(re.match(year_pattern, item['judgmentDate']) for item in get_judgments(filename))


def get_judgments(filename):
    with open(filename, encoding='utf-8') as file:
        content = json.load(file)
    return (item for item in content['items'] if re.match(year_pattern, item['judgmentDate']))


def clear_text(text: str):
    return re.sub('-\n', '', BeautifulSoup(text).get_text())


def tokenize_and_filter(text):
    yield from filter(is_word, map(str.lower, nltk.word_tokenize(text, language='polish')))


def is_vowel(letter):
    return letter in ('a', 'ą', 'e', 'ę', 'i', 'o', 'ó', 'u', 'y')


def is_word(word: str):
    return word.isalpha()

In [16]:
def generate_words(texts):
    for judgment_text in texts:
        yield from tokenize_and_filter(judgment_text)


def generate_bigrams(texts):
    yield from nltk.bigrams(generate_words(texts))

In [17]:
def pointwise_mutual_information(bigram, *, word_counter, bigrams_counter, words_count, bigrams_count):
    first_word, second_word = bigram
    bigram_prob = bigrams_counter[bigram] / bigrams_count
    first_word_prob = word_counter[first_word] / words_count
    second_word_prob = word_counter[second_word] / words_count
    return math.log(bigram_prob / (first_word_prob * second_word_prob))

In [18]:
def log_likelihood_ratio(bigram, *, word_counter, bigrams_counter, words_count, bigrams_count):
    first_word, second_word = bigram
    k11 = bigrams_counter[bigram] / bigrams_count
    k12 = word_counter[second_word] / words_count - k11
    k21 = word_counter[first_word] / words_count - k11
    k22 = 1 - (k12 + k21 - k11)
    matrix = np.matrix([
        [k11, k12],
        [k21, k22]
    ])
    return loglikelihood.llr(matrix)

In [19]:
all_judgment_filenames = glob.glob(PATH)

if not os.path.isfile(FILENAMES_FOR_YEAR):
    get_filenames_with_judgments_for_year(all_judgment_filenames)

with open(FILENAMES_FOR_YEAR) as file:
    filenames_with_judgments_for_year = json.load(file)
    judgments = itertools.chain.from_iterable(map(get_judgments, filenames_with_judgments_for_year))

judgment_texts = (clear_text(judgment['textContent']) for judgment in judgments)

In [20]:
if not os.path.isfile(WORD_COUNTS_FILENAME):
    counter = Counter(generate_words(judgment_texts))
    sorted_counter = dict(sorted(counter.items(), key=operator.itemgetter(1), reverse=True))
    with open(WORD_COUNTS_FILENAME, 'w') as file:
        json.dump(sorted_counter, file)

with open(WORD_COUNTS_FILENAME) as file:
    word_counter = json.load(file)

In [21]:
if not os.path.isfile(BIGRAMS_COUNTS_FILENAME):
    counter = Counter(generate_bigrams(judgment_texts))
    sorted_counter = dict(sorted(counter.items(), key=operator.itemgetter(1), reverse=True))
    with open(BIGRAMS_COUNTS_FILENAME, 'w') as file:
        json_to_dump = [{'key': key, 'value': value} for key, value in sorted_counter.items()]
        json.dump(json_to_dump, file)

with open(BIGRAMS_COUNTS_FILENAME) as file:
    bigrams_counter = {tuple(item['key']): item['value'] for item in json.load(file)}

In [22]:
words_count = sum(word_counter.values())
bigrams_count = sum(bigrams_counter.values())

In [23]:
def get_results(bigrams_counter, func):
    bigrams_with_func_values = {key: func(key) for key in bigrams_counter}
    print(func.func.__name__)
    for bigram, func_value in heapq.nlargest(NUMBER_OF_RESULTS, bigrams_with_func_values.items(), key=operator.itemgetter(1)):
        print(bigram, func_value)

In [24]:
func = partial(pointwise_mutual_information, word_counter=word_counter, bigrams_counter=bigrams_counter, 
              words_count=words_count, bigrams_count=bigrams_count)
get_results(bigrams_counter, func)

pointwise_mutual_information
('napawania', 'łukowego') 16.09302540207235
('przyjeżdżają', 'mochody') 16.09302540207235
('systematycznością', 'stabilnością') 16.09302540207235
('osadnika', 'śużla') 16.09302540207235
('puhb', 'cewogaz') 16.09302540207235
('pre', 'fabrykat') 16.09302540207235
('uścikowiec', 'oborniki') 16.09302540207235
('transmisjami', 'piłkarskimi') 16.09302540207235
('rozjaśnione', 'rozbielone') 16.09302540207235
('diagności', 'laboratoryjni') 16.09302540207235
('histochemicznych', 'immunopatologicznych') 16.09302540207235
('immunopatologicznych', 'mikroskopii') 16.09302540207235
('przydomowego', 'ogródka') 16.09302540207235
('przetwórcom', 'rybnym') 16.09302540207235
('książeczką', 'wkładową') 16.09302540207235
('wkładową', 'walutową') 16.09302540207235
('walutową', 'książeczka') 16.09302540207235
('kobylarnia', 'brzoza') 16.09302540207235
('societe', 'anonyme') 16.09302540207235
('navigation', 'aerienne') 16.09302540207235
('aerienne', 'sabena') 16.09302540207235
('r

In [25]:
func = partial(log_likelihood_ratio, word_counter=word_counter, bigrams_counter=bigrams_counter, 
              words_count=words_count, bigrams_count=bigrams_count)
get_results(bigrams_counter, func)

log_likelihood_ratio
('z', 'dnia') 0.18531051883654623
('zamówień', 'publicznych') 0.14532207123628244
('na', 'podstawie') 0.13198481579815652
('sygn', 'akt') 0.11827442509884811
('ust', 'pkt') 0.1151114664099067
('zgodnie', 'z') 0.10711877035704384
('sąd', 'najwyższy') 0.1036572114616913
('w', 'dniu') 0.09969245989930654
('sp', 'z') 0.09783296514906864
('trybunał', 'konstytucyjny') 0.09664021601960636
('związku', 'z') 0.0960044613352229
('gr', 'słownie') 0.0931042117594651
('ust', 'ustawy') 0.09273527783356715
('sądu', 'najwyższego') 0.09026066773749081
('ustawy', 'pzp') 0.08959393587128982
('na', 'rzecz') 0.0894411150589385
('przez', 'zamawiającego') 0.08931707480307122
('w', 'związku') 0.08842643579657747
('sądu', 'okręgowego') 0.08691197935424622
('sąd', 'okręgowy') 0.08564659055017163
('w', 'sprawie') 0.08529106041886966
('w', 'postępowaniu') 0.08472758291847376
('otk', 'zu') 0.08373242192961379
('urzędu', 'zamówień') 0.08328394061297413
('zero', 'groszy') 0.08223610834567639
('tr

In [29]:
func = partial(pointwise_mutual_information, word_counter=word_counter, bigrams_counter=bigrams_counter, 
              words_count=words_count, bigrams_count=bigrams_count)
get_results({key: value for key, value in bigrams_counter.items() if value >= 1000}, func)

pointwise_mutual_information
('trybunale', 'konstytucyjnym') 8.678131611827096
('równego', 'traktowania') 8.518284157328502
('punktu', 'widzenia') 8.444136037340279
('przede', 'wszystkim') 8.348890641119182
('przetargu', 'nieograniczonego') 8.247823342260315
('związania', 'ofertą') 8.150378911979955
('nakazuje', 'zaliczyć') 8.113752846202376
('siedemdziesiąt', 'cztery') 8.06168027418893
('działalność', 'gospodarczą') 8.025352473526059
('dochodów', 'własnych') 8.02506028661144
('rzeczypospolitej', 'polskiej') 7.970345247712421
('izbie', 'cywilnej') 7.957234659629259
('dalszego', 'biegu') 7.931726960159475
('sądów', 'powszechnych') 7.926927000094623
('posiedzeniu', 'niejawnym') 7.917434430614341
('rachunku', 'dochodów') 7.810191297010222
('uczciwej', 'konkurencji') 7.787196262491714
('nieuczciwej', 'konkurencji') 7.781557579167587
('otk', 'zu') 7.759022601084957
('cztery', 'złote') 7.75102660406432
('zero', 'groszy') 7.748815740870011
('złote', 'zero') 7.728555113649864
('ubezpieczeń', '