In [3]:
import glob
import itertools
import json
import operator
import os
import pickle
import re
from collections import Counter, namedtuple, defaultdict

import nltk
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [4]:
def create_dir_if_doesnt_exist(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)


YEAR = 2009
PATH = 'F:\\saos\\data\\json\\judgments-*'
RESULTS_DIR = 'results'
HELPERS_DIR = 'helpers'

create_dir_if_doesnt_exist(RESULTS_DIR)
create_dir_if_doesnt_exist(HELPERS_DIR)

FILENAMES_FOR_YEAR = os.path.join(HELPERS_DIR, f'judgments_from_{YEAR}.json')
WORD_COUNTS_FILENAME = os.path.join(HELPERS_DIR, f'word_counts_{YEAR}.json')
TAGGED_TEXTS = os.path.join(HELPERS_DIR, f'tagged_texts_{YEAR}.pkl')
NUMBER_OF_RESULTS = 30
TEST_SIZE = 0.25
TAGGER_SERVER_URL = 'http://localhost:9200'

In [5]:
year_pattern = str(YEAR) + '-\d{2}-\d{2}'


def get_filenames_with_judgments_for_year(judgment_filenames):
    with open(FILENAMES_FOR_YEAR, 'w') as file:
        filenames_for_year = filter(contains_judgments_from_year, judgment_filenames)
        json.dump(list(filenames_for_year), file)


def contains_judgments_from_year(filename):
    return any(re.match(year_pattern, item['judgmentDate']) for item in get_judgments(filename))


def get_judgments(filename):
    with open(filename, encoding='utf-8') as file:
        content = json.load(file)
    return (item for item in content['items']
            if re.match(year_pattern, item['judgmentDate']) and item['courtType'].upper() in ('COMMON', 'SUPREME')
            and 'Uzasadnienie' in item['textContent'])


def clear_text(text: str, most_common_words, is_original=True):  # False for tagged texts
    start_word = 'Uzasadnienie' if is_original else 'uzasadnienie'
    text = skip_part_before(text, start_word)
    html_clear_text = BeautifulSoup(text, 'lxml').get_text()
    return ' '.join(filter(
        lambda word: word not in most_common_words,
        tokenize(re.sub('-\n', '', html_clear_text)))
    )


def skip_part_before(text, start_word):
    start_word_index = text.find(start_word) + len(start_word) + 1
    return text[start_word_index:]


def tokenize(text):
    yield from filter(is_word, map(str.lower, nltk.word_tokenize(text, language='polish')))

    
def is_word(word: str):
    return word.isalpha() and len(word) > 1

In [6]:
TaggingResult = namedtuple('TaggingResult', 'word tag')


def generate_unigrams_from_response(response):
    for line in response.split('\n'):
        if not line.startswith('\t'):
            continue
        _, word, tags, _ = line.split('\t')
        tag = tags.split(':')[0]
        yield TaggingResult(word.lower(), tag)

        
def generate_tagged_texts(judgments):
    texts = map(operator.itemgetter('textContent'), judgments)
    for index, judgment_text in enumerate(texts):
        print(index)
        resp = requests.post(TAGGER_SERVER_URL, data=judgment_text.encode('utf-8'))
        if not resp.ok:
            yield None
        else:
            result = ' '.join(map(operator.attrgetter('word'), generate_unigrams_from_response(resp.text)))
            with open(os.path.join('tagged', f'{index}.txt'), 'w', encoding='utf-8') as file:
                file.write(result)
            yield result

In [7]:
all_judgment_filenames = glob.glob(PATH)

if not os.path.isfile(FILENAMES_FOR_YEAR):
    get_filenames_with_judgments_for_year(all_judgment_filenames)

with open(FILENAMES_FOR_YEAR) as file:
    filenames_with_judgments_for_year = json.load(file)
    judgments = list(itertools.chain.from_iterable(map(get_judgments, filenames_with_judgments_for_year)))

In [8]:
if not os.path.isfile(TAGGED_TEXTS):    
    tagged_judgment_texts = list(generate_tagged_texts(judgments))
    with open(TAGGED_TEXTS, 'wb') as file:
        pickle.dump(tagged_judgment_texts, file)
        
with open(TAGGED_TEXTS, 'rb') as file:
    tagged_judgment_texts = pickle.load(file)

In [9]:
None in tagged_judgment_texts

False

In [10]:
judgments = list(itertools.compress(judgments, tagged_judgment_texts))
tagged_judgment_texts = [text for text in tagged_judgment_texts if text is not None]

In [11]:
def get_words_counter():
    with open(WORD_COUNTS_FILENAME) as file:
        word_counter = Counter(json.load(file))
    return word_counter


def get_most_common_words(k=20):
    counter = get_words_counter()
    return list(map(operator.itemgetter(0), counter.most_common(k)))

In [12]:
most_common_words = get_most_common_words()

In [13]:
most_common_words

['w',
 'z',
 'i',
 'na',
 'do',
 'nie',
 'o',
 'że',
 'przez',
 'ust',
 'się',
 'dnia',
 'jest',
 'a',
 'oraz',
 'ustawy',
 'od',
 'sąd',
 'nr',
 'postępowania']

In [14]:
Text = namedtuple('Text', ['original_form', 'base_form'])


def extract_signature(item):
    signature = item['courtCases'][0]['caseNumber'].split(' ')
    if len(signature) == 2:
        return signature[0]
    return signature[1]
    
    
def get_group(signature):
    # A?C.* - sprawy cywilne
    # A?U.* - sprawy z zakresu ubezpieczenia społecznego
    # A?K.* - sprawy karne
    # G.* - sprawy gospodarcze
    # A?P.* - sprawy w zakresie prawa pracy
    # R.* - sprawy w zakresie prawa rodzinnego
    # W.* - sprawy o wykroczenia
    # Am.* - sprawy w zakresie prawa konkurencji
    regexes = {
        'civil': 'A?C.*',
        'social_security': 'A?U.*',
        'penal': 'A?K.*',
        'economy': 'G.*',
        'employment': 'A?P.*',
        'family': 'R.*',
        'offence': 'W.*',
        'competition': 'Am.*'
    }
    for case_type, regex in regexes.items():
        if re.match(regex, signature):
            return case_type
    return None

def create_groups(judgments, tagged_judgment_texts, most_common_words):
    groups = defaultdict(list)
    unmatched = 0
    for judgment, tagged_judgment_text in zip(judgments, tagged_judgment_texts):
        signature = extract_signature(judgment)
        group = get_group(signature)
        if group:
            original_form_text = clear_text(judgment['textContent'], most_common_words)
            base_form_text = clear_text(tagged_judgment_text, most_common_words, is_original=False)      
            groups[group].append(Text(original_form_text, base_form_text))
        else:
            unmatched += 1
            print('Group not found for', signature, judgment['courtCases'])
    print(unmatched, 'unmatched judgments')
    return groups

In [15]:
groups = create_groups(judgments, tagged_judgment_texts, most_common_words)

Group not found for SNO [{'caseNumber': 'SNO 25/09'}]
Group not found for SNO [{'caseNumber': 'SNO 10/09'}]
Group not found for SPP [{'caseNumber': 'III SPP 14/09'}]
Group not found for SK [{'caseNumber': 'III SK 4/09'}]
Group not found for SK [{'caseNumber': 'III SK 5/09'}]
Group not found for SNO [{'caseNumber': 'SNO 59/09'}]
Group not found for BU [{'caseNumber': 'I BU 10/09'}]
Group not found for SNO [{'caseNumber': 'SNO 88/09'}]
Group not found for BP [{'caseNumber': 'II BP 12/09'}]
Group not found for SK [{'caseNumber': 'III SK 30/09'}]
Group not found for SK [{'caseNumber': 'III SK 34/09'}]
Group not found for SK [{'caseNumber': 'III SK 36/09'}]
12 unmatched judgments


In [16]:
for case_type, texts in groups.items():
    print(case_type, len(texts))

social_security 66
employment 85
civil 1439
penal 12


In [17]:
# load groups dict for judgments from 2010 to get better classification results
with open(os.path.join(HELPERS_DIR, 'groups_2010.pkl'), 'rb') as file:
    groups_2010 = pickle.load(file)
    
for case_type, texts in groups_2010.items():
    print(case_type, len(texts))
    groups[case_type].extend(texts)

employment 181
civil 1662
social_security 224
penal 17
competition 3


In [18]:
groups = {case_type: texts for case_type, texts in groups.items() if len(texts) > 99}

In [19]:
for case_type, texts in groups.items():
    print(case_type, len(texts))

social_security 290
employment 266
civil 3101


In [20]:
sum(len(texts) for texts in groups.values())

3657

In [21]:
def print_results(y_test, predictions, average):
    print(average, 'average')
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, predictions, average=average)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1:', f1)

    
def train_classifiers(x_train, x_test, y_train, y_test, form):
    assert form in ('original_form', 'base_form')
    train_x = [getattr(text, form) for text in x_train]
    test_x = [getattr(text, form) for text in x_test]
    for group, group_texts in groups.items():
        classifier = Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
        ])
        train_y = [int(y == group) for y in y_train]
        test_y = [int(y == group) for y in y_test]
        classifier.fit(train_x, train_y)
        predictions = classifier.predict(test_x)
        print('Group:', group)
        print('Group size:', sum(len(getattr(text, form).split(' ')) for text in group_texts), 'words')
        print_results(test_y, predictions, average='micro')
        print_results(test_y, predictions, average='macro')
        print()
        print()

In [22]:
labels = list(itertools.chain.from_iterable([case_type for _ in range(len(texts))] for case_type, texts in groups.items())) 
judgment_texts = list(itertools.chain.from_iterable(texts for texts in groups.values()))
x_train, x_test, y_train, y_test = train_test_split(judgment_texts, labels, test_size=TEST_SIZE)

In [23]:
print('Original texts')
train_classifiers(x_train, x_test, y_train, y_test, form='original_form')

Original texts
Group: social_security
Group size: 360409 words
micro average
Precision: 0.9901639344262295
Recall: 0.9901639344262295
F1: 0.9901639344262295
macro average
Precision: 0.9774241414814366
Recall: 0.9616139138582263
F1: 0.9693534641563922


Group: employment
Group size: 495362 words
micro average
Precision: 0.9715846994535519
Recall: 0.9715846994535519
F1: 0.9715846994535519
macro average
Precision: 0.9578282378108566
Recall: 0.8384984183471729
F1: 0.8875411261959687


Group: civil
Group size: 2574854 words
micro average
Precision: 0.9770491803278688
Recall: 0.9770491803278688
F1: 0.9770491803278688
macro average
Precision: 0.9804899330118402
Recall: 0.9369976278649077
F1: 0.9570949136874258




In [24]:
print('Tagged texts')
train_classifiers(x_train, x_test, y_train, y_test, form='base_form')

Tagged texts
Group: social_security
Group size: 414021 words
micro average
Precision: 0.9923497267759562
Recall: 0.9923497267759562
F1: 0.9923497267759562
macro average
Precision: 0.9843513415298892
Recall: 0.9683117149298743
F1: 0.9761638054549717


Group: employment
Group size: 561540 words
micro average
Precision: 0.9748633879781421
Recall: 0.9748633879781421
F1: 0.9748633879781421
macro average
Precision: 0.9689511009937399
Recall: 0.8529804270462633
F1: 0.9012551319648094


Group: civil
Group size: 2993096 words
micro average
Precision: 0.9814207650273225
Recall: 0.9814207650273225
F1: 0.9814207650273225
macro average
Precision: 0.9832178789378195
Recall: 0.9499846408519208
F1: 0.9656489813945784


