# AutoCom: Compute ranking score for citing sentences

In [15]:
# Install deps
!pip install nltk
!pip install git+https://github.com/openlegaldata/legal-reference-extraction.git#egg=legal-reference-extraction
!pip install spacy

Collecting legal-reference-extraction from git+https://github.com/openlegaldata/legal-reference-extraction.git#egg=legal-reference-extraction
  Cloning https://github.com/openlegaldata/legal-reference-extraction.git to /tmp/pip-install-uh42eutc/legal-reference-extraction
Collecting nltk==3.2.2 (from legal-reference-extraction)
[?25l  Downloading https://files.pythonhosted.org/packages/13/ce/cba8bf82c8ab538d444ea4ab6f4eb1d80340c7b737d7a8d1f08b429fccae/nltk-3.2.2.tar.gz (1.2MB)
[K    100% |████████████████████████████████| 1.2MB 7.8MB/s eta 0:00:01
Building wheels for collected packages: legal-reference-extraction, nltk
  Running setup.py bdist_wheel for legal-reference-extraction ... [?25ldone
[?25h  Stored in directory: /tmp/pip-ephem-wheel-cache-45ab2tye/wheels/78/e2/14/308a1aa4449610f017210056c635ef23ee7493c22ddeac5831
  Running setup.py bdist_wheel for nltk ... [?25ldone
[?25h  Stored in directory: /home/debian/.cache/pip/wheels/64/db/e2/39e07b414a807d7aa0350c58417f61fd8654eca

In [9]:
# Change path to your data dir
data_dir = '../data/'

In [12]:
import json
import re
import locale
import os
import sys

import spacy

In [3]:
# OLDP deps
from utils import preprocessing
from refex.extractor import RefExtractor

law_extractor = RefExtractor()
law_extractor.do_law_refs = True
law_extractor.do_case_refs = False


case_extractor = RefExtractor()
case_extractor.do_law_refs = False
case_extractor.do_case_refs = True

lit_pattern = re.compile('(NJW|MDR)\s?([0-9]{2,4})')


In [47]:
# Test ref ex
test_text = 'Kammer, namentlich auch die Feststellungen zu den Folgen, die dem   (§§ 288, 291 BGB) entfallen lässt (grundlegend BGH Urteil vom 24. Juni 1981 - IVa ZR 104/80 - NJW 1981, 2244 f.; BGH Urteile vom 7. Oktober 1982 -VII ZR 163/81 - WM 1983, 21, 22 und vom 15. März 2012 - IX ZR 35/11 - NJW2012, 1717 Rn. 11 mwN; vgl. auch BAG NZA 2008, 757 Rn. 16).  bei einem längeren Verbleib in Tunesien drohen, gebunden (vgl. § 42 Satz 1 AsylG)'
_, markers = law_extractor.extract(test_text)
markers

[<RefMarker({'text': '§§ 288, 291 BGB', 'start': 69, 'end': 84, 'line': '', 'uuid': UUID('dc89aec9-32cd-48d9-b575-4108cc648500'), 'references': [<Ref(law: bgb/288)>, <Ref(law: bgb/291)>]})>,
 <RefMarker({'text': '§ 42 Satz 1 AsylG', 'start': 411, 'end': 428, 'line': '', 'uuid': UUID('09e3475a-0d73-4ec1-8afa-d7b92bd1a391'), 'references': [<Ref(law: asylg/42)>]})>]

In [52]:

lit_pattern.findall(test_text)

[('NJW', '1981'), ('NJW', '2012')]

## Spacy Text Classification

In [22]:
# Load model
model_path = os.path.join(data_dir, 'autocom_model')
nlp = spacy.load(model_path)
predict_label = 'POSITIVE'

In [23]:
# Predict
doc = nlp(test_text)
doc.cats['POSITIVE']

0.9598762392997742

In [53]:
# Load cases from dump file (alternatively we could get cases from API: search for "Streitwert")
file_path = os.path.join(data_dir, 'courtDecisionsPreprocessed.json')
n = 10000

book = 'BGB'
number = '439'

sents = []
sents_dict = {}
out = []


with open(file_path, 'r') as f:
    for case_json in [next(f) for x in range(n)]:  # Read line-by-line the first n lines (one case per line)
        case = json.loads(case_json)  # Parse JSON
        has_score = False
        
        for fs_i, fs in enumerate(case['fundstellen']):  # Iterate over citations
        #for fs in case['fundstellen']:  # Iterate over citations
            
            if 'gesetze' in fs:
                for g in fs['gesetze']:
                    # Test if it refers to the requested law
                    if 'book' in g and 'number' in g and g['book'] == book and g['number'] == number:
                        sent = fs['sentencesRechtssatz']
                        
                        if sent in sents_dict:
                            continue
                        else:
                            sents_dict[sent] = 1
                        
                        _, case_markers = case_extractor.extract(sent)
                        _, law_markers = law_extractor.extract(sent)
                        
                        law_cits = [[r for r in m.references] for m in law_markers]
                        lit_cits = lit_pattern.findall(sent)
                        
                        pos = case['text'].find(sent)
                        case_length = len(case['text'])
                        rel_pos = pos / case_length
                        law_cit_count =  min(len(law_cits) - 1, 0)  # Do not count itself
                        
                        sent_data = {
                            'case_cit_count': len(case_markers), 
                            'law_cit_count': law_cit_count,
                            'lit_cit_count': len(lit_cits),
                            'pos': pos, 
                            'rel_pos': rel_pos,
                            'case_length': case_length,
                            'score': law_cit_count,
                            'text': sent,
                            'cat': nlp(sent).cats['POSITIVE']
                        }
                        
                        sents.append(sent_data)
                        
                        case['fundstellen'][fs_i]['rank_score'] = sent_data['score']
                        
                        has_score = True
        if has_score:
            out.append(json.dumps(case))
            
        #print('###########')
    
        
        
print('done')

done


In [51]:
# Sort sentences by score        
sorted_sents = sorted(sents, key=lambda k: k['cat'], reverse=True) 

print(json.dumps(sorted_sents, indent=4))

[
    {
        "case_cit_count": 1,
        "law_cit_count": 0,
        "lit_cit_count": 1,
        "pos": -1,
        "rel_pos": -4.575611988103409e-05,
        "case_length": 21855,
        "score": 0,
        "text": "Die Nichterf\u00fcllung des Nacherf\u00fcllungsanspruchs aus \u00a7 439 Abs. 1 BGB stellt in diesem Fall keine Pflichtverletzung im Sinne des \u00a7 280 Abs. 1 und 3, \u00a7 281 Abs. 1 BGB (i.V. m. \u00a7 437 Nr. 3 BGB) dar, denn \u00a7 280 Abs. 1 und 3, \u00a7 281 Abs. 1 BGB setzen voraus, dass dem Gl\u00e4ubiger ein Anspruch aus dem Schuldverh\u00e4ltnis zusteht, der nicht durch eine dauernde oder aufschiebende Einrede gehemmt ist (vgl. BGH, Urteil vom 7. M\u00e4rz 2013 -VII ZR 162/12, NJW2013, 1431 Rn. 20).",
        "cat": 0.9999405145645142
    },
    {
        "case_cit_count": 2,
        "law_cit_count": 0,
        "lit_cit_count": 1,
        "pos": -1,
        "rel_pos": -3.335779571685903e-05,
        "case_length": 29978,
        "score": 0,
        "text": 

In [54]:
len(sorted_sents)

17

In [55]:
with open(os.path.join(data_dir, 'cases_with_scores.jsonl'), 'w') as f:
    f.write('\n'.join(out))