In [1]:
!pip install python-crfsuite
import pycrfsuite
import json
import string
import random

# Base directory that every data/model path in this notebook is appended to
# with plain string concatenation, so the trailing slash is required.
# NOTE(review): looks like a mounted Google Drive path in Colab - confirm.
folder = "/content/drive/MyDrive/Afstudeerproject/"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
class Featurizer():
    """Convert a citation string into per-token feature dicts with context windows.

    Every token gets a dict of surface features (capitalisation, digits,
    punctuation, a few keyword flags and optional word-list lookups). The
    features of the ``window`` tokens on either side are then copied into the
    token's dict under ``"<offset>:<feature>"`` keys.

    NOTE(review): the feature names and values are presumably what the saved
    CRF model was trained on, so their semantics are intentionally left
    exactly as they were; only crashes and unwanted side effects are fixed.
    """

    def __init__(self, window=2, check_punct=False, position=False,
                 check_wordlist=False, check_namelist=False, check_titlelist=False,
                 add_token=False):
        """Configure which features are produced.

        Args:
            window: number of neighbouring tokens on each side whose features
                are copied into a token's feature dict; also the number of
                "BOS"/"EOS" padding tokens added around the citation.
            check_punct: stored on the instance but not read by any method of
                this class.
            position: if True, add a relative "position" feature per token.
            check_wordlist: if True, load ``Data/Wordlists/wordlist.txt`` and
                add an "inWordlist" feature.
            check_namelist: if True, load ``known_names.txt`` and
                ``surnames.txt`` and add an "inNamelist" feature.
            check_titlelist: if True, load ``only_titles.txt`` and add an
                "inTitlelist" feature.
            add_token: if True, store the raw token text under "token".

        The list files are read relative to the module-level ``folder``
        variable, which must exist when any ``check_*list`` flag is True.
        """
        self.window = window
        self.check_punct = check_punct
        self.position = position
        self.check_wordlist = check_wordlist
        self.check_namelist = check_namelist
        self.check_titlelist = check_titlelist
        self.add_token = add_token
        self.punctuation = string.punctuation
        # Tokens that (with dots removed) set the "isRand" flag.
        # NOTE(review): presumably abbreviations of "Randnummer" - confirm.
        self.randnumbers = ["Rn", "Rdnr", "Rdn", "Rz", "RdNr", "Rdz"]
        # Template copied for every token before the checks fill it in.
        self.standard_features = {
                "caps": "None",
                "digits": "None",
                "is_four_digits": False,
                "isAufl": False,
                "isRand": False,
                "isIn": False,
                "isPar": False,
                "isBOS": False,
                "isEOS": False,
                "hasPunct": False,
            }

        # The lists are stored as sets: they are only used for membership
        # tests, once per token, and a list scan per token is needlessly slow.
        if check_wordlist:
            self.wordlist = self._load_wordlist("Data/Wordlists/wordlist.txt")

        if check_namelist:
            self.namelist = self._load_wordlist("Data/Wordlists/known_names.txt")
            # Loaded for parity with the original code; no method here reads it.
            self.surnames = self._load_wordlist("Data/Wordlists/surnames.txt")

        if check_titlelist:
            self.titlelist = self._load_wordlist("Data/Wordlists/only_titles.txt")

    @staticmethod
    def _load_wordlist(relative_path):
        """Return the lines of ``folder + relative_path`` (newlines removed) as a set."""
        with open(folder + relative_path, "r") as f:
            return {line.replace("\n", "") for line in f}

    def featurize_citation(self, citation, parsed=None):
        """Tokenise ``citation`` on spaces and return windowed feature dicts.

        Args:
            citation: the citation text.
            parsed: optional sequence of ``(token, label)`` pairs. When given
                and non-empty, every token gets a "label" feature: the label
                of the first not-yet-used pair whose token text matches, or
                "other". The caller's sequence is not modified.

        Returns:
            A list with one feature dict per real token (padding excluded),
            as produced by ``add_window``.

        In labeling mode a stand-alone "," token is dropped with probability
        0.25 (via ``random.random()``), so labelled output is not
        deterministic unless the ``random`` module is seeded.
        """
        # Consume matches from a private copy so the caller's list survives.
        remaining = list(parsed) if parsed else []
        labeling = bool(remaining)

        tokens = citation.split(" ")
        tokens = ["BOS"] * self.window + tokens + ["EOS"] * self.window
        featurized = []
        # ``tokens`` grows while it is being iterated: a comma glued to a
        # token is re-inserted as its own token right after the current one.
        for i, token in enumerate(tokens):
            if token == "" or token == " ":
                continue
            elif "," in token and token != ",":
                tokens.insert(i + 1, ",")
                token = token.replace(",", "")
                if token == "":
                    # The token consisted only of commas (e.g. ",,"); the
                    # inserted "," represents it, and an empty token would
                    # crash the character checks.
                    continue
            token_features = self.standard_features.copy()
            if labeling:
                if token == "," and random.random() < 0.25:
                    continue
                for idx, parsed_token in enumerate(remaining):
                    if token == parsed_token[0]:
                        token_features["label"] = parsed_token[1]
                        # Each parsed pair may label at most one token.
                        del remaining[idx]
                        break
                if "label" not in token_features:
                    token_features["label"] = "other"

            if self.add_token:
                token_features["token"] = token

            if self.position:
                # Index relative to the first real token, divided by the
                # current number of non-padding entries in ``tokens``.
                token_features["position"] = (i - self.window)/(len(tokens) - self.window * 2)

            token_features["length"] = len(token)

            self.check_chars(token, token_features)
            self.check_token(token, token_features)
            self.check_lists(token, token_features)

            featurized.append(token_features)

        featurized = self.add_window(featurized)

        return featurized


    def check_chars(self, token, token_features):
        """Fill in the character-level features of ``token`` in place.

        Sets "hasPunct", "digits" ("mixed"/"all"), "is_four_digits" (the token
        contains exactly four digit characters) and "caps"
        ("first"/"mixed"/"all"). Features that do not apply keep their
        current value. An empty token leaves ``token_features`` untouched.
        """
        if not token:
            return

        caps = 0
        digits = 0
        for char in token:
            if char.isdigit():
                digits += 1
            elif char.isupper():
                caps += 1
            elif char in self.punctuation:
                token_features["hasPunct"] = True

        if digits > 0 and digits < len(token):
            token_features["digits"] = "mixed"
        elif digits == len(token):
            token_features["digits"] = "all"
        if digits == 4:
            token_features["is_four_digits"] = True

        if token[0].isupper() and caps == 1:
            token_features["caps"] = "first"
        elif caps > 1 and caps < len(token):
            token_features["caps"] = "mixed"
        elif caps == len(token):
            token_features["caps"] = "all"


    def check_token(self, token, token_features):
        """Set at most one keyword flag for ``token`` in place; returns True.

        The checks are mutually exclusive and tried in this order: "isAufl",
        "isRand", "isIn", "isPar", "isBOS", "isEOS".
        """
        if "Aufl" in token:
            token_features["isAufl"] = True
        elif token.replace(".", "") in self.randnumbers:
            token_features["isRand"] = True
        elif token.replace(":", "") == "in":
            token_features["isIn"] = True
        elif "§" in token:
            token_features["isPar"] = True
        elif token == "BOS":
            token_features["isBOS"] = True
        elif token == "EOS":
            token_features["isEOS"] = True

        return True


    def check_lists(self, token, token_features):
        """Add the enabled list-membership features in place; returns True.

        The word list is matched against the lower-cased token; the name and
        title lists are matched against the token as-is.
        """
        if self.check_wordlist:
            token_features["inWordlist"] = token.lower() in self.wordlist

        if self.check_namelist:
            token_features["inNamelist"] = token in self.namelist

        if self.check_titlelist:
            token_features["inTitlelist"] = token in self.titlelist

        return True


    def add_window(self, featurized):
        """Return copies of the non-padding feature dicts with neighbour features added.

        The first and last ``self.window`` entries of ``featurized`` are used
        only as context. For every other entry, each neighbour at offset ``j``
        (``-window..window``, excluding 0) contributes its features under
        ``"{j}:{feature}"`` keys; "token", "label" and "position" are not
        copied from neighbours.
        """
        windowed = []
        for i in range(self.window, len(featurized)-self.window):
            token_features = featurized[i].copy()
            for j in range(-self.window, self.window + 1):
                if j == 0:
                    continue
                for feature, value in featurized[i + j].items():
                    if feature == "token" or feature == "label" or feature == "position":
                        continue
                    token_features[f"{j}:{feature}"] = value
            windowed.append(token_features)

        return windowed

In [2]:
# Open the saved CRF model from the project folder so `tagger` can label
# featurized citations in the cells below.
crf_model_path = folder + "Models/crf.model"
tagger = pycrfsuite.Tagger()
tagger.open(crf_model_path)

<contextlib.closing at 0x7fc5e7a91210>

In [3]:
# Collect the citations of both block files, one citation per line, with the
# line breaks removed. The four-block file comes first, then the three-block file.
all_citations = []
for block_file in ("Data/four_block.txt", "Data/three_block.txt"):
    with open(folder + block_file, "r") as f:
        all_citations.extend(line.replace("\n", "") for line in f.readlines())

In [4]:
# Featurizer used for tagging: all list look-ups enabled, and the raw token
# text kept in the features so predictions can be mapped back to tokens.
featurizer_options = {
    "check_punct": True,
    "check_wordlist": True,
    "check_namelist": True,
    "check_titlelist": True,
    "add_token": True,
}
citation_featurizer = Featurizer(**featurizer_options)

NameError: ignored

In [None]:
from tabulate import tabulate
from tqdm.notebook import tqdm

In [None]:
# Replace `all_citations` with the lines of the Auflage data file
# (one citation per line, line breaks removed).
with open(folder + "Data/auflage_data.txt") as f:
    all_citations = [line.replace("\n", "") for line in f]

In [None]:
# Merge the known-name citations with the Auflage data and drop duplicates.
lines = []
for data_file in ("Data/known_name_citations.txt", "Data/auflage_data.txt"):
    with open(folder + data_file, "r") as f:
        lines.extend(line.replace("\n", "") for line in f)

# dict.fromkeys keeps the first occurrence of every citation in file order.
# The previous set()/list() round trip produced an arbitrary order that could
# differ between kernel sessions, which made the output file irreproducible.
lines = list(dict.fromkeys(lines))
print(len(lines))

all_citations = lines

In [None]:
# Single hand-picked example. Running this cell replaces whatever citation
# list was loaded above, so the cells below process only this one citation.
all_citations = [
    "Musielak/Heinrich, ZPO, 7. Aufl. 2009, § 32 Rn. 1",
]

In [None]:
# Labels collected per citation, in the order they appear in the output.
LABEL_NAMES = ("title", "author", "editor", "edition", "year", "entry", "other")

# Tokens removed from the "entry" field (first occurrence of each only).
# NOTE(review): presumably spellings/fragments of "m.w.N." - confirm.
ENTRY_NOISE_TOKENS = ["M.W.N", "m.w.N", "m.", "M.", "w.", "W.", "n.", "N.", "m.w.N.", "M.W.N."]


def _format_edition(edition_tokens):
    """Normalise the tokens tagged as edition into a single string.

    If any token contains "Auf", the result is "<first token> Auflage" when
    the first token starts with a digit (a lone token such as "7.Aufl." is
    first cut at its first "."), and "" otherwise. Tokens without "Auf" are
    joined with spaces, like every other label.
    """
    if not edition_tokens:
        return ""
    # Substring test per token: `"Auf" in edition_tokens` would only match a
    # token that is exactly "Auf", which the tokenizer practically never yields.
    if not any("Auf" in token for token in edition_tokens):
        return " ".join(edition_tokens)
    parts = list(edition_tokens)
    if len(parts) == 1:
        parts = parts[0].split(".")
        parts[0] = parts[0] + "."
    if parts[0][:1].isdigit():
        return parts[0] + " Auflage"
    return ""


def _format_entry(entry_tokens):
    """Join the entry tokens with spaces after dropping the noise tokens."""
    remaining = list(entry_tokens)
    for noise in ENTRY_NOISE_TOKENS:
        if noise in remaining:
            remaining.remove(noise)
    return " ".join(remaining)


# NOTE: `all` shadows the builtin of the same name; the name is kept because
# the following cells read the results from it.
all = []
for citation in tqdm(all_citations):
    json_citation = {"citation": citation}

    # Split multi-name parts into separate tokens: on "/" when the citation
    # contains one, otherwise on "-".
    if "/" in citation:
        normalised = citation.replace("/", " ")
    else:
        normalised = citation.replace("-", " ")

    featurized = citation_featurizer.featurize_citation(normalised)
    # Requires a featurizer created with add_token=True.
    tokens = [token_features["token"] for token_features in featurized]
    y_pred = tagger.tag(featurized)

    # Group the tokens by predicted tag; tags outside LABEL_NAMES are ignored.
    grouped = {name: [] for name in LABEL_NAMES}
    for token, tag in zip(tokens, y_pred):
        if tag in grouped:
            grouped[tag].append(token)

    # Every label ends up as a string so the JSON has a uniform shape.
    labelled = {}
    for label, label_tokens in grouped.items():
        if label == "author" or label == "editor":
            labelled[label] = "/".join(label_tokens)
        elif label == "edition":
            labelled[label] = _format_edition(label_tokens)
        elif label == "entry":
            labelled[label] = _format_entry(label_tokens)
        else:
            labelled[label] = " ".join(label_tokens)

    json_citation["labels"] = labelled
    all.append(json_citation)


In [None]:
# Show the first labelled citation as a quick sanity check of the output.
all[0]

In [None]:
# Persist all labelled citations as one JSON array in the project folder.
labelled_json_path = folder + 'Data/labelled_data.json'
serialized = json.dumps(all)
with open(labelled_json_path, 'w') as outfile:
    outfile.write(serialized)