In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local packages are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import logging

# Log line layout: timestamp, [pid], level name padded to 8 characters,
# logger name and line number, then a tab and the message.
log_format = "%(asctime)s [%(process)d] %(levelname)-8s %(name)s,%(lineno)s\t%(message)s"
logging.basicConfig(format=log_format)

# The root logger's level is set in a separate call, independent of basicConfig.
root_logger = logging.getLogger()
root_logger.setLevel('INFO')

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm

In [None]:
# Read information to connect to the database and put it in environment variables.
# Each usable line of ENVVARS.txt has the form KEY=VALUE; lines without '=' or
# with an empty key are ignored.
import os
with open('../ENVVARS.txt') as f:
    for line in f:
        # Split on the first '=' only, so values that themselves contain '='
        # (e.g. a password) are kept instead of being silently skipped.
        key, sep, value = line.partition('=')
        key = key.strip()
        if sep and key:
            os.environ[key] = value.strip()

In [None]:
%%time
# read error data
import re

def parse_line(line):
    """Extract the wordform and its corrections from one line of the error list.

    Line layout, as handled below:
      * the wordform starts with ``*`` and runs up to the first tab or ``#``;
      * corrections start with ``#`` (there can be multiple) and end with
        whitespace or ``~``;
      * example text starts with ``~``;
      * 'rules' look like ``[EXAMPL: ...]`` or ``[EAXAMPL: ...]``.

    Args:
        line (str): a single raw line from the input file.

    Returns:
        tuple: ``(wf, corrections)``. ``wf`` is the stripped wordform, or
        ``None`` when the line does not start with ``*<wordform>`` followed by
        a tab or ``#``, or when the wordform is blank after stripping.
        ``corrections`` is a (possibly empty) list of non-empty, stripped
        correction strings.
    """
    # get the wordform
    wf_regex = r'^\*(?P<wf>.+?)[\t#]'
    m = re.match(wf_regex, line)
    if m is None:
        # Not an entry line: report "no wordform" instead of raising
        # AttributeError on the missing match object.
        return None, []

    # Wordforms need to be stripped!
    # Whitespace before or after wordforms also leads to duplicate entries in the database.
    wf = m.group('wf').strip()
    if not wf:
        # A wordform consisting only of whitespace is not usable.
        return None, []

    # remove the example text: from the first '~' up to the end of the line
    # (the greedy '.+' swallows any closing '~' as well)
    ex_regex = r'~.+~?'
    line = re.sub(ex_regex, '', line)

    # remove 'rule'
    rule_regex = r'\[EA?XAMPL: .+\]'
    line = re.sub(rule_regex, '', line)

    # get the corrections: everything after the first '#', split on '#'
    corr_regex = r'#(?P<corr>.+)'
    m = re.search(corr_regex, line)
    if m is None:
        return wf, []

    corrections = []
    for piece in m.group('corr').split('#'):
        # Pieces of 100 characters or more are skipped.
        if len(piece) >= 100:
            continue
        # Corrections need to be stripped too, for the same reason as wordforms.
        # NOTE(review): tabs inside a piece are deleted, so a trailing
        # tab-separated field is glued onto the correction ('foo\t1' -> 'foo1');
        # confirm this is intended.
        cleaned = piece.strip().replace('\t', '')
        # Filter after cleaning, so whitespace-only pieces do not produce
        # empty corrections.
        if cleaned:
            corrections.append(cleaned)

    return wf, corrections

# File is in windows-1252 encoding and needs to be converted to utf-8
# NOTE(review): open() below does not pass an encoding, so the platform default
# is used; presumably the file must be converted before this cell runs. Confirm.
in_file = '/home/jvdzwaan/Downloads/TWENTE.noxml.2002.sq.clean.norm.tok.sortu.unifrq.LC.noapekrol.allasterisk.12.withcorrections.12186.txt'

# One record per (wordform, correction) pair found in the file.
corrections = []
num_lines = 0
with open(in_file) as fh:
    for num_lines, raw_line in enumerate(fh, start=1):
        wordform, candidates = parse_line(raw_line)
        if wordform is None:
            # Lines without a wordform contribute nothing.
            continue
        corrections.extend({'wf': wordform, 'corr': candidate} for candidate in candidates)

data = pd.DataFrame(corrections)
print(num_lines)

In [None]:
# Spot-check parse_line on a line where a tab-separated field sits between the
# wordform and the first '#'.
parse_line('*variëiten	1#1#variëteiten\n')

In [None]:
# Spot-check parse_line on a line where the correction follows the wordform
# directly and a tab-separated field trails the correction.
parse_line('*toestemmignbesluit#toestemmingenbesluit	1\n')

In [None]:
# Preview the first rows of the parsed pairs (columns 'wf' and 'corr').
data.head()

In [None]:
# Name of the target database. This overwrites any 'dbname' value already
# present in os.environ (for example one loaded from ENVVARS.txt).
db_name = 'ticclat'
os.environ['dbname'] = db_name

In [None]:
from ticclat.ticclat_schema import Lexicon, Wordform, Anahash, WordformLink, WordformLinkSource

from ticclat.dbutils import get_session, session_scope

# Create the Session object passed to session_scope() further down, using the
# credentials stored in the environment. A missing 'user', 'password' or
# 'dbname' key raises KeyError here.
Session = get_session(os.environ['user'], os.environ['password'], os.environ['dbname'])

In [None]:
%%time
from ticclat.dbutils import add_lexicon_with_links

# The lexicon name is the input file name without its '.txt' extension.
lexicon_name = 'TWENTE.noxml.2002.sq.clean.norm.tok.sortu.unifrq.LC.noapekrol.allasterisk.12.withcorrections.12186'

# Store the parsed pairs: 'wf' and 'corr' name the DataFrame columns to use.
# NOTE(review): the three positional booleans are passed through unchanged;
# their meaning is defined by add_lexicon_with_links. Check its signature.
with session_scope(Session) as session:
    add_lexicon_with_links(session, lexicon_name, False, data, 'wf', 'corr', False, True)