In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm

In [None]:
# Read information to connect to the database and put it in environment variables
import os

# Every line of ../ENVVARS.txt that contains an '=' is treated as KEY=VALUE;
# lines without an '=' are ignored.
with open('../ENVVARS.txt') as f:
    for line in f:
        # Split on the FIRST '=' only, so that values which themselves contain
        # '=' (e.g. passwords) are kept intact instead of being skipped.
        key, sep, value = line.partition('=')
        key = key.strip()
        if sep and key:
            os.environ[key] = value.strip()

In [None]:
%%time
# read error data
import re

def parse_line(line):
    """Extract the wordform and its corrections from one line of the error list.

    Line format, as handled by the regular expressions below:

    * the wordform starts with ``*`` and runs up to the first tab or ``#``;
    * corrections start with ``#`` (there can be multiple) and end with
      whitespace or ``~``;
    * example text starts with ``~``;
    * 'rules' start with ``<space>[``.

    Args:
        line (str): a single line of the input file.

    Returns:
        tuple: ``(wf, corrections)``. ``wf`` is the stripped wordform and
        ``corrections`` a list of cleaned, non-empty correction strings.
        When the line does not start with ``*`` + wordform followed by a tab
        or ``#``, ``(None, [])`` is returned so callers can skip the line.
    """
    # get the wordform
    wf_regex = r'^\*(?P<wf>.+?)[\t#]'
    m = re.match(wf_regex, line)
    if m is None:
        # Not a wordform line: report that explicitly instead of crashing on
        # m.group(); the calling loop checks for `wf is not None`.
        return None, []
    # Wordforms need to be stripped!
    # Whitespace before or after wordforms also leads to duplicate entries in the database.
    wf = m.group('wf').strip()

    # get example text (and remove it)
    # The greedy `.+` runs to the end of the line, so everything from the
    # first '~' onwards is dropped.
    ex_regex = r'~.+~?'
    line = re.sub(ex_regex, '', line)

    # remove 'rule'
    rule_regex = r'\[EA?XAMPL: .+\]'
    line = re.sub(rule_regex, '', line)

    # get the corrections
    corrections = []
    corr_regex = r'#(?P<corr>.+)'
    m = re.search(corr_regex, line)
    if m:
        for raw in m.group('corr').split('#'):
            # Skip empty segments ('##') and segments of 100+ characters.
            if raw == '' or len(raw) >= 100:
                continue
            # Wordforms need to be stripped!
            # Whitespace before or after wordforms also leads to duplicate entries in the database.
            # NOTE(review): removing tabs glues a trailing tab-separated field
            # onto the correction ('#foo<tab>1' becomes 'foo1') - confirm that
            # this is intended.
            cleaned = raw.strip().replace('\t', '')
            if cleaned:
                # Whitespace-only segments would otherwise yield an empty wordform.
                corrections.append(cleaned)

    return wf, corrections

# One {'wf': ..., 'corr': ...} record per (wordform, correction) pair.
corrections = []

# File is in windows-1252 encoding and needs to be converted to utf-8
# NOTE(review): absolute, user-specific path - adjust it to your own copy of the file.
in_file = '/home/jvdzwaan/Downloads/TWENTE.noxml.2002.sq.clean.norm.tok.sortu.unifrq.LC.noapekrol.allasterisk.12.withcorrections.12186.txt'

num_lines = 0
# Read the file explicitly as UTF-8 so the non-ASCII wordforms are decoded the
# same way on every platform instead of depending on the locale's default
# encoding (assumes in_file is the already converted copy - confirm).
with open(in_file, encoding='utf-8') as f:
    for num_lines, line in enumerate(f, start=1):
        wf, corr = parse_line(line)
        if wf is not None:
            corrections.extend({'wf': wf, 'corr': c} for c in corr)

# Name the columns explicitly so the frame has 'wf' and 'corr' columns even
# when no corrections were found.
data = pd.DataFrame(corrections, columns=['wf', 'corr'])
print(num_lines)

In [None]:
# Example line in which a tab-separated field precedes the '#'-separated corrections.
parse_line('*variëiten	1#1#variëteiten\n')

In [None]:
# Example line in which the correction directly follows the wordform and a tab-separated field comes last.
parse_line('*toestemmignbesluit#toestemmingenbesluit	1\n')

In [None]:
# Preview the first rows of the parsed (wf, corr) pairs.
data.head()

In [None]:
# True only when every wordform occurs in a single row, i.e. no wordform has more than one correction.
data['wf'].is_unique

In [None]:
# (number of rows, number of columns); there is one row per (wf, corr) pair.
data.shape

In [None]:
import MySQLdb
import textwrap

import pandas as pd


def do_query(query):
    """Run a SQL query against the local MySQL database and return a DataFrame.

    A new connection to localhost:3306 is opened for every call, using the
    'user', 'password' and 'dbname' environment variables as they are set at
    call time, and it is closed again before returning.

    Args:
        query (str): the SQL statement to execute.

    Returns:
        pandas.DataFrame: the query result; when several columns share a
        name, only the first of them is kept.
    """
    cn = MySQLdb.connect(host='localhost',
                         port=3306,
                         user=os.environ.get('user'),
                         passwd=os.environ.get('password'),
                         db=os.environ.get('dbname'))
    try:
        df_mysql = pd.read_sql(query, con=cn)
    finally:
        # Release the connection even when the query fails.
        cn.close()
    # deduplicate columns
    return df_mysql.loc[:, ~df_mysql.columns.duplicated()]

# List the tables of the configured database; the result is kept in `tables` but not displayed.
tables = do_query('SHOW TABLES;')
#tables

In [None]:
# Load the complete wordforms table into a DataFrame and preview its first rows.
wordforms = do_query('SELECT * FROM wordforms;')
wordforms.head()

In [None]:
# Corrections that do not occur among the wordforms loaded from the database.
lexicon_wfs = set(wordforms['wordform'])
error_wfs = set(data['corr'])

new_words = error_wfs - lexicon_wfs
print(len(new_words))

In [None]:
import edlib

# Edit distance between every wordform and its correction.
# Iterating the two columns directly avoids building a Series object per row
# (as the row-wise DataFrame.apply did) and also works for an empty frame.
data['ed'] = [
    edlib.align(wf_text, corr_text)['editDistance']
    for wf_text, corr_text in zip(data['wf'], data['corr'])
]
data

In [None]:
# Largest edit distance between a wordform and its correction.
data['ed'].max()

In [None]:
# Histogram of the edit distances (50 bins).
data['ed'].hist(bins=50, figsize=(15,8))

In [None]:
# Pairs whose edit distance is larger than 2.
data.query('ed > 2')

In [None]:
# How many corrections per wordform?
# Rows are counted through a helper column of ones. Only that column is
# summed: summing the whole frame would also aggregate the 'corr' strings and
# any other column, which says nothing about the number of corrections.
data['one'] = 1
d = data.groupby('wf')[['one']].sum()
# Wordforms that have more than one correction.
d[d['one'] > 1]

## Add spelling correction list to database

In [None]:
# Select the database to write to.
# NOTE: this overwrites any 'dbname' value set earlier, so do_query() calls
# made after this cell connect to this database as well (it reads the
# variable at call time).
db_name = 'ticclat'
os.environ['dbname'] = db_name

In [None]:
from ticclat.ticclat_schema import Lexicon, Wordform, Anahash

from ticclat.dbutils import get_session, session_scope

# Build the object that session_scope() is given to open database sessions
# (presumably a SQLAlchemy sessionmaker - confirm in ticclat.dbutils).
# Raises KeyError when one of the three environment variables is not set.
Session = get_session(os.environ['user'], os.environ['password'], os.environ['dbname'])

In [None]:
# Every distinct string that occurs as a correction or as a wordform, each
# flagged as not having an analysis.
wfs = (
    pd.concat([data['corr'], data['wf']])
    .to_frame('wordform')
    .drop_duplicates(subset='wordform')
    .assign(has_analysis=False)
)
wfs.head()

In [None]:
# (number of distinct wordforms, number of columns).
wfs.shape

In [None]:
# Sanity check: expected to be True, because duplicates were dropped on this column.
wfs['wordform'].is_unique

In [None]:
# Look at the correction pairs once more before writing to the database.
data.head()

In [None]:
# Number of (wf, corr) rows and columns at this point.
data.shape

In [None]:
%%time
from ticclat.dbutils import add_lexicon

# Hand the wordforms to add_lexicon under the name 'Twente spelling correction list'.
# NOTE(review): what add_lexicon does with wordforms that are already in the
# database is defined in ticclat.dbutils - confirm before re-running this cell.
with session_scope(Session) as session:
    add_lexicon(session, 'Twente spelling correction list', wfs)

### Now add the links between the words...

In [None]:
%%time

# Link every wordform to its correction, in both directions.
# NOTE(review): re-running this cell appends the same links again - confirm
# whether the schema guards against duplicates.
with session_scope(Session) as session:
    # Iterate the two columns directly; this avoids building a Series per row.
    pairs = zip(data['wf'], data['corr'])
    for wf_text, corr_text in tqdm(pairs, total=data.shape[0]):
        wf = session.query(Wordform).filter(Wordform.wordform == wf_text).first()
        corr = session.query(Wordform).filter(Wordform.wordform == corr_text).first()
        # .first() returns None when nothing matches; fail with a message that
        # names the wordform instead of an AttributeError on `None.links`.
        if wf is None or corr is None:
            missing = wf_text if wf is None else corr_text
            raise ValueError('Wordform not found in the database: {!r}'.format(missing))
        wf.links.append(corr)
        corr.links.append(wf)

In [None]:
# Spot check: print the stored links for the first few (wf, corr) pairs.
with session_scope(Session) as session:
    sample = data.head()
    for wf_text, corr_text in zip(sample['wf'], sample['corr']):
        print(wf_text)
        wf = session.query(Wordform).filter(Wordform.wordform == wf_text).first()
        wf_links = [str(linked) for linked in wf.links]
        print(wf, wf_links)
        corr = session.query(Wordform).filter(Wordform.wordform == corr_text).first()
        corr_links = [str(linked) for linked in corr.links]
        print(corr, corr_links)

Open question: how do we add a reference to the lexicon to the relation (link) between two wordforms?

https://docs.sqlalchemy.org/en/latest/orm/basic_relationships.html#many-to-many