In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import logging

# Log line layout: timestamp, process id, left-aligned level name (8 chars),
# logger name, line number, then a tab followed by the message.
log_format = "%(asctime)s [%(process)d] %(levelname)-8s %(name)s,%(lineno)s\t%(message)s"
logging.basicConfig(format=log_format)

# The level is set on the root logger in a separate step, so it takes effect
# even when basicConfig() leaves an already-configured root logger untouched.
root_logger = logging.getLogger()
root_logger.setLevel('DEBUG')

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from tqdm import tqdm_notebook as tqdm

In [None]:
# Read information to connect to the database and put it in environment variables.
# Each line of ../ENVVARS.txt that contains an '=' is treated as KEY=VALUE;
# lines without '=' (or with an empty key) are ignored.
import os
with open('../ENVVARS.txt') as f:
    for line in f:
        # Split on the FIRST '=' only: values that themselves contain '='
        # (e.g. passwords) must be kept intact rather than silently skipped.
        key, separator, value = line.partition('=')
        # Strip the key as well, so "user = me" sets 'user' and not 'user '.
        key = key.strip()
        if separator and key:
            os.environ[key] = value.strip()

In [None]:
# Database to work on; set db_name to 'ticclat_test' to target the test database.
db_name = 'ticclat'

# Expose the choice through the environment, where the connection setup reads it.
os.environ.update({'dbname': db_name})

In [None]:
from ticclat.ticclat_schema import Lexicon, Wordform, Anahash, Document, Corpus, WordformLink, WordformLinkSource, lexical_source_wordform

from ticclat.dbutils import get_session, session_scope

# Connection settings come from the environment ('user', 'password', 'dbname');
# a missing variable raises KeyError here rather than later on.
db_user = os.environ['user']
db_password = os.environ['password']
db_schema = os.environ['dbname']

# Session is passed to session_scope() in the cells below.
Session = get_session(db_user, db_password, db_schema)

In [None]:
import glob

# NOTE(review): machine-specific absolute path; adjust to your local data location.
in_dir = '/Users/jvdzwaan/data/ticclat/SGD_ticcl_variants/'

# sorted(): glob returns matches in arbitrary (file-system dependent) order;
# sorting makes the row order of `data` reproducible between runs.
in_files = sorted(glob.glob('{}/*'.format(in_dir)))

if not in_files:
    # Fail with a clear message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError('No input files found in {}'.format(in_dir))

dfs = []

for in_file in in_files:
    # The files are '#'-separated without a header row.
    # keep_default_na=False + na_values=['']: only empty fields become NaN.
    # With the pandas defaults, legitimate wordforms such as "null", "nan",
    # "NA" or "None" would be silently converted to float NaN.
    df = pd.read_csv(in_file, sep='#', header=None,
                     keep_default_na=False, na_values=[''])
    # Assigning the names afterwards (rather than via names=) keeps the
    # built-in check that every file has exactly seven columns.
    # '?1' and '?2' are placeholder names for fields not otherwise identified here.
    df.columns = ['ocr_variant', 'corpus_frequency', 'correction_candidate', '?1', 'ld', '?2', 'anahash']
    dfs.append(df)

# Stack all files into a single frame (each file keeps its own row index).
data = pd.concat(dfs)
data.shape

In [None]:
from ticclat.dbutils import bulk_add_wordforms

with session_scope(Session) as session:
    # Add wordforms we don't know yet. Both sides of every variant pair are
    # needed: the OCR variants and the correction candidates.
    all_forms = list(data['ocr_variant']) + list(data['correction_candidate'])
    # De-duplicate through a set before building the one-column frame.
    wfs = pd.DataFrame({'wordform': list(set(all_forms))})
    print(wfs.head())
    bulk_add_wordforms(session, wfs, preprocess_wfs=False)

In [None]:
from ticclat.dbutils import update_anahashes

# NOTE(review): machine-specific absolute path; adjust to your local data location.
alphabet_file = (
    "/Users/jvdzwaan/data/ticclat/ALPH/"
    "nld.aspell.dict.clip20.lc.LD3.charconfus.clip20.lc.chars"
)

# Make sure we have anahashes for all wordforms.
with session_scope(Session) as session:
    update_anahashes(session, alphabet_file)

In [None]:
# add linked lexicon (ignore anahash differences for now)

from ticclat.dbutils import add_lexicon_with_links

# Settings for the linked lexicon, gathered before the session is opened.
# NOTE(review): 'SDG' may be a typo for 'SGD' (the input directory is named
# 'SGD_ticcl_variants'); left unchanged because the name is stored at runtime.
name = 'SDG ticcl correction candidates'
vocabulary = False

# Links run from the OCR variant to its correction candidate.
from_column, from_correct = 'ocr_variant', False
to_column, to_correct = 'correction_candidate', True

preprocess_wfs = False
to_add = ['ld']  # extra column of `data` handed to add_lexicon_with_links

with session_scope(Session) as session:
    add_lexicon_with_links(
        session,
        name,
        vocabulary,
        data,
        from_column,
        to_column,
        from_correct,
        to_correct,
        preprocess_wfs=preprocess_wfs,
        to_add=to_add,
    )