In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm

In [None]:
# Read the database connection settings from ../ENVVARS.txt and export them
# as environment variables. Every line of the form KEY=VALUE becomes
# os.environ[KEY] = VALUE; lines without an '=' are ignored.
import os

with open('../ENVVARS.txt') as f:
    for line in f:
        # Split on the FIRST '=' only, so a value that itself contains '='
        # (for example a password) is kept intact instead of being skipped.
        key, separator, value = line.partition('=')
        key = key.strip()
        # Skip lines without '=' and lines with an empty key (an empty
        # environment variable name is not valid).
        if separator and key:
            os.environ[key] = value.strip()

# The lexicon name is taken from the database name found in the settings
# file; this raises KeyError when the file does not define 'dbname'.
os.environ['lexicon_name'] = os.environ['dbname']

In [None]:
# Select the database to work on; this replaces any 'dbname' value that was
# placed in the environment earlier.
db_name = 'ticclat'
os.environ.update({'dbname': db_name})

In [None]:
from ticclat.ticclat import Lexicon, Wordform, Anahash

from ticclat.dbutils import get_session, session_scope

# Create the `Session` object that is handed to `session_scope` in the cells
# below. The three positional arguments are looked up in the environment in
# this order: user, password, dbname.
Session = get_session(*(os.environ[key] for key in ('user', 'password', 'dbname')))

In [None]:
from ticclat.dbutils import get_word_frequency_df

# Name of the tab-separated file written at the end of this cell; a later cell
# passes it to TICCL_anahash.
hash_file = 'vocabulary'

# Fetch the word frequency data while a database session is open.
with session_scope(Session) as session:
    df = get_word_frequency_df(session)

print(df.head())

# Write the frame without a header row, using tabs as field separator.
df.to_csv(hash_file, header=False, sep='\t')

In [None]:
import sh

# File passed to TICCL_anahash through its --alph option.
# NOTE(review): this is an absolute, machine-specific path; it has to be
# adjusted when the notebook is run elsewhere.
alphabet_file = '/home/jvdzwaan/data/ticclat/ticcl/nld.aspell.dict.lc.chars'

# Run the external TICCL_anahash command on the vocabulary file and show
# whatever it returned.
anahash_args = ['--list', '--alph', alphabet_file, hash_file]
res = sh.TICCL_anahash(anahash_args)

print(res)

In [None]:
# Load the tab-separated 'vocabulary.list' file (presumably the output of the
# TICCL_anahash run above; verify) into a DataFrame. The first column becomes
# the index and the value column is named 'anahash'.
anahashes_file = 'vocabulary.list'

anahashes = pd.read_csv(
    anahashes_file,
    sep='\t',
    header=None,
    names=['anahash'],
    index_col=0,
    keep_default_na=False,  # make sure word 'null' is read as string and not NaN
)
anahashes.head()

In [None]:
%%time

from ticclat.dbutils import get_word_frequency_df
from ticclat.utils import anahash_df

with session_scope(Session) as session:
    df = get_word_frequency_df(session)

print(df.head())
print(df.shape)
ah = anahash_df(df, alphabet_file)
print(ah.shape)
ah.head()

In [None]:
%%time

from ticclat.dbutils import bulk_add_anahashes

with session_scope(Session) as session:
    total = bulk_add_anahashes(session, ah)
print('num of anahashes added:', total)

In [None]:
%%time

from ticclat.dbutils import connect_anahases_to_wordforms

with session_scope(Session) as session:
    total = connect_anahases_to_wordforms(session, anahashes)
print('num of wordforms connected to anahashes:', total)