In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm

In [None]:
# Read information to connect to the database and put it in environment variables.
# ../ENVVARS.txt is expected to contain one KEY=VALUE pair per line; lines
# without an '=' are ignored.
import os

with open('../ENVVARS.txt') as f:
    for line in f:
        # Split on the first '=' only, so that a value which itself contains
        # '=' (e.g. a password) is kept instead of the whole line being skipped.
        key, sep, value = line.partition('=')
        key = key.strip()
        if sep and key:
            os.environ[key] = value.strip()

# The lexicon gets the same name as the database the settings file points to.
os.environ['lexicon_name'] = os.environ['dbname']

In [None]:
import MySQLdb
import textwrap

import pandas as pd


def do_query(query):
    """Run a SQL query on the local MySQL server and return the result.

    The user, password and database name are read from the environment
    variables ``user``, ``password`` and ``dbname``.

    Args:
        query: The SQL statement to execute.

    Returns:
        A pandas DataFrame with the query result. If several result columns
        share a name, only the first of them is kept.
    """
    cn = MySQLdb.connect(host='localhost',
                         port=3306,
                         user=os.environ.get('user'),
                         passwd=os.environ.get('password'),
                         db=os.environ.get('dbname'))
    try:
        df_mysql = pd.read_sql(query, con=cn)
    finally:
        # Always release the connection, also when the query fails.
        cn.close()
    # deduplicate columns
    return df_mysql.loc[:, ~df_mysql.columns.duplicated()]

# List the tables of the database that `dbname` currently points to.
tables = do_query(query='SHOW TABLES;')
tables

In [None]:
# Load the complete wordforms table into a dataframe and preview the first rows.
wfs = do_query(query='SELECT * FROM wordforms;')
wfs.head(n=5)

In [None]:
# Dataframe dimensions, followed by the number of distinct wordforms.
print(wfs.shape, len(pd.unique(wfs['wordform'])), sep='\n')

There are some duplicate wordforms, which violates the uniqueness constraint on `wordform`. We therefore need to deduplicate the dataframe before adding it to the database.

Also, `has_analysis` is explicitly set to `False`, because we might be using `bulk_save_objects`, which doesn't set defaults.

In [None]:
# Keep one row per wordform and give every row an explicit has_analysis value.
wfs = (
    wfs
    .drop_duplicates(subset='wordform')
    .assign(has_analysis=False)
)
wfs.shape

In [None]:
# save as frequency list for ticcl
# Every wordform gets frequency 1. The file is tab-separated, without header
# or index, and is named after the lexicon.
wfs['freq'] = 1
wfs.to_csv(os.environ['lexicon_name'], sep='\t', header=False, index=False, columns=['wordform', 'freq'])
# Preview last: only the final expression of a cell is displayed.
wfs.head()

In [None]:
# From here on `dbname` refers to the 'ticclat' database.
db_name = os.environ['dbname'] = 'ticclat'

In [None]:
from ticclat.ticclat_schema import Lexicon, Wordform, Anahash

from ticclat.dbutils import get_session, session_scope

In [None]:
# Create the Session object that is passed to session_scope() in the cells below,
# using the credentials and database name stored in the environment.
Session = get_session(*(os.environ[key] for key in ('user', 'password', 'dbname')))

In [None]:
# Experiment: add a single wordform (with a fixed id) that belongs to a new lexicon.
with session_scope(Session) as session:
    wf = Wordform(
        wordform_id=528954,
        wordform='tuyld',
        wordform_lowercase='tuyld',
        has_analysis=False,
    )
    lex = Lexicon(lexicon_name=os.environ['lexicon_name'])
    wf.lexica.append(lex)
    session.add(wf)

In [None]:
# Experiment: add a single Anahash and a single Wordform ('uit') that refers to it.
with session_scope(Session) as session:
    # presumably 46901904807 is the anagram hash value of 'uit' -- TODO confirm
    a = Anahash(anahash=46901904807)
    session.add(a)
    wf = Wordform(wordform='uit',
                  has_analysis=False,
                  wordform_lowercase='uit')
    # Link the wordform to the anahash before adding the wordform to the session.
    wf.anahash = a
    session.add(wf)
    

In [None]:
# Report how many wordforms and lexica are stored in the database.
with session_scope(Session) as session:
    for label, model in (('number of wordforms:', Wordform),
                         ('number of lexica:', Lexicon)):
        print(label, session.query(model).count())

In [None]:
%%time
from ticclat.dbutils import get_or_create_wordform

with session_scope(Session) as session:
    # Add the wordforms one at a time.
    # We can't use apply, because apply calls the function twice for the first row, see
    # http://pandas.pydata.org/pandas-docs/stable/groupby.html#flexible-apply
    # Only the 'wordform' column is needed, so iterate over that column directly
    # instead of building a Series for every row with iterrows().
    for wordform in tqdm(wfs['wordform'], total=wfs.shape[0]):
        get_or_create_wordform(session, wordform, has_analysis=False)

In [None]:
%%time
from ticclat.dbutils import bulk_add_wordforms

# Add all wordforms in the dataframe with the bulk helper and report the number
# it returns. (num=10000 is presumably the batch size -- TODO confirm in
# ticclat.dbutils.)
with session_scope(Session) as session:
    n = bulk_add_wordforms(session, wfs, num=10000)
print('added {} wordforms'.format(n))

In [None]:
%%time
from ticclat.dbutils import add_lexicon

# Add the wordforms dataframe as a lexicon, named by the `lexicon_name`
# environment variable.
with session_scope(Session) as session:
    add_lexicon(session, os.environ['lexicon_name'], wfs)

In [None]:
# This should give a single result!
# Look up one wordform containing accented characters and print every match.
with session_scope(Session) as session:
    data = session.query(Wordform).filter_by(wordform='dóór').all()
    for wf in data:
        print(wf)

In [None]:
from ticclat.dbutils import get_word_frequency_df

# Fetch the word frequency dataframe and print its first rows.
with session_scope(Session) as session:
    df = get_word_frequency_df(session)
print(df.head())

In [None]:
%%time
# Read the '<lexicon_name>.clean.list' file: tab-separated, no header row,
# with one (wordform, anahash) pair per line.
hashes = pd.read_csv(
    '{}.clean.list'.format(os.environ['lexicon_name']),
    sep='\t',
    header=None,
    # make sure word 'null' is read as string and not NaN
    keep_default_na=False,
)
hashes.columns = ['wordform', 'anahash']
# verify_integrity raises if the same wordform occurs more than once.
hashes = hashes.set_index(keys='wordform', verify_integrity=True)
print(hashes.shape)
hashes.head(n=5)

In [None]:
%%time
from ticclat.dbutils import bulk_add_anahashes

# Add the anahash values from the `hashes` dataframe and print the value the
# helper returns (presumably the number of anahashes added -- TODO confirm).
with session_scope(Session) as session:
    total = bulk_add_anahashes(session, hashes)
print(total)

In [None]:
%%time
from ticclat.dbutils import connect_anahases_to_wordforms

# Link the stored wordforms to their anahashes using the `hashes` dataframe and
# print the value the helper returns (presumably the number of wordforms
# updated -- TODO confirm).
with session_scope(Session) as session:
    total = connect_anahases_to_wordforms(session, hashes)
print(total)

In [None]:
# Count the wordforms that have no anahash yet.
# The count is done in the database, so the Wordform objects are not all loaded
# just to take their number. The result gets its own name so the `wfs`
# dataframe used by the cells above is not overwritten.
with session_scope(Session) as session:
    n_without_anahash = (
        session.query(Wordform)
        .filter(Wordform.anahash_id.is_(None))
        .count()
    )
print(n_without_anahash)

In [None]:
from ticclat.dbutils import get_word_frequency_df

# Fetch the word frequency dataframe again; print its first rows and its dimensions.
with session_scope(Session) as session:
    df = get_word_frequency_df(session)
print(df.head(), df.shape, sep='\n')

Many wordforms have not been assigned an anahash value. Maybe the frequency list should not be cleaned (the anahashes above were read from the `.clean.list` file)?