In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Read information to connect to the database and put it in environment variables
import os
with open('ENVVARS.txt') as f:
    for line in f:
        # Split on the first '=' only, so that a value which itself contains
        # '=' (e.g. a password) is kept instead of being silently skipped.
        key, sep, value = line.partition('=')
        key = key.strip()
        # Lines without '=' (blank lines, notes) or without a key are ignored.
        if sep and key:
            os.environ[key] = value.strip()

# The lexicon is named after the database the wordforms are read from.
os.environ['lexicon_name'] = os.environ['dbname']

In [None]:
import MySQLdb
import textwrap

import pandas as pd


def do_query(query):
    """Run a SQL query on the local MySQL server and return the result.

    The connection is made to localhost:3306 with the credentials and
    database name taken from the ``user``, ``password`` and ``dbname``
    environment variables.

    Args:
        query: SQL statement passed to ``pandas.read_sql``.

    Returns:
        pandas.DataFrame with the query result. When several result columns
        share a name, only the first column with that name is kept.
    """
    cn = MySQLdb.connect(host='localhost',
                         port=3306,
                         user=os.environ.get('user'),
                         passwd=os.environ.get('password'),
                         db=os.environ.get('dbname'))
    try:
        df_mysql = pd.read_sql(query, con=cn)
    finally:
        # Release the connection even when the query itself fails.
        cn.close()
    # deduplicate columns
    return df_mysql.loc[:, ~df_mysql.columns.duplicated()]

# List the tables of the database named by the 'dbname' environment variable.
tables = do_query('SHOW TABLES;')
tables

In [None]:
# Load the complete wordforms table into a DataFrame and preview it.
wfs = do_query('SELECT * FROM wordforms;')
wfs.head()

In [None]:
# Compare the number of rows with the number of distinct wordform values;
# a difference means some wordforms occur more than once.
print(wfs.shape)
print(len(wfs['wordform'].unique()))

There are some duplicate wordforms. This is a violation of the uniqueness constraint on wordform. So, we need to filter the dataframe before adding it to the database.

In [None]:
# Keep one row per distinct wordform value (pandas keeps the first occurrence
# by default) and show the resulting shape.
wfs = wfs.drop_duplicates(subset='wordform')
wfs.shape

In [None]:
# Export the wordforms as a frequency list for ticcl: tab-separated
# "wordform<TAB>freq" lines without header or index, every frequency set to 1.
# The file is named after the lexicon.
wfs['freq'] = 1
wfs[['wordform', 'freq']].to_csv(os.environ['lexicon_name'],
                                 sep='\t',
                                 header=False,
                                 index=False)

In [None]:
# Point 'dbname' at the ticclat database. 'lexicon_name' was copied from the
# previous 'dbname' value earlier, so it still names the source database.
db_name = 'ticclat'
os.environ['dbname'] = db_name

In [None]:
from urllib.parse import quote

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy_utils import database_exists, create_database

# Percent-encode the credentials before placing them in the URL: characters
# such as '@', ':', '/' or '%' in a user name or password would otherwise be
# read as URL structure. The environment holds the raw values (do_query
# passes them to MySQLdb.connect unchanged), so encoding here is safe.
engine = create_engine("mysql://{}:{}@localhost/{}?charset=utf8mb4".format(
    quote(os.environ['user'], safe=''),
    quote(os.environ['password'], safe=''),
    os.environ['dbname']))
# Report whether the target database already exists.
print(database_exists(engine.url))

# Session factory bound to the engine above.
Session = sessionmaker(bind=engine)

In [None]:
from ticclat import Lexicon, Wordform, Anahash

In [None]:
# source: https://docs.sqlalchemy.org/en/latest/orm/session_basics.html
from contextlib import contextmanager

@contextmanager
def session_scope():
    """Yield a fresh session: commit on success, roll back on error, always close.

    Any exception raised inside the ``with`` body (or by the commit itself)
    triggers a rollback and is then re-raised to the caller.
    """
    db_session = Session()
    try:
        yield db_session
        db_session.commit()
    except BaseException:
        # Same reach as a bare ``except``: undo pending work, then propagate.
        db_session.rollback()
        raise
    finally:
        db_session.close()

In [None]:
# Insert one hard-coded wordform linked to a new Lexicon named after the
# lexicon; session_scope commits when the block exits without an error.
# NOTE(review): looks like a trial insert before the bulk load -- confirm.
# The bulk-load cell further down creates another Lexicon with the same name
# and inserts rows using the wordform_id values from wfs, so this row may
# collide with it if wfs contains id 528954.
with session_scope() as session:
    lex = Lexicon(lexicon_name=os.environ['lexicon_name'])
    wf = Wordform(wordform_id=528954, 
                  wordform='tuyld',
                  has_analysis=False,
                  wordform_lowercase='tuyld')
    # Only wf is added to the session explicitly; presumably the Lexicon is
    # saved through the wf.lexica relationship -- verify against the schema.
    wf.lexica.append(lex)
    session.add(wf)

In [None]:
# Report how many Wordform and Lexicon rows the database currently holds.
with session_scope() as session:
    print('number of wordforms:', session.query(Wordform).count())
    print('number of lexica:', session.query(Lexicon).count())

In [None]:
%%time

def create_wf(row, lex, session):
    """Create a Wordform from one row, link it to ``lex`` and add it to ``session``.

    Args:
        row: mapping-like object providing the keys 'wordform_id',
            'wordform' and 'wordform_lowercase'.
        lex: lexicon object appended to the new wordform's ``lexica``.
        session: session the new wordform is added to; nothing is
            committed here.
    """
    attributes = dict(wordform_id=row['wordform_id'],
                      wordform=row['wordform'],
                      has_analysis=False,
                      wordform_lowercase=row['wordform_lowercase'])
    new_wordform = Wordform(**attributes)
    new_wordform.lexica.append(lex)
    session.add(new_wordform)

# Bulk load: link every row of wfs to one new Lexicon and add it to the
# session; session_scope commits once when the block exits without an error
# and rolls everything back otherwise.
with session_scope() as session:
    lex = Lexicon(lexicon_name=os.environ['lexicon_name'])
    # We can't use apply, because apply calls the function twice for the first row, see
    # http://pandas.pydata.org/pandas-docs/stable/groupby.html#flexible-apply
    for idx, row in wfs.iterrows():
        create_wf(row, lex, session)

In [None]:
# This should give a single result!
# NOTE(review): presumably checks that the accented form is matched exactly
# and not conflated with other spellings by the database comparison -- confirm.
with session_scope() as session:
    data = session.query(Wordform).filter(Wordform.wordform == 'dóór').all()
    for wf in data:
        print(wf)

In [None]:
# Read the "<lexicon_name>.clean.list" file: tab-separated, no header row,
# one wordform and its hash per line.
hashes = pd.read_csv('{}.clean.list'.format(os.environ['lexicon_name']),
                     sep='\t',
                     header=None,
                     # Strings such as "NA", "null" or "nan" are valid wordforms;
                     # the default NA detection would replace them with NaN and
                     # they could then never be matched in the database.
                     keep_default_na=False)
hashes.columns = ['wordform', 'hash']
print(hashes.shape)
hashes.head()

In [None]:
%%time
# Attach an Anahash to every wordform from the hash list that exists in the
# database; wordforms that cannot be found are printed.
with session_scope() as session:
    # One Anahash object per distinct hash value: wordforms with the same
    # hash reuse it instead of each adding a duplicate row.
    # NOTE(review): assumes one Anahash row per hash value is intended --
    # confirm against the schema.
    anahashes = {}
    for idx, row in hashes.iterrows():
        wf = session.query(Wordform).filter(Wordform.wordform == row['wordform']).first()
        if wf is None:
            print(row['wordform'])
            continue
        h = anahashes.get(row['hash'])
        if h is None:
            h = Anahash(anahash=row['hash'])
            session.add(h)
            anahashes[row['hash']] = h
        wf.anahash = h

Many wordforms cannot be found in the database. Maybe the frequency list should not be cleaned?