In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to local .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

## To do

* Add relationships between models (should make processing an XML file faster)
* Use sessions better: https://docs.sqlalchemy.org/en/latest/orm/session_basics.html#when-do-i-construct-a-session-when-do-i-commit-it-and-when-do-i-close-it
* Add multiple documents
* Extract vocabulary

In [None]:
# Read information to connect to the database and put it in environment variables.
# ENVVARS.txt is expected to contain one KEY=VALUE pair per line; lines
# without an '=' (or without a key) are ignored.
import os
with open('ENVVARS.txt') as f:
    for line in f:
        # Split on the FIRST '=' only, so values that themselves contain '='
        # (e.g. passwords or tokens) are kept intact instead of being skipped.
        key, sep, value = line.partition('=')
        key = key.strip()
        if sep and key:
            os.environ[key] = value.strip()

In [None]:
# Name of the database this notebook works on; it is exported as the
# 'dbname' environment variable so the connection URL can pick it up.
db_name = 'lexicon_test'
os.environ.update(dbname=db_name)

In [None]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy_utils import database_exists, create_database

from urllib.parse import quote_plus

# Build the connection URL from the environment variables set above.
# User name and password are percent-encoded, because characters such as
# '@', ':' or '/' in a credential would otherwise be parsed as URL syntax.
engine = create_engine("mysql://{}:{}@localhost/{}".format(quote_plus(os.environ['user']),
                                                           quote_plus(os.environ['password']),
                                                           os.environ['dbname']))

# Create the database on first use
if not database_exists(engine.url):
    create_database(engine.url)

print(database_exists(engine.url))

# Session factory bound to the engine; call Session() to get a new session
Session = sessionmaker(bind=engine)

In [None]:
from lexicon_schema import AnalyzedWordform, Document, Lemmata, TokenAttestation, Wordform, Base

In [None]:
# Create the tables registered on Base's metadata (the models imported from
# lexicon_schema) in the database the engine is connected to.
Base.metadata.create_all(engine)

In [None]:
from sqlalchemy import inspect

# Inspect the live database and list the tables that currently exist
inspector = inspect(engine)
table_names = inspector.get_table_names()
print(table_names)

In [None]:
%%time
import pandas as pd
from lxml import etree
from tqdm import tqdm
import re

def process_folia(fname, session):
    """Store the words of one FoLiA XML document in the lexicon database.

    The document id is read from the ``xml:id`` attribute of the ``FoLiA``
    element and stored as a ``Document``. Then, for every ``w`` element whose
    ``class`` attribute is ``"WORD"``, the text (``t``), part of speech
    (``head`` of ``pos``) and lemma (``class`` of ``lemma``) are extracted.
    The ``Wordform`` and ``Lemmata`` rows are looked up and inserted when
    missing, and a new ``AnalyzedWordform`` and ``TokenAttestation`` are
    added for the token.

    Words that lack a text, part of speech or lemma are skipped, so that
    values from the previously processed word are never reused by accident.

    Args:
        fname: Path of the FoLiA XML file to read.
        session: SQLAlchemy session used for all queries and inserts. It is
            committed by this function, but not closed.

    Raises:
        ValueError: If the file contains no ``FoLiA`` element.
        KeyError: If the ``FoLiA`` element has no ``xml:id`` attribute, or if
            a ``pos``/``lemma`` element lacks its ``head``/``class`` attribute.
    """
    folia_ns = '{http://ilk.uvt.nl/folia}'
    xml_id_attr = '{http://www.w3.org/XML/1998/namespace}id'

    # Extract document properties and insert into database (store document id)
    doc_id = None
    context = etree.iterparse(fname, events=('start', ), tag=(folia_ns + 'FoLiA'))
    for event, elem in context:
        doc_id = elem.attrib[xml_id_attr]
        break  # only the first FoLiA element is needed
    if doc_id is None:
        raise ValueError('No FoLiA element found in {}'.format(fname))

    doc = Document(doc_id)
    session.add(doc)
    session.commit()
    # Read the id once: every commit expires `doc`, so reading the attribute
    # inside the loop would trigger an extra SELECT for each word.
    document_id = doc.document_id

    # Extract the words
    context = etree.iterparse(fname, events=('end', ), tag=(folia_ns + 'w'))
    for event, elem in tqdm(context):
        if elem.attrib.get('class') == "WORD":
            # Reset per word so a missing child never inherits the value of
            # the previous word.
            wordform = postag = lemma = None
            for child in elem:
                tag = child.tag
                if tag == folia_ns + 't':
                    wordform = child.text
                elif tag == folia_ns + 'pos':
                    postag = child.attrib['head']
                elif tag == folia_ns + 'lemma':
                    lemma = child.attrib['class']

            if None not in (wordform, postag, lemma):
                # add wordform if necessary
                wf = session.query(Wordform).filter(Wordform.wordform == wordform).first()
                if wf is None:
                    wf = Wordform(wordform)
                    session.add(wf)
                    # flush sends the INSERT (making the generated id
                    # available) without ending the transaction
                    session.flush()
                wf_id = wf.wordform_id

                # add lemma if necessary
                lm = session.query(Lemmata).filter(Lemmata.modern_lemma == lemma,
                                                   Lemmata.lemma_part_of_speech == postag).first()
                if lm is None:
                    lm = Lemmata(lemma, postag)
                    session.add(lm)
                    session.flush()
                lm_id = lm.lemma_id

                # add analyzed_wordform
                awf = AnalyzedWordform(postag, lm_id, wf_id)
                session.add(awf)
                session.flush()

                # add token_attestation (must be added to the session,
                # otherwise it is never written to the database)
                ta = TokenAttestation(awf.analyzed_wordform_id, document_id)
                session.add(ta)

        # make iteration over context fast and consume less memory
        #https://www.ibm.com/developerworks/xml/library/x-hiperfparse
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

        # one commit per <w> element persists everything added for this word
        session.commit()

# Load a single FoLiA file into the database. The session is rolled back on
# failure and always closed, so an exception in process_folia cannot leave a
# dangling connection or a half-finished transaction behind.
# NOTE(review): the input path is an absolute path on one machine — consider
# moving it to a configurable variable.
session = Session()
try:
    process_folia('/home/jvdzwaan/data/embem/folia/original/alew001besl01_01.xml', session)
    session.commit()
except Exception:
    session.rollback()
    raise
finally:
    session.close()

In [None]:
def purge_db():
    """Delete all rows from the lexicon tables, keeping the tables themselves.

    Rows are removed from the referencing tables first (token attestations
    point at analyzed wordforms and documents; analyzed wordforms point at
    lemmata and wordforms), so the order stays valid if foreign key
    constraints are defined on the schema.

    All deletes run in one transaction: on any error the transaction is
    rolled back and the exception re-raised. The session is always closed.
    """
    session = Session()
    try:
        for model in (TokenAttestation, AnalyzedWordform, Document, Lemmata, Wordform):
            session.query(model).delete()
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
        
# Destructive: empties all lexicon tables in the connected database
purge_db()

In [None]:
from sqlalchemy_utils.functions import drop_database

# Destructive: drops the database the engine points at.
# NOTE(review): the cells below still open sessions on this engine, so they
# presumably fail when the notebook is run top to bottom — confirm the
# intended execution order.
drop_database(engine.url)

In [None]:
# Insert a single example wordform. Roll back if the commit fails and always
# close the session so the connection is released.
session = Session()
try:
    wf = Wordform('Opdragt')
    session.add(wf)
    session.commit()
except Exception:
    session.rollback()
    raise
finally:
    session.close()

In [None]:
# Look up the wordform "van"; `res` is None when no row matches.
# The session is closed even if the query raises.
session = Session()
try:
    res = session.query(Wordform).filter(Wordform.wordform=="van").first()
finally:
    session.close()
print(res)

In [None]:
# Uses `res` from the previous cell; raises AttributeError when that query
# found no row (res is None).
print(res.wordform_id, res.wordform_lowercase)

In [None]:
# Destructive: delete every row of the Wordform table and commit
session = Session()
res = session.query(Wordform).delete()
session.commit()
session.close()
# NOTE(review): `res` is presumably the number of deleted rows (the return
# value of Query.delete()) — confirm.
print(res)

In [None]:
# Look up a wordform by primary key and print its text. first() yields None
# when no row has this id, so guard the attribute access instead of failing
# with AttributeError. The session is closed even if the query raises.
session = Session()
try:
    res = session.query(Wordform).filter(Wordform.wordform_id==8786).first()
finally:
    session.close()
print(res.wordform if res is not None else None)

In [None]:
# Load the complete analyzed-wordform table into a DataFrame and display it
session = Session()
awf_query = session.query(AnalyzedWordform)
res = pd.read_sql(awf_query.statement, session.bind)
session.close()
res