In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded automatically before code is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import logging

# Layout of each log record: timestamp, process id in brackets, level name
# left-aligned to 8 characters, then "logger-name,line-number", a tab and the
# message text.
LOG_FORMAT = ("%(asctime)s [%(process)d] %(levelname)-8s "
              "%(name)s,%(lineno)s\t%(message)s")

logging.basicConfig(format=LOG_FORMAT)

# The threshold is set on the root logger directly, separately from the
# basicConfig() call above.
root_logger = logging.getLogger()
root_logger.setLevel('INFO')

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from tqdm import tqdm_notebook as tqdm

In [None]:
# Read the information needed to connect to the database from ENVVARS.txt and
# put it in environment variables. Each useful line has the form KEY=value.
import os

with open('ENVVARS.txt') as env_file:
    for line in env_file:
        entry = line.strip()
        # Ignore blank lines and '#' comment lines.
        if not entry or entry.startswith('#'):
            continue
        # Split on the FIRST '=' only: a value that itself contains '='
        # (for example a password or a base64 token) must be kept whole
        # rather than causing the whole line to be silently skipped.
        key, separator, value = entry.partition('=')
        key = key.strip()
        # Lines without '=' or without a key carry no setting.
        if separator and key:
            os.environ[key] = value.strip()

In [None]:
# Name of the database to work on. Other names that have been used here:
#   'ticclat_benchmark'
#   'ticclat'
db_name = 'ticclat_wikipedia'

# Publish the choice under the 'dbname' environment key, which is where the
# connection code in this notebook looks it up.
os.environ.update(dbname=db_name)

In [None]:
from urllib.parse import quote_plus

from sqlalchemy import create_engine

from sqlalchemy_utils import database_exists

from ticclat.ticclat_schema import Base

# Build the connection URL from the settings in the environment.
# The user name and password are percent-encoded first: characters such as
# '@', ':' or '/' inside a credential would otherwise be read as URL
# delimiters and corrupt the connection string.
# NOTE(review): this assumes ENVVARS.txt stores the raw (not already
# percent-encoded) credentials -- confirm.
db_url = "mysql://{}:{}@localhost/{}?charset=utf8mb4".format(
    quote_plus(os.environ['user']),
    quote_plus(os.environ['password']),
    os.environ['dbname'],
)
engine = create_engine(db_url)

# Report whether the target database exists.
print(database_exists(engine.url))

# create tables (those registered on Base.metadata)
Base.metadata.create_all(engine)

In [None]:
from ticclat.ticclat_schema import Lexicon, Wordform, Anahash, Document, Corpus, WordformLink, WordformLinkSource, lexical_source_wordform

from ticclat.dbutils import get_session, session_scope

# Obtain `Session` from get_session() using the user, password and database
# name stored in the environment; it is what session_scope() is given below.
Session = get_session(*(os.environ[key] for key in ('user', 'password', 'dbname')))

In [None]:
# Print the number of rows for each of the main mapped classes.
counted_models = (
    ('number of wordforms:', Wordform),
    ('number of lexica:', Lexicon),
    ('number of documents:', Document),
    ('number of corpora:', Corpus),
)

with session_scope(Session) as session:
    for label, model in counted_models:
        print(label, session.query(model).count())

In [None]:
%%time
# Select every Wordform row and gather the value of its 'wordform' field into
# the plain Python list `vocabulary`.
from sqlalchemy import select
from sqlalchemy.sql import func, distinct

from ticclat.ticclat_schema import (Lexicon, Wordform, Anahash, Document, Corpus,
                                    WordformLink, WordformLinkSource,
                                    lexical_source_wordform, corpusId_x_documentId,
                                    TextAttestation)

vocabulary = []

with session_scope(Session) as session:
    # All rows are fetched at once, then reduced to the wordform strings.
    wordform_rows = session.execute(select([Wordform])).fetchall()
    vocabulary.extend(row['wordform'] for row in wordform_rows)

In [None]:
# Show how many entries `vocabulary` currently holds.
print(len(vocabulary))

In [None]:
%%time
import pickle

# Write the `vocabulary` list to a pickle file in the working directory.
pickle_path = 'wikipedia_wordforms.pkl'
with open(pickle_path, 'wb') as pickle_file:
    pickle.dump(vocabulary, pickle_file)

In [None]:
%%time
import pickle

# Read `vocabulary` back from the pickle file in the working directory.
# NOTE: unpickling can execute arbitrary code, so only load a file that you
# produced yourself.
with open('wikipedia_wordforms.pkl', 'rb') as pickle_file:
    vocabulary = pickle.load(pickle_file)

In [None]:
%%time
# Create a benchmark database: ingest corpora, lexica and linked lexica built
# from the first `num_wordforms` vocabulary entries, then update the anahashes.
from ticclat.benchmark import ingest_corpora, ingest_lexica, ingest_linked_lexica
from ticclat.dbutils import update_anahashes

num_wordforms = 100000

alphabet_file = '/home/jvdzwaan/data/ticclat/ticcl/nld.aspell.dict.lc.chars'

# Settings for the generated corpora.
corpora_settings = dict(
    num_corpora=20,
    num_documents_min=50,
    num_documents_max=200,
    language='nl',
    year_min=2000,
    year_max=2005,
    num_tokens_min=25000,
    num_tokens_max=50000,
)

# Plain and linked lexica are generated with the same size settings.
lexica_settings = dict(num_lexica=10, num_wf_min=10000, num_wf_max=25000)

with session_scope(Session) as session:
    # Each call receives its own fresh slice of the vocabulary.
    ingest_corpora(session, vocabulary=vocabulary[:num_wordforms], **corpora_settings)
    ingest_lexica(session, vocabulary=vocabulary[:num_wordforms], **lexica_settings)
    ingest_linked_lexica(session, vocabulary=vocabulary[:num_wordforms], **lexica_settings)
    update_anahashes(session, alphabet_file)

In [None]:
# do queries
#
# for single wordforms:
# * Given a wordform, give word frequencies per year
# * Given a wordform, in what corpora does it occur, with what frequencies
# * Give me all word(forms) that are related to this word(form) -> what does related mean?
#
# aggregate over wordforms
# * wordforms that occur in at least two lexicons
# * wordforms that occur in at least two corpora
# * list of lexicons and number of wordforms in lexicon
# * list of corpora and number of wordforms in corpus

