In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before code is executed and edits to project sources are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import logging

# Root logger layout: timestamp, process id, level name padded to 8 characters,
# logger name and line number, then a tab and the message itself.
logging.basicConfig(
    format="%(asctime)s [%(process)d] %(levelname)-8s %(name)s,%(lineno)s\t%(message)s"
)

# The level is set on the root logger explicitly, so it also takes effect when
# basicConfig() is a no-op because handlers were already installed.
logging.getLogger().setLevel(logging.INFO)

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from tqdm import tqdm_notebook as tqdm

In [None]:
# Read the information needed to connect to the database from ENVVARS.txt
# (one KEY=VALUE setting per line) and put it in environment variables.
import os

with open('ENVVARS.txt') as f:
    for line in f:
        # Split on the first '=' only: values that contain '=' themselves
        # (e.g. passwords) must be kept intact rather than silently skipped.
        key, separator, value = line.partition('=')
        key = key.strip()

        # Ignore lines without a KEY=VALUE shape (blank lines, stray text)
        # and lines that are commented out with '#'.
        if not separator or not key or key.startswith('#'):
            continue

        os.environ[key] = value.strip()

In [None]:
# Select the database to work on. The name is exported through the 'dbname'
# environment variable, which is read again when the session is created.
db_name = 'ticclat_test'
# Alternative database; uncomment to use it instead of the test database.
#db_name = 'ticclat_wikipedia'
os.environ['dbname'] = db_name

In [None]:
from ticclat.ticclat_schema import Lexicon, Wordform, Anahash, Document, Corpus, WordformLink, WordformLinkSource, lexical_source_wordform

from ticclat.dbutils import get_session, session_scope

# Build the Session object from the user, password and database name found in
# the environment; it is handed to session_scope(...) in the cells below.
Session = get_session(os.environ['user'], os.environ['password'], os.environ['dbname'])

In [None]:
%%time
from sqlalchemy import select
from sqlalchemy import text
from sqlalchemy.sql import func, desc

# Number of wordforms to draw from the database
num_wordforms = 5

# Maps wordform -> wordform_id for the randomly selected wordforms
vocabulary = {}

# select random wordforms
with session_scope(Session) as session:
    random_wordforms = (
        select([Wordform])
        .order_by(func.random())
        .limit(num_wordforms)
    )

    rows = session.execute(random_wordforms).fetchall()
    vocabulary.update((row['wordform'], row['wordform_id']) for row in rows)

print(len(vocabulary))

In [None]:
# Exploratory cell: draw a random number of corpora and documents and print
# the first random document of every corpus.
from faker import Faker
fake = Faker()

num_corpora = np.random.randint(3, 10)
print('num corpora', num_corpora)

for c in range(num_corpora):
    # Use the loop variable `c`; the name `i` is not defined in this cell.
    corpus_name = 'Corpus {}'.format(c)
    num_documents = np.random.randint(10, 100)
    print('{}, {} documents'.format(corpus_name, num_documents))
    for d in range(num_documents):
        num_tokens = np.random.randint(100, 1000)
        print('Document {}, {} tokens'.format(d, num_tokens))

        # `vocabulary` maps wordform -> wordform_id. Pass only the words:
        # NOTE(review): depending on the Faker version a mapping is treated as
        # {word: weight} or rejected outright -- confirm against Faker docs.
        print(fake.words(nb=num_tokens, ext_word_list=list(vocabulary), unique=False))

        # Only show the first document of each corpus to keep the output short.
        break
            
        

In [None]:
# generator that produces documents for a corpus
from faker import Faker

def random_corpus(num_documents, num_tokens_min, num_tokens_max, vocabulary):
    """Generate random documents for a corpus.

    Args:
        num_documents: number of documents to yield.
        num_tokens_min: smallest number of tokens in a document (inclusive).
        num_tokens_max: upper bound on the number of tokens in a document;
            exclusive, because it is passed as `high` to np.random.randint.
        vocabulary: iterable of words to sample from. A mapping such as
            {wordform: wordform_id} is accepted; only its keys are used.

    Yields:
        One list of words per document; words may repeat (unique=False).
    """
    fake = Faker()

    # Hand Faker a plain list of words. NOTE(review): depending on the Faker
    # version a mapping is interpreted as {word: weight} or rejected, so the
    # wordform ids must not be passed along -- confirm against Faker docs.
    word_list = list(vocabulary)

    for _ in range(num_documents):
        num_tokens = np.random.randint(num_tokens_min, num_tokens_max)

        yield fake.words(nb=num_tokens, ext_word_list=word_list, unique=False)
        

# Try the generator: 10 documents of 10 to 14 tokens each (the upper bound of
# 15 is exclusive); print the number of tokens in every document.
for doc in random_corpus(10, 10, 15, vocabulary):
    print(len(doc))
        

In [None]:
def corpus_metadata(num_documents, language, year_min, year_max):
    """Create random metadata for the documents of a corpus.

    Args:
        num_documents: number of rows (documents) in the result.
        language: value stored in the 'language' column of every row.
        year_min: lowest possible publication year (inclusive).
        year_max: upper bound for the publication year; exclusive, because it
            is passed as `high` to np.random.randint.

    Returns:
        A pandas DataFrame with the columns 'language' and 'pub_year'.
    """
    return pd.DataFrame().assign(
        language=[language] * num_documents,
        pub_year=[np.random.randint(year_min, year_max)
                  for _ in range(num_documents)],
    )

# Example: metadata for 10 documents with language 'nl' and publication years
# from 2010 up to, but not including, 2020.
corpus_metadata(10, 'nl', 2010, 2020)

In [None]:
from ticclat.tokenize import terms_documents_matrix_word_lists

def generate_corpora(num_corpora, num_documents_min, num_documents_max,
                     language, year_min, year_max, num_tokens_min,
                     num_tokens_max, vocabulary):
    """Generate random corpora together with their document metadata.

    All `*_max` arguments are inclusive here: 1 is added before they are
    used as (exclusive) upper bounds for the random draws.

    Args:
        num_corpora: number of corpora to yield.
        num_documents_min, num_documents_max: range for the number of
            documents in a corpus.
        language: language stored in the metadata of every document.
        year_min, year_max: range for the publication year of a document.
        num_tokens_min, num_tokens_max: range for the number of tokens in a
            document.
        vocabulary: words the documents are sampled from.

    Yields:
        Tuples (corpus, vectorizer, metadata): the two values returned by
        terms_documents_matrix_word_lists for the random documents, followed
        by the metadata DataFrame produced by corpus_metadata.
    """
    for _ in range(num_corpora):
        n_docs = np.random.randint(num_documents_min, num_documents_max + 1)

        # Metadata is drawn before the documents, one row per document.
        metadata = corpus_metadata(n_docs, language, year_min, year_max + 1)

        documents = random_corpus(n_docs, num_tokens_min, num_tokens_max + 1,
                                  vocabulary)
        matrix, vectorizer = terms_documents_matrix_word_lists(documents)

        yield matrix, vectorizer, metadata
        

In [None]:
# Try the generator: 10 corpora of 1 to 5 documents with 1 to 5 tokens each;
# print the shape of the first value yielded for every corpus.
for c, v, m in generate_corpora(10, 1, 5, 'nl', 2010, 2015, 1, 5, vocabulary):
    print(c.shape)

In [None]:
from ticclat.sacoreutils import add_corpus_core

def ingest_corpora(session, num_corpora, num_documents_min, num_documents_max,
                   language, year_min, year_max, num_tokens_min, num_tokens_max, vocabulary):
    """Generate random corpora and add them to the database.

    Every corpus produced by generate_corpora is passed to add_corpus_core
    under the name 'Corpus <n>', where <n> counts up from 0.

    Args:
        session: database session handed on to add_corpus_core.
        num_corpora, num_documents_min, num_documents_max, language,
        year_min, year_max, num_tokens_min, num_tokens_max, vocabulary:
            passed on unchanged to generate_corpora.
    """
    corpora = generate_corpora(num_corpora, num_documents_min, num_documents_max,
                               language, year_min, year_max, num_tokens_min,
                               num_tokens_max, vocabulary)

    for index, (matrix, vectorizer, metadata) in enumerate(corpora):
        add_corpus_core(
            session,
            corpus_matrix=matrix,
            vectorizer=vectorizer,
            corpus_name=f'Corpus {index}',
            document_metadata=metadata,
        )

# Add 10 random corpora (1 to 5 documents of 1 to 5 tokens each, language 'nl',
# publication years 2010 to 2015) to the database.
with session_scope(Session) as session:
    ingest_corpora(session, 10, 1, 5, 'nl', 2010, 2015, 1, 5, vocabulary)

In [None]:
from ticclat.dbutils import add_lexicon

def generate_lexica(num_lexica, num_wf_min, num_wf_max, vocabulary):
    """Generate random lexica.

    Args:
        num_lexica: number of lexica to yield.
        num_wf_min: smallest number of wordforms in a lexicon (inclusive).
        num_wf_max: upper bound on the number of wordforms in a lexicon;
            exclusive, because it is passed as `high` to np.random.randint.
        vocabulary: iterable of words to sample from. A mapping such as
            {wordform: wordform_id} is accepted; only its keys are used.

    Yields:
        One pandas DataFrame per lexicon, with a single column 'wordform'
        holding words sampled with unique=True.
    """
    fake = Faker()

    # Hand Faker a plain list of words. NOTE(review): depending on the Faker
    # version a mapping is interpreted as {word: weight} or rejected, and
    # unique sampling may not accept a dict at all -- confirm against Faker
    # docs. Presumably num_wf must not exceed the vocabulary size either.
    word_list = list(vocabulary)

    for _ in range(num_lexica):
        num_wf = np.random.randint(num_wf_min, num_wf_max)

        wfs = pd.DataFrame()
        wfs['wordform'] = fake.words(nb=num_wf, ext_word_list=word_list, unique=True)

        yield wfs
        
def ingest_lexica(session, num_lexica, num_wf_min, num_wf_max, vocabulary):
    """Generate random lexica and add them to the database.

    Every lexicon produced by generate_lexica is passed to add_lexicon under
    the name 'Lexicon <n>', where <n> counts up from 0.

    Args:
        session: database session handed on to add_lexicon.
        num_lexica: number of lexica to add.
        num_wf_min: smallest number of wordforms in a lexicon.
        num_wf_max: largest number of wordforms in a lexicon; inclusive,
            1 is added before it is passed to generate_lexica.
        vocabulary: words the lexica are sampled from.
    """
    frames = generate_lexica(num_lexica, num_wf_min, num_wf_max+1, vocabulary)

    for index, wordforms in enumerate(frames):
        add_lexicon(
            session,
            lexicon_name=f'Lexicon {index}',
            vocabulary=True,
            wfs=wordforms,
        )

# Add 3 random lexica of 1 to 3 wordforms each to the database.
with session_scope(Session) as session:
    ingest_lexica(session, num_lexica=3, num_wf_min=1, num_wf_max=3, vocabulary=vocabulary)

In [None]:
def generate_linked_lexica(num_lexica, num_wf_min, num_wf_max, vocabulary):
    """Generate random lexica that consist of pairs of linked wordforms.

    Args:
        num_lexica: number of lexica to yield.
        num_wf_min: smallest number of wordforms drawn (inclusive).
        num_wf_max: upper bound on the number of wordforms drawn; exclusive,
            because it is passed as `high` to np.random.randint. An odd draw
            is rounded up to the next even number, so the number of wordforms
            in a lexicon can be equal to num_wf_max.
        vocabulary: iterable of words to sample from. A mapping such as
            {wordform: wordform_id} is accepted; only its keys are used.

    Yields:
        One pandas DataFrame per lexicon with the columns 'from' and 'to';
        the first half of the sampled words goes into 'from', the second half
        into 'to'.
    """
    fake = Faker()

    # Hand Faker a plain list of words. NOTE(review): depending on the Faker
    # version a mapping is interpreted as {word: weight} or rejected, and
    # unique sampling may not accept a dict at all -- confirm against Faker
    # docs. Presumably num_wf must not exceed the vocabulary size either.
    word_list = list(vocabulary)

    for _ in range(num_lexica):
        num_wf = np.random.randint(num_wf_min, num_wf_max)
        # An even number of words is needed to split them into pairs.
        if num_wf % 2 != 0:
            num_wf += 1
        print('num wordforms', num_wf)

        words = fake.words(nb=num_wf, ext_word_list=word_list, unique=True)

        # Integer division: slice indices must be integers, `num_wf / 2` is a
        # float and makes the slice raise a TypeError.
        half = num_wf // 2

        wfs = pd.DataFrame()
        wfs['from'] = words[:half]
        wfs['to'] = words[half:]
        yield wfs
        
def ingest_lexica(session, num_lexica, num_wf_min, num_wf_max, vocabulary):
    """Generate random linked lexica; adding them to the database is disabled.

    NOTE(review): this redefines the ingest_lexica function from an earlier
    cell of the notebook. The add_lexicon call is commented out, so the
    generated lexica are consumed but nothing is written through `session`.

    Args:
        session: database session; currently unused.
        num_lexica: number of lexica to generate.
        num_wf_min: smallest number of wordforms drawn for a lexicon.
        num_wf_max: 1 is added to this value before it is passed to
            generate_linked_lexica.
        vocabulary: words the lexica are sampled from.
    """
    linked_frames = generate_linked_lexica(num_lexica, num_wf_min, num_wf_max+1, vocabulary)

    for index, link_frame in enumerate(linked_frames):
        lexicon_name = f'Lexicon {index}'
        #add_lexicon(session, lexicon_name=lexicon_name, vocabulary=True, wfs=link_frame)

# Generate 3 random linked lexica. When the cells are run in order, the name
# ingest_lexica refers to the definition directly above, in which the call
# that adds a lexicon to the database is commented out.
with session_scope(Session) as session:
    ingest_lexica(session, num_lexica=3, num_wf_min=1, num_wf_max=3, vocabulary=vocabulary)

In [None]:
# Report how many rows the database holds for each of the main tables.
counted_tables = (
    ('number of wordforms:', Wordform),
    ('number of lexica:', Lexicon),
    ('number of documents:', Document),
    ('number of corpora:', Corpus),
)

with session_scope(Session) as session:
    for label, table in counted_tables:
        print(label, session.query(table).count())