In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import logging

# Configure the root logger: each record shows the timestamp, process id,
# left-aligned level name, logger name and line number, then a tab and the
# message.
logging.basicConfig(
    format="%(asctime)s [%(process)d] %(levelname)-8s %(name)s,%(lineno)s\t%(message)s",
)
# Set the level separately from basicConfig(), which does nothing when the
# root logger already has handlers attached.
logging.getLogger().setLevel(logging.DEBUG)

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from tqdm import tqdm_notebook as tqdm

In [None]:
# Read the database connection settings from ENVVARS.txt and export them as
# environment variables. Expected format: one KEY=VALUE pair per line; lines
# without '=' (blank lines, free text) and lines starting with '#' are ignored.
import os

with open('ENVVARS.txt') as env_file:
    for line in env_file:
        # Split on the FIRST '=' only, so a value that itself contains '='
        # (e.g. a password) is kept whole instead of being silently skipped.
        key, separator, value = line.partition('=')
        key = key.strip()
        # Skip lines with no '=', with an empty key (os.environ rejects it),
        # or that are commented out.
        if separator and key and not key.startswith('#'):
            os.environ[key] = value.strip()

In [None]:
# Choose the database this notebook connects to and publish the choice through
# the 'dbname' environment variable, which is read when the session is created.
# Alternative value: 'ticclat_test'.
db_name = 'ticclat'
os.environ['dbname'] = db_name

In [None]:
from ticclat.ticclat_schema import Lexicon, Wordform, Anahash, Document, Corpus, WordformLink, WordformLinkSource, lexical_source_wordform

from ticclat.dbutils import get_session, session_scope

# Create the object that is handed to session_scope() in the cells below,
# using the credentials and database name stored in the environment.
# The values are read inline so the password is not bound to a notebook global.
Session = get_session(
    os.environ['user'],
    os.environ['password'],
    os.environ['dbname'],
)

In [None]:
from ticclat.queries import wfs_min_num_lexica

# Run the wfs_min_num_lexica query inside a session scope and keep its result
# in `r` for the next cell.
# NOTE(review): `r` is iterated only after this `with` block has exited;
# confirm the result is still readable once the session scope is closed.
with session_scope(Session) as session:
    r = wfs_min_num_lexica(session)

In [None]:
# Print every row of the query result `r`, one row per line.
for row in r:
    print(row)

In [None]:
from ticclat.queries import count_unique_wfs_in_corpus

# Run count_unique_wfs_in_corpus for the 'SoNaR-500' corpus and keep the result
# in `r` (this overwrites the `r` of the earlier query).
# NOTE(review): `r.fetchall()` is called in the next cell, after this `with`
# block has exited; confirm the result is still readable at that point.
with session_scope(Session) as session:
    r = count_unique_wfs_in_corpus(session, corpus_name='SoNaR-500')

In [None]:
# Fetch all rows of the count query result and print them as one list.
print(r.fetchall())

In [None]:
from ticclat.queries import wordform_in_corpus_over_time

def wf_frequencies(session, wf, corpus_name):
    """Load the frequencies over time of one wordform in one corpus.

    Runs ``wordform_in_corpus_over_time`` and puts the returned rows into a
    DataFrame.

    Args:
        session: database session, passed through to the query function.
        wf: the wordform to look up.
        corpus_name: name of the corpus, passed through to the query function.

    Returns:
        pandas.DataFrame with the columns ``wordform_id``, ``wordform``,
        ``pub_year``, ``document_frequency`` and ``term_frequency``, sorted by
        ``pub_year``. ``term_frequency`` is cast to ``int``.
    """
    # Use the `wf` argument. Reading the notebook-global `word` here would
    # ignore the caller's argument and fail with NameError whenever that
    # global is not defined.
    result = wordform_in_corpus_over_time(session, wf=wf, corpus_name=corpus_name)

    records = list(result.fetchall())
    df = pd.DataFrame.from_records(
        records,
        columns=['wordform_id', 'wordform', 'pub_year',
                 'document_frequency', 'term_frequency'],
    )
    # sort_values returns a new frame; no in-place mutation is needed.
    df = df.sort_values(by=['pub_year'])
    df['term_frequency'] = df['term_frequency'].astype(int)

    return df


# Fetch the frequencies of `word` in `corpus_name` and plot term frequency and
# document frequency against publication year.
word = 'regeering'
corpus_name = 'Staten Generaal Digitaal'

with session_scope(Session) as session:
    df = wf_frequencies(session, word, corpus_name)

df.plot(
    x='pub_year',
    y=['term_frequency', 'document_frequency'],
    figsize=(10, 5),
    grid=True,
    title=f'\"{word}\" in {corpus_name}',
)

In [None]:
# Same plot as the previous cell for a second wordform; the frame is kept in
# `df2` so the earlier `df` stays available.
word = 'regering'
corpus_name = 'Staten Generaal Digitaal'

with session_scope(Session) as session:
    df2 = wf_frequencies(session, word, corpus_name)

df2.plot(
    x='pub_year',
    y=['term_frequency', 'document_frequency'],
    figsize=(10, 5),
    grid=True,
    title=f'\"{word}\" in {corpus_name}',
)

In [None]:
from ticclat.queries import wordform_in_corpora

# Query `word` with wordform_in_corpora and plot the result with the same
# column layout as the per-corpus cells above.
# NOTE(review): the query receives no corpus argument, yet the plot title still
# names `corpus_name` and the column labels are copied from the per-corpus
# query -- confirm both match what wordform_in_corpora actually returns.
word = 'regering'
corpus_name = 'Staten Generaal Digitaal'

with session_scope(Session) as session:
    r = wordform_in_corpora(session, wf=word)

# Materialise the rows once; `records` stays available for later inspection.
records = list(r)
df = pd.DataFrame.from_records(
    records,
    columns=['wordform_id', 'wordform', 'pub_year',
             'document_frequency', 'term_frequency'],
).sort_values(by=['pub_year'])
df['term_frequency'] = df['term_frequency'].astype(int)

df.plot(
    x='pub_year',
    y=['term_frequency', 'document_frequency'],
    figsize=(10, 5),
    grid=True,
    title=f'\"{word}\" in {corpus_name}',
)

In [None]:
# Rebuild the frame with pandas' default column labels to inspect the raw
# layout of the query rows.
# `records` already holds the rows read from `r` in the previous cell, so it is
# reused here instead of iterating `r` a second time. If `r` is a one-shot
# result, a second pass would yield no rows (or fail); if `r` can be iterated
# again, the outcome is identical.
df = pd.DataFrame.from_records(records)
df

In [None]:
# Display the current contents of `df` as the cell output.
df