In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Read the database connection settings from ENVVARS.txt (one KEY=VALUE pair
# per line) and export them as environment variables, so that later cells can
# look them up through os.environ.
import os

with open('ENVVARS.txt') as f:
    for line in f:
        # Split on the FIRST '=' only: a value that itself contains '='
        # (e.g. a password) must be kept whole instead of the whole line
        # being silently skipped.
        key, sep, value = line.partition('=')
        # Surrounding whitespace is not part of the variable name.
        key = key.strip()
        # Lines without '=' (blank lines, stray text) or without a key are ignored.
        if sep and key:
            os.environ[key] = value.strip()

In [None]:
# Name of the database used by this notebook; it is also published through the
# 'dbname' environment variable, which the following cells read.
db_name = os.environ['dbname'] = 'ticclat_test'

In [None]:
from ticclat.dbutils import create_ticclat_database

# Create the database named in the 'dbname' environment variable, using the
# credentials from the environment.
# NOTE(review): delete_existing=True presumably drops a database of the same
# name first -- confirm in ticclat.dbutils before pointing this at real data.
create_ticclat_database(
    delete_existing=True,
    dbname=os.environ['dbname'],
    user=os.environ['user'],
    passwd=os.environ['password'],
)

In [None]:
from ticclat.ticclat_schema import Lexicon, Wordform, Anahash

from ticclat.dbutils import get_session, session_scope

# Object handed to session_scope(...) in the cells below; it is built from the
# user, password and database name stored in the environment.
Session = get_session(
    os.environ['user'],
    os.environ['password'],
    os.environ['dbname'],
)

In [None]:
# add two lexicons

from ticclat.dbutils import add_lexicon

# Two small vocabularies that overlap in 'wf2' and 'wf3'; each one is a
# single-column ('wordform') DataFrame.
name1, name2 = 'l1', 'l2'
wfs1 = pd.DataFrame({'wordform': ['wf1', 'wf2', 'wf3']})
wfs2 = pd.DataFrame({'wordform': ['wf2', 'wf3', 'wf4']})

# Both lexicons are added inside one session_scope block.
with session_scope(Session) as session:
    lex1 = add_lexicon(session, lexicon_name=name1, vocabulary=True, wfs=wfs1)
    lex2 = add_lexicon(session, lexicon_name=name2, vocabulary=True, wfs=wfs2)

In [None]:
# add a corpus
from ticclat.tokenize import terms_documents_matrix_counters
from ticclat.sacoreutils import add_corpus_core

name = 'corpus1'

# Three documents, each given as a list of strings.
documents = [
    ['wf1', 'wf2'],
    ['wf2', 'wf3'],
    ['wf4', 'wf5', 'wf6'],
]

# Show the matrix shape and the fitted vocabulary as a quick sanity check.
corpus_matrix, vectorizer = terms_documents_matrix_counters(documents)
print(corpus_matrix.shape, vectorizer.vocabulary_, sep='\n')

# One metadata row per document.
# NOTE(review): presumably matched to `documents` by position -- confirm
# against add_corpus_core.
metadata = pd.DataFrame({
    'title': ['doc1', 'doc2', 'doc3'],
    'pub_year': [2018, 2011, 2019],
})

with session_scope(Session) as session:
    add_corpus_core(session, corpus_matrix, vectorizer, name, metadata)
    

In [None]:
# add another corpus
from ticclat.tokenize import terms_documents_matrix_counters
from ticclat.sacoreutils import add_corpus_core

name = 'corpus2'

# Two documents, each given as a list of strings.
documents = [
    ['wf2', 'wf5'],
    ['wf4', 'wf5', 'wf6'],
]

# Show the matrix shape and the fitted vocabulary as a quick sanity check.
corpus_matrix, vectorizer = terms_documents_matrix_counters(documents)
print(corpus_matrix.shape, vectorizer.vocabulary_, sep='\n')

# One metadata row per document.
# NOTE(review): presumably matched to `documents` by position -- confirm
# against add_corpus_core.
metadata = pd.DataFrame({
    'title': ['doc4', 'doc5'],
    'pub_year': [2002, 2011],
})

with session_scope(Session) as session:
    add_corpus_core(session, corpus_matrix, vectorizer, name, metadata)

In [None]:
# add another corpus
from ticclat.tokenize import terms_documents_matrix_counters
from ticclat.sacoreutils import add_corpus_core

name = 'corpus3'

# Three documents, each given as a list of strings.
documents = [
    ['wf2', 'wf5'],
    ['wf2', 'wf3', 'wf6'],
    ['wf2'],
]

# Show the matrix shape and the fitted vocabulary as a quick sanity check.
corpus_matrix, vectorizer = terms_documents_matrix_counters(documents)
print(corpus_matrix.shape, vectorizer.vocabulary_, sep='\n')

# One metadata row per document.
# NOTE(review): presumably matched to `documents` by position -- confirm
# against add_corpus_core.
metadata = pd.DataFrame({
    'title': ['doc6', 'doc7', 'doc8'],
    'pub_year': [2002, 2011, 2018],
})

with session_scope(Session) as session:
    add_corpus_core(session, corpus_matrix, vectorizer, name, metadata)

In [None]:
from ticclat.ticclat_schema import Lexicon, Wordform, Anahash, Corpus, Document, TextAttestation

# Print the number of rows the queries return for each schema class, as a
# summary of what the cells above inserted.
with session_scope(Session) as session:
    for label, model in (
        ('number of wordforms:', Wordform),
        ('number of lexica:', Lexicon),
        ('number of corpora:', Corpus),
        ('number of documents:', Document),
        ('number of text attestations:', TextAttestation),
    ):
        print(label, session.query(model).count())