In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from tqdm import tqdm_notebook as tqdm

In [None]:
# Read information to connect to the database and put it in environment variables
import os

# Each useful line of ENVVARS.txt has the form KEY=VALUE. The line is split on
# the FIRST '=' only, so values that themselves contain '=' (for example a
# password) are kept intact instead of being silently skipped.
with open('../ENVVARS.txt') as f:
    for line in f:
        key, sep, value = line.partition('=')
        key = key.strip()
        # Ignore lines without '=' (blank lines, stray text) and empty keys.
        if sep and key:
            os.environ[key] = value.strip()

In [None]:
# Name of the database to connect to; published through the environment so
# that it overrides any 'dbname' value already set there.
db_name = 'ticclat'
os.environ.update(dbname=db_name)

In [None]:
from ticclat.ticclat_schema import Lexicon, Wordform, Anahash

from ticclat.dbutils import get_session, session_scope

# Build the Session object from the connection settings held in the
# environment (user, password, database name, in that order). It is handed to
# session_scope(...) wherever the notebook talks to the database.
Session = get_session(*(os.environ[key] for key in ('user', 'password', 'dbname')))

In [None]:
%%time

import textacy

wiki = '/home/jvdzwaan/data/tmp/nlwiki-10'

c = textacy.Corpus(textacy.load_spacy('nl', disable=('parser', 'tagger')),
               texts=textacy.io.read_text(wiki, lines=True))

In [None]:
# Display the corpus object as the cell output.
c

In [None]:
%%time

vectorizer = textacy.Vectorizer()
doc_term_matrix = vectorizer.fit_transform(
    (doc.to_terms_list(ngrams=1, named_entities=False, as_strings=True)for doc in c))

In [None]:
# Display the matrix returned by vectorizer.fit_transform.
doc_term_matrix

In [None]:
# Peek at the first five entries of the vectorizer's term list.
vectorizer.terms_list[:5]

In [None]:
# Number of entries in the vectorizer's term list.
print(len(vectorizer.terms_list))

In [None]:
# One row per term from the vectorizer. 'has_analysis' is initialised to
# False for every wordform.
wfs = pd.DataFrame({'wordform': vectorizer.terms_list}).assign(has_analysis=False)
wfs.head()

In [None]:
# True when no value occurs more than once in the 'wordform' column.
wfs['wordform'].is_unique

In [None]:
# Length of every wordform, computed with the vectorised string accessor
# rather than a Python-level call per row (DataFrame.apply with axis=1 is
# very slow on a large vocabulary).
wfs['len'] = wfs['wordform'].str.len()

# Length of the longest wordform.
wfs['len'].max()

In [None]:
# Histogram of the 'len' column (50 bins, 15x8 figure).
wfs['len'].hist(bins=50, figsize=(15,8))

In [None]:
# Report how many Wordform and Lexicon rows the database currently holds.
with session_scope(Session) as session:
    for label, model in (('number of wordforms:', Wordform),
                         ('number of lexica:', Lexicon)):
        print(label, session.query(model).count())

In [None]:
from ticclat.dbutils import bulk_add_wordforms

# Hand the wordform frame to bulk_add_wordforms inside a session scope.
# NOTE(review): the third argument (1000) is presumably a batch size -
# confirm against ticclat.dbutils.bulk_add_wordforms.
with session_scope(Session) as session:
    bulk_add_wordforms(session, wfs, 1000)

In [None]:
%%time

from ticclat.ticclat_schema import Corpus, Document

# we know all wordforms in de documents are in the database.
# Now we can add a document

with session_scope(Session) as session:
    # Create corpus
    #corpus = Corpus(name='nlwiki-20190201-1000')
    #session.add(corpus)
    
    # select wordforms from document
    


In [None]:
# Shape of the document-term matrix (rows, columns).
print(doc_term_matrix.shape)

In [None]:
%%time

from ticclat.ticclat_schema import Document, TextAttestation

n = 1
prev_i = 0
words = []
freqs = {}

cx = doc_term_matrix.tocoo()    
for i,j,v in zip(cx.row, cx.col, cx.data):
    wf = vectorizer.id_to_term[j]
    words.append(wf)
    freqs[wf] = v
    if i != prev_i:
        prev_i = i
        n += 1
        
        with session_scope(Session) as session:
            q = session.query(Wordform)
            result = q.filter(Wordform.wordform.in_(words)).all()
                        
            d = Document(word_count=sum(freqs.values()), pub_year=2019, language='nl')
            session.add(d)
            for wf in result:
                #print(wf.wordform, freqs[wf.wordform])
                ta = TextAttestation(ta_document=d, ta_wordform=wf, frequency=freqs[wf.wordform])
                session.add(ta)
        
        words = []
        freqs = {}
        print('added', str(d))

print(n)

In [None]:
# Term frequencies computed from the document-term matrix (displayed, not stored).
textacy.vsm.matrix_utils.get_term_freqs(doc_term_matrix)

In [None]:
# Document lengths computed from the document-term matrix; print the shape
# of the result.
dlen = textacy.vsm.matrix_utils.get_doc_lengths(doc_term_matrix)
print(dlen.shape)