In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import logging

# Log line layout: timestamp, process id, padded level name, logger name and
# line number, then a tab followed by the message.
LOG_FORMAT = "%(asctime)s [%(process)d] %(levelname)-8s %(name)s,%(lineno)s\t%(message)s"

logging.basicConfig(format=LOG_FORMAT)
# The level is set on the root logger directly, so it applies even when
# basicConfig() left an already-configured root logger untouched.
logging.getLogger().setLevel(logging.INFO)

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from tqdm import tqdm_notebook as tqdm

In [None]:
# Read the information needed to connect to the database from a KEY=VALUE
# file and put it in environment variables.
import os
with open('../ENVVARS.txt') as f:
    for line in f:
        # Split on the first '=' only: a value that itself contains '='
        # (e.g. a password) must be kept, not cause the line to be skipped.
        key, sep, value = line.partition('=')
        key = key.strip()
        # Lines without '=' or without a key are ignored.
        if sep and key:
            os.environ[key] = value.strip()

In [None]:
# Select the database; the 'dbname' environment variable is read back when
# the session factory is created.
db_name = 'ticclat_wikipedia'
os.environ.update(dbname=db_name)

In [None]:
from ticclat.ticclat_schema import Lexicon, Wordform, Anahash, Document, Corpus

from ticclat.dbutils import get_session, session_scope

# Connection settings come from the environment variables set in the cells
# above; `Session` is what the later cells hand to session_scope().
Session = get_session(
    os.environ['user'],
    os.environ['password'],
    os.environ['dbname'],
)

In [None]:
# Print the number of records for each of the main schema classes.
counted_classes = (
    ('number of wordforms:', Wordform),
    ('number of lexica:', Lexicon),
    ('number of documents:', Document),
    ('number of corpora:', Corpus),
)

with session_scope(Session) as session:
    for label, schema_class in counted_classes:
        print(label, session.query(schema_class).count())

In [None]:
# note: must install nltk for this! This used to be in ticclat.tokenize, but it was no longer used anywhere but in this notebook, so we took it out of the package dependencies. Note that it is also still used in some tests, but we have a separate utility function in the tests directory for that.

import nltk.data
from nltk import word_tokenize

def nltk_tokenize(texts_file, punkt='tokenizers/punkt/dutch.pickle',
                  encoding='utf-8'):
    """
    Tokenize a text file that contains one document per line.

    Every line is stripped, split into sentences with the punkt model loaded
    from `punkt`, and every sentence is split into tokens with
    `word_tokenize`. The tokens of all sentences of a line are concatenated.

    Inputs:
        texts_file (str): File name of a file that contains the texts. This
            should contain one document per line.
        punkt (str): Path to the nltk punctuation data to be used.
        encoding (str): Encoding used to read `texts_file`. Set explicitly so
            the result does not depend on the locale of the machine.

    Yields:
        list: the tokens of one document, in order of appearance. A line for
            which the sentence tokenizer returns no sentences yields an empty
            list, so exactly one list is yielded per line of the file.
    """
    # Imported locally because `chain` is not imported anywhere else in this
    # notebook; without it the first `yield` fails with a NameError.
    from itertools import chain

    nltk.download('punkt')
    tokenizer = nltk.data.load(punkt)

    with open(texts_file, encoding=encoding) as f:
        for line in f:
            sentences = tokenizer.tokenize(line.strip())

            yield list(chain.from_iterable(word_tokenize(sent)
                                           for sent in sentences))

In [None]:
%%time
# Ingest wikipedia dump as corpus
import os

from tqdm import tqdm_notebook as tqdm

from ticclat.utils import get_temp_file, write_json_lines, read_json_lines
from ticclat.tokenize import terms_documents_matrix_word_lists
from ticclat.sacoreutils import add_corpus_core


# Input dump and the corpus name that is passed to add_corpus_core later on.
wiki = '/home/jvdzwaan/data/tmp/nlwiki'
corpus_name = 'nlwiki-20190201-pages-articles-complete'
# Intermediate file: written here, read back and removed in the next cells.
tokenized_file = '/home/jvdzwaan/data/tmp/nlwiki-json_lines'

print('Tokenizing corpus')
tokenized_documents = tqdm(nltk_tokenize(wiki))
num_documents = write_json_lines(tokenized_file, tokenized_documents)

In [None]:
%%time
from ticclat.tokenize import terms_documents_matrix_word_lists

# Build the terms/document matrix from the tokenized documents that the
# tokenizing cell wrote to `tokenized_file`.
print('Creating the terms/document matrix')
# NOTE(review): presumably yields one token list per document, mirroring what
# write_json_lines stored — confirm in ticclat.utils.
documents_iterator = read_json_lines(tokenized_file)

# `corpus_m` and `v` are handed to add_corpus_core in the final cell, which
# also reads `v.vocabulary_`.
corpus_m, v = terms_documents_matrix_word_lists(documents_iterator)

In [None]:
# The intermediate file is no longer needed once the matrix has been built.
os.unlink(tokenized_file)

In [None]:
%%time
# Wordforms taken from the vocabulary of `v`.
wfs = pd.DataFrame()
wfs['wordform'] = v.vocabulary_

# One metadata row per document: every document gets the same language and
# publication year.
document_metadata = pd.DataFrame()
document_metadata['language'] = ['nl'] * num_documents
document_metadata['pub_year'] = 2019
# More metadata?

with session_scope(Session) as session:
    add_corpus_core(session, corpus_m, v, corpus_name, document_metadata)