In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from tqdm import tqdm_notebook as tqdm

In [None]:
# Read information to connect to the database and put it in environment variables
import os
# Copy KEY=VALUE pairs from the settings file into the process environment.
# Lines without an '=' (blank lines, stray text) are ignored.
with open('../ENVVARS.txt') as f:
    for line in f:
        # Split on the first '=' only: a value may itself contain '='
        # (e.g. a password), and such lines must not be dropped.
        parts = line.split('=', 1)
        # Skip lines with an empty key; os.environ rejects empty names.
        if len(parts) == 2 and parts[0].strip():
            os.environ[parts[0].strip()] = parts[1].strip()

In [None]:
# Name of the database to connect to; published through the environment
# under the 'dbname' key, overriding any value already stored there.
db_name = 'ticclat'
os.environ.update({'dbname': db_name})

In [None]:
from ticclat.ticclat_schema import Lexicon, Wordform, Anahash, Document, Corpus

from ticclat.dbutils import get_session, session_scope

# Create the Session object used by every session_scope(...) block below.
# The connection settings are read from the environment, in the positional
# order get_session is called with: user, password, database name.
Session = get_session(
    *(os.environ[setting] for setting in ('user', 'password', 'dbname'))
)

In [None]:
%%time

import textacy

# Wikipedia sample file; read line by line, one text per line.
wiki = '/home/jvdzwaan/data/tmp/nlwiki-10000'

# Dutch spaCy pipeline with the parser and tagger switched off.
dutch_pipeline = textacy.load_spacy('nl', disable=('parser', 'tagger'))
wiki_texts = textacy.io.read_text(wiki, lines=True)

# `c` is the textacy corpus that the vectorizer cell iterates over.
c = textacy.Corpus(dutch_pipeline, texts=wiki_texts)

In [None]:
%%time

# Build a document-term matrix from the unigram terms of every document in
# the textacy corpus `c` (named entities are not added as separate terms).
vectorizer = textacy.Vectorizer()
unigram_lists = (
    document.to_terms_list(ngrams=1, named_entities=False, as_strings=True)
    for document in c
)
doc_term_matrix = vectorizer.fit_transform(unigram_lists)

This takes far too long. It also crashes due to memory issues when we try to load the entire Wikipedia dump.

Okay, let's try some alternatives for loading the corpus and see how fast they are.

In [None]:
%%time
import spacy

# Time plain spaCy: run the Dutch pipeline over the first `max_lines` lines.
#
# Only tokenisation is wanted, so the other components are disabled. The
# dependency parser's component name is 'parser' (as used for the textacy
# pipeline earlier in this notebook); 'dep' is not a component name, so
# listing it did not switch the parser off.
nlp = spacy.load('nl', disable=['tagger', 'parser', 'ner', 'textcat'])

max_lines = 1000  # stop after this many lines

i = 0  # number of lines processed; stays 0 if the file is empty

wiki = '/home/jvdzwaan/data/tmp/nlwiki-1000'

with open(wiki) as f:
    for i, line in enumerate(f, start=1):
        doc = nlp(line.strip())

        if i >= max_lines:
            break

In [None]:
%%time
from collections import Counter
import time
from itertools import chain
from nltk.corpus import brown
from nltk import sent_tokenize, word_tokenize
from nltk.tokenize import ToktokTokenizer
# Time NLTK's ToktokTokenizer: for every line of the `wiki` file (set in an
# earlier cell) split the text into sentences, tokenise each sentence and
# count the tokens. `fdist` is rebuilt for every line, so after the loop it
# holds the counts of the last line only.
#
# The Brown corpus is no longer loaded here: it was never used, and reading
# it inside this cell was included in the %%time measurement.
toktok = ToktokTokenizer()

i = 0  # number of lines processed; stays 0 if the file is empty

with open(wiki) as f:
    for i, line in enumerate(f, start=1):
        # The dump is Dutch, so select the Dutch Punkt sentence model instead
        # of sent_tokenize's English default (the next cell uses Dutch too).
        tokenized_corpus = [
            toktok.tokenize(sent)
            for sent in sent_tokenize(line.strip(), language='dutch')
        ]
        fdist = Counter(chain.from_iterable(tokenized_corpus))

In [None]:
%%time
import nltk.data

from nltk import sent_tokenize, word_tokenize

# Time NLTK's Dutch Punkt sentence splitter combined with word_tokenize.
# Relies on `wiki`, `Counter` and `chain` from earlier cells.
tokenizer = nltk.data.load('tokenizers/punkt/dutch.pickle')

i = 0  # counts the lines read; no limit is applied, the whole file is processed

with open(wiki) as f:
    for line in f:
        sentences = tokenizer.tokenize(line.strip())
        tokenized_corpus = list(map(word_tokenize, sentences))
        # Token counts for this line; overwritten on every iteration.
        fdist = Counter(chain.from_iterable(tokenized_corpus))

        i = i + 1

In [None]:
# Report how many rows each of the main tables currently holds.
table_labels = (
    ('number of wordforms:', Wordform),
    ('number of lexica:', Lexicon),
    ('number of documents:', Document),
    ('number of corpora:', Corpus),
)

with session_scope(Session) as session:
    for label, model in table_labels:
        print(label, session.query(model).count())

In [None]:
%%time
# Add all the wordforms that occur in the wikipedia dump
from ticclat.tokenize import nltk_tokenize
from ticclat.dbutils import bulk_add_wordforms

num = 10000  # number of documents collected before each bulk insert

wiki = '/home/jvdzwaan/data/tmp/nlwiki'


def _add_wordform_batch(session, frames):
    """Insert the unique wordforms of one batch of documents.

    Args:
        session: open database session, passed on to bulk_add_wordforms.
        frames: list of DataFrames, each with a single 'wordform' column
            (one frame per document). An empty list is a no-op.

    The frames are concatenated and de-duplicated on 'wordform' (first
    occurrence kept) before being handed to bulk_add_wordforms; the value it
    returns is reported as the number of wordforms added.
    """
    if not frames:
        return
    batch = pd.concat(frames).drop_duplicates(subset='wordform')
    n_added = bulk_add_wordforms(session, batch, disable_pbar=True)
    print('Added {} wordforms'.format(n_added))


i = 0  # number of documents seen; stays 0 if nothing is yielded
dfs = []
with session_scope(Session) as session:
    for i, terms_vector in enumerate(tqdm(nltk_tokenize(wiki)), start=1):
        # Only the keys of each yielded mapping are used, as wordforms.
        # Materialise them as a list: a dict_keys view is not a sequence.
        dfs.append(pd.DataFrame({'wordform': list(terms_vector.keys())}))

        # Flush every `num` documents so the pending frames stay bounded.
        if i % num == 0:
            _add_wordform_batch(session, dfs)
            dfs = []

    # also add the documents left over after the last full batch
    _add_wordform_batch(session, dfs)

In [None]:
%%time

from ticclat.dbutils import add_corpus

# Register a Wikipedia sample file as a corpus in the database and report
# how many documents ended up attached to it.
wiki = '/home/jvdzwaan/data/tmp/nlwiki-10'

with session_scope(Session) as session:
    c = add_corpus(session, 'nlwiki-20190201-pages-articles-23', wiki)
    num_documents = len(c.corpus_documents)
    print('Added {} documents'.format(num_documents))