In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm

In [None]:
import logging

# Log line layout: timestamp, process id, level name padded to 8 characters,
# then "logger name,line number", a tab, and the message.
logging.basicConfig(
    format=(
        "%(asctime)s [%(process)d] %(levelname)-8s "
        "%(name)s,%(lineno)s\t%(message)s"
    ),
)
# The level is set directly on the root logger rather than via
# basicConfig(level=...), which is skipped when the root logger already has
# handlers configured.
logging.root.setLevel('INFO')

In [None]:
# Read information to connect to the database and put it in environment variables
import os
# Copy KEY=VALUE pairs from ../ENVVARS.txt into os.environ.
# Lines without an '=' (and lines with an empty key) are ignored.
with open('../ENVVARS.txt') as f:
    for line in f:
        # Split on the first '=' only, so a value that itself contains '='
        # (e.g. a password) is kept intact instead of the line being skipped.
        key, separator, value = line.partition('=')
        key = key.strip()
        if separator and key:
            os.environ[key] = value.strip()

# Keep the database name found in the file available under 'lexicon_name'.
# Raises KeyError if the file did not define 'dbname' and it is not already
# present in the environment.
os.environ['lexicon_name'] = os.environ['dbname']

In [None]:
# Database used for the rest of the notebook. The value is also written to
# os.environ['dbname'], replacing whatever was there before.
db_name = os.environ['dbname'] = 'ticclat'

In [None]:
from ticclat.ticclat_schema import Lexicon, Wordform, Anahash

from ticclat.dbutils import get_session, session_scope

# Build the session factory from the connection settings held in the
# environment; a missing 'user', 'password' or 'dbname' raises KeyError.
Session = get_session(
    os.environ['user'],
    os.environ['password'],
    os.environ['dbname'],
)

In [None]:
%%time
# Ingest Sonar-500 corpus
from ticclat.tokenize import terms_documents_matrix_ticcl_frequency
from ticclat.sacoreutils import add_corpus_core

from nlppln.utils import get_files

# Input files are whatever get_files returns for the SoNaR-500 directory.
in_dir = '/home/jvdzwaan/data/ticclat/TICCLAT/SONAR500/'
in_files = get_files(in_dir)
# For a quick trial run, restrict the input here, e.g. in_files = in_files[:2]

corpus_matrix, vectorizer = terms_documents_matrix_ticcl_frequency(in_files)

# One metadata row per input file: the title is the file's base name up to its
# first dot, and every row gets the language code 'nl'.
# TODO: add more metadata?
document_metadata = pd.DataFrame()
document_metadata['title'] = [
    os.path.basename(path).partition('.')[0] for path in in_files
]
document_metadata['language'] = 'nl'

# Store the corpus under the name 'SoNaR-500' within a single session scope.
with session_scope(Session) as session:
    add_corpus_core(
        session,
        corpus_matrix,
        vectorizer,
        'SoNaR-500',
        document_metadata,
    )


In [None]:
import bz2

freq_file = '/home/jvdzwaan/data/ticclat/TICCLAT/SGD/SGD.PolMash.unifrq.1814.clean'

# Sanity check: read the file as bz2-compressed text and require every line to
# split into exactly two whitespace-separated fields. A line with any other
# number of fields raises ValueError from the tuple unpacking. Errors are left
# to propagate so the traceback points straight at the failing statement.
with bz2.open(freq_file, 'rt') as f:
    for line in f:
        word, freq = line.split()

In [None]:
from ticclat.tokenize import ticcl_frequency_bz2

file2 = '/home/jvdzwaan/data/ticclat/TICCLAT/SONAR500/WR-P-E-A_discussion_lists.wordfreqlist.clean.tsv.bz2'

# Smoke test: iterate over whatever ticcl_frequency_bz2 yields for the two
# files and print a marker per item. The yielded values themselves are not
# inspected. Relies on freq_file having been defined by an earlier cell.
frequency_files = [freq_file, file2]
for s in ticcl_frequency_bz2(frequency_files):
    print('hop')

In [None]:
%%time
from ticclat.ingest.sdg import ingest

# Run the ingest routine imported from ticclat.ingest.sdg in this cell, using
# the notebook's session factory and the shared data directory.
# NOTE(review): the module is spelled `sdg` while the data path used elsewhere
# in this notebook is `SGD` - confirm the module name is intended.
ingest(
    Session,
    base_dir='/home/jvdzwaan/data/ticclat/TICCLAT/',
)

In [None]:
%%time
from ticclat.ingest.edbo import ingest

# Run the ingest routine imported from ticclat.ingest.edbo in this cell, using
# the notebook's session factory and the shared data directory.
ingest(
    Session,
    base_dir='/home/jvdzwaan/data/ticclat/TICCLAT/',
)