In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import logging

# Each record shows: timestamp, process id, padded level name, logger name,
# line number, then (after a tab) the message.
logging.basicConfig(
    format="%(asctime)s [%(process)d] %(levelname)-8s %(name)s,%(lineno)s\t%(message)s"
)
# The level is set on the root logger directly: basicConfig() does nothing when
# handlers are already installed, so this call is what guarantees INFO output.
logging.getLogger().setLevel(logging.INFO)

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from tqdm import tqdm_notebook as tqdm

In [None]:
# Read the database connection settings from ENVVARS.txt (one KEY=VALUE pair
# per line) and export them as environment variables.
import os
with open('../ENVVARS.txt') as f:
    for line in f:
        # Split on the FIRST '=' only, so values that themselves contain '='
        # (e.g. a password) are kept instead of being silently skipped.
        key, sep, value = line.partition('=')
        key = key.strip()
        # Lines without '=' or without a key are ignored.
        if sep and key:
            os.environ[key] = value.strip()

In [None]:
# Name of the database used by this notebook; it is also exported through the
# environment because later cells read os.environ['dbname'].
db_name = 'ticclat'
os.environ.update(dbname=db_name)

In [None]:
import os
from ticclat.dbutils import create_ticclat_database

# Call create_ticclat_database with the database name and the credentials that
# were placed in the environment by the cells above.
create_ticclat_database(
    dbname=os.environ['dbname'],
    user=os.environ['user'],
    passwd=os.environ['password'],
)

In [None]:
from ticclat.ticclat_schema import Lexicon, Wordform, Anahash, Document, Corpus, WordformLink, WordformLinkSource

from ticclat.dbutils import get_session, session_scope

# Object handed to session_scope() in the cells below (presumably a session
# factory bound to the database - confirm in ticclat.dbutils).
Session = get_session(
    os.environ['user'],
    os.environ['password'],
    os.environ['dbname'],
)

In [None]:
# NOTE(review): these are absolute, user-specific paths; the notebook only runs
# on a machine that has this exact directory layout. Consider a configurable
# data directory. `elex_words_file` is not used anywhere in the visible cells.
elex_words_file = '/home/jvdzwaan/data/ticclat/TICCLAT/elex/e-Lex-1.1.uniq.utf8.txt'
elex_lemma_file = '/home/jvdzwaan/data/ticclat/TICCLAT/elex/e-Lex-1.1.lemma_wordform.utf8.nonumbers.tsv'

# The file is read as tab-separated text without a header row; the two columns
# are then named 'lemma' and 'variant' (the assignment fails if the file does
# not have exactly two columns).
# NOTE(review): with pandas' default NA handling, word forms that spell an NA
# token (e.g. "null", "nan", "NA") are read as NaN - verify whether
# keep_default_na=False is needed for this lexicon.
l_wf_pairs = pd.read_csv(elex_lemma_file, sep='\t', header=None)
l_wf_pairs.columns = ['lemma', 'variant']
print(l_wf_pairs.shape)
l_wf_pairs.head()

In [None]:
# Collect every distinct wordform that occurs as a lemma or as a variant.
# pd.concat is used because Series.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; the result (lemmas first, then variants, with a fresh
# 0..n-1 index before de-duplication) is the same.
wfs = (
    pd.concat([l_wf_pairs['lemma'], l_wf_pairs['variant']], ignore_index=True)
    .to_frame(name='wordform')
    .drop_duplicates(subset='wordform')
)
print(wfs.shape)
wfs.head()

In [None]:
from ticclat.utils import write_json_lines, read_json_lines 
from ticclat.dbutils import add_lexicon, get_word_frequency_df

def add_linked_lexicon(sesion, lexicon_name, vocabulary, wfs, from_column, to_column, from_correct, to_correct, batch_size=10000):
    """Create a lexicon holding every wordform found in two columns of `wfs`.

    The values of `from_column` and `to_column` are stacked into one column,
    de-duplicated (first occurrence kept) and passed to `add_lexicon`.

    NOTE: only the lexicon and its wordforms are created here. No links
    between wordforms are added by this function; in this notebook that is
    done separately via `write_wf_links_and_wf_link_sources_to_add`.
    `from_correct` and `to_correct` are accepted but currently unused.

    Args:
        sesion: database session forwarded to `add_lexicon`. The (misspelled)
            parameter name is kept so existing callers keep working.
        lexicon_name: name forwarded to `add_lexicon`.
        vocabulary: forwarded to `add_lexicon` unchanged.
        wfs: pandas DataFrame with at least `from_column` and `to_column`.
        from_column: name of the first wordform column in `wfs`.
        to_column: name of the second wordform column in `wfs`.
        from_correct: unused (see note above).
        to_correct: unused (see note above).
        batch_size: forwarded to `add_lexicon` as `num`.

    Returns:
        Whatever `add_lexicon` returns.
    """
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    stacked = pd.concat([wfs[from_column], wfs[to_column]], ignore_index=True)
    wordforms = stacked.to_frame(name='wordform').drop_duplicates(subset='wordform')

    # Use the session that was passed in. Previously the body referred to a
    # global named `session`, which only worked when a notebook cell happened
    # to have leaked such a variable.
    lexicon = add_lexicon(sesion, lexicon_name, vocabulary, wordforms, num=batch_size)

    return lexicon

In [None]:
%%time
from ticclat.dbutils import add_lexicon, get_word_frequency_df

# Fetch the word-frequency table with ids added (add_ids=True); the next cell
# reads its 'wordform_id' column.
# Note: `session` and `df` remain defined in the notebook namespace after this
# cell has run.
with session_scope(Session) as session:
    df = get_word_frequency_df(session, add_ids=True)
    print(df.head())

In [None]:
from collections import defaultdict

# Turn the 'wordform_id' column into a defaultdict keyed by the frame's index
# (presumably the wordform itself - confirm); unknown keys yield 0.
# This mapping is replaced by the get_wf_mapping() cell that follows.
wf_mapping = df['wordform_id'].to_dict(into=defaultdict(int))

In [None]:
%%time
from ticclat.dbutils import get_wf_mapping

# Rebuild wf_mapping from the database for lexicon 1, replacing the mapping
# derived from `df` above. It is used below as wordform -> wordform id.
# NOTE(review): lexicon_id=1 is hard-coded; it presumably refers to the e-Lex
# lexicon created by add_linked_lexicon - confirm.
with session_scope(Session) as session:
    wf_mapping = get_wf_mapping(session, lexicon_id=1)
    
print(len(wf_mapping))

In [None]:
# Number of (lemma, variant) rows that exactly repeat an earlier row.
l_wf_pairs.duplicated().sum()

In [None]:
#%%time
from ticclat.ticclat_schema import WordformLink
from ticclat.utils import write_json_lines, get_temp_file, json_line
from sqlalchemy import exists, select

def write_wf_links_and_wf_link_sources_to_add(session, wf_mapping, df, wf_from_name, wf_to_name, 
                                              lexicon_id, wf_from_correct, wf_to_correct, 
                                              wf_links_to_add_file, wf_link_sources_to_add_file):
    """Write the wordform links and link sources for the pairs in `df` to files.

    For every row of `df` the two wordforms are translated to ids through
    `wf_mapping`. Self-links and pairs that were already handled during this
    call (in either direction) are skipped. For each remaining pair:

    * if no WordformLink row with this (wordform_from, wordform_to) exists in
      the database, both directions of the link are written to
      `wf_links_to_add_file`;
    * both directions of the link source are always written to
      `wf_link_sources_to_add_file`, with the two `*_correct` flags swapped
      for the reverse direction.

    Records are serialized with `json_line`, one call per record.

    Args:
        session: database session used for the existence check.
        wf_mapping: mapping from wordform to wordform id.
        df: pandas DataFrame containing the wordform pairs.
        wf_from_name: column of `df` holding the "from" wordform.
        wf_to_name: column of `df` holding the "to" wordform.
        lexicon_id: value stored as 'lexicon_id' in every link source.
        wf_from_correct: value stored for the "from" wordform's correctness.
        wf_to_correct: value stored for the "to" wordform's correctness.
        wf_links_to_add_file: path of the links output file (overwritten).
        wf_link_sources_to_add_file: path of the link sources output file
            (overwritten).

    Returns:
        Tuple (number of link records written, number of link source records
        written).
    """
    num_wf_links = 0
    num_wf_link_sources = 0
    # Id pairs already handled, stored in both directions, because the input
    # may contain duplicates and reversed pairs.
    seen_pairs = set()
    with open(wf_links_to_add_file, 'w') as links, open(wf_link_sources_to_add_file, 'w') as sources:
        word_pairs = zip(df[wf_from_name], df[wf_to_name])
        for word_from, word_to in tqdm(word_pairs, total=df.shape[0]):
            wf_from = wf_mapping[word_from]
            wf_to = wf_mapping[word_to]

            # Don't add links to self, and don't add the same pair twice.
            if wf_from == wf_to or (wf_from, wf_to) in seen_pairs:
                continue

            # Both conditions are applied with chained where() calls. Joining
            # the two comparisons with Python's `and` does not build a SQL AND:
            # it evaluates to just one of the expressions, so the check would
            # filter on a single column only.
            s = (select([WordformLink])
                 .where(WordformLink.wordform_from == wf_from)
                 .where(WordformLink.wordform_to == wf_to))
            r = session.execute(s).fetchone()
            if r is None:
                # Both directions of the relationship need to be added.
                links.write(json_line({'wordform_from': wf_from, 'wordform_to': wf_to}))
                links.write(json_line({'wordform_from': wf_to, 'wordform_to': wf_from}))
                num_wf_links += 2

            # The wordform link sources (in both directions) need to be written
            # regardless of the existence of the wordform links.
            sources.write(json_line({'wordform_from': wf_from, 'wordform_to': wf_to, 'lexicon_id': lexicon_id, 
                                     'wordform_from_correct': wf_from_correct, 'wordform_to_correct': wf_to_correct}))
            sources.write(json_line({'wordform_from': wf_to, 'wordform_to': wf_from, 'lexicon_id': lexicon_id, 
                                     'wordform_from_correct': wf_to_correct, 'wordform_to_correct': wf_from_correct}))
            num_wf_link_sources += 2

            seen_pairs.add((wf_from, wf_to))
            seen_pairs.add((wf_to, wf_from))

    return num_wf_links, num_wf_link_sources

# Write the links and link sources for all (lemma, variant) pairs to two
# temporary files; a later cell inserts the contents of these files into the
# database. The paths are printed so the files can be inspected.
# Positional arguments after the column names: lexicon_id=1, then
# wf_from_correct=True and wf_to_correct=True.
# NOTE(review): lexicon id 1 is hard-coded here as well as in the
# get_wf_mapping() cell - confirm it matches the intended lexicon.
with session_scope(Session) as session:
    wf_links_to_add_file = get_temp_file()
    wf_link_sources_to_add_file = get_temp_file()
    print('wf_links', wf_links_to_add_file)
    print('wf_link_sources', wf_link_sources_to_add_file)
    
    num_l, num_s = write_wf_links_and_wf_link_sources_to_add(session, wf_mapping, l_wf_pairs, 
                                                             'lemma', 'variant', 1, True, True, 
                                                             wf_links_to_add_file, wf_link_sources_to_add_file)
print(num_l, 'wordform links to add')
print(num_s, 'wordform link sources to add')

In [None]:
# Debugging aid: print every wordform whose id in wf_mapping is 110
# (the id being investigated; see the 'aai' queries below).
for w, w_id in wf_mapping.items():
    if w_id == 110:
        print(w)

In [None]:
# All pairs in which 'aai' is the lemma.
l_wf_pairs.query('lemma == "aai"')

In [None]:
# All pairs in which 'aai' is the variant.
l_wf_pairs.query('variant == "aai"')

"aai" occurs both as a lemma and as a variant in combination with "aaien". As a result, the same link would be added more than once (duplicates), so we have to keep track of which links have already been written.

In [None]:
%%time
from ticclat.utils import read_json_lines
from ticclat.sacoreutils import sql_insert_batches

# Insert the records from the two temporary files written above: first the
# wordform links, then the wordform link sources, both within one session scope.
with session_scope(Session) as session:
    sql_insert_batches(session, WordformLink, read_json_lines(wf_links_to_add_file))
    sql_insert_batches(session, WordformLinkSource, read_json_lines(wf_link_sources_to_add_file))
    

In [None]:
%%time
# Create the e-Lex lexicon with all lemma and variant wordforms.
# NOTE(review): this cell appears last, but the cells above already use
# lexicon_id=1 and a wordform mapping for that lexicon - presumably this cell
# was executed earlier; confirm the intended order for a top-to-bottom run.
with session_scope(Session) as session:
    add_linked_lexicon(session, 'e-Lex-1.1.lemma_wordform.utf8.nonumbers', True, l_wf_pairs, 'lemma', 'variant', True, True)