In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before code is executed and edits to project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import logging

# Log record layout: timestamp, [process id], padded level name,
# logger name and line number, a tab, then the message.
logging.basicConfig(
    format=(
        "%(asctime)s [%(process)d] %(levelname)-8s "
        "%(name)s,%(lineno)s\t%(message)s"
    )
)
# Set the root logger level explicitly so INFO messages are shown.
logging.getLogger().setLevel('INFO')

In [3]:
#matplotlib inline

import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm

In [4]:
# Read information to connect to the database and put it in environment variables
import os

# Every usable line of ENVVARS.txt has the form KEY=VALUE. Only the first '='
# separates the key from the value, so a value that itself contains '='
# (for example a password) is kept intact instead of the line being skipped.
# Lines without '=' or with an empty key are ignored.
with open('../ENVVARS.txt') as f:
    for line in f:
        key, separator, value = line.partition('=')
        key = key.strip()
        if separator and key:
            os.environ[key] = value.strip()

In [5]:
from ticclat.ticclat_schema import Lexicon, Wordform, Anahash

from ticclat.dbutils import get_session, session_scope

# Build the session factory from the connection settings that were placed in
# the environment; a missing variable raises KeyError here.
Session = get_session(
    os.environ['user'],
    os.environ['password'],
    os.environ['dbname'],
)

In [12]:
import json

# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
data_file = '/Users/jvdzwaan/Downloads/ticclat_morph_data_v2.json'

# Parse the whole JSON document into `data`.
with open(data_file) as f:
    data = json.loads(f.read())

In [19]:
# Number of keys in `data`.
# NOTE(review): presumably `data` is the JSON dict here (one key per
# wordform); if `data` is the DataFrame this counts columns instead.
len(data.keys())

559827

In [55]:
# Inspect the entry stored under the key '1' to see the structure of one record.
data['1']

{'morph_par': {'Z0001Y0001X0023W00000004V0001_HDU003': {'Z': '0001',
   'Y': '0001',
   'X': '0023',
   'W': '00000004',
   'V': '0001',
   'h': 'HDU',
   't': '003'},
  'Z0001Y0001X0022W00000101V0013_HDU011': {'Z': '0001',
   'Y': '0001',
   'X': '0022',
   'W': '00000101',
   'V': '0013',
   'h': 'HDU',
   't': '011'}},
 'dict_ids': ['MNW_08286_s',
  'MNW_08282_s',
  'MNW_08283_s',
  'MNW_08284_s',
  'MNW_08287_s',
  'WNT_M015579_s'],
 'pos': ['NUM', 'ART', 'PRN', 'ADJ', 'NOU-C'],
 'int_ids': ['INT_252671', 'INT_28854', 'INT_321512']}

In [6]:
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
data_file = '/Users/jvdzwaan/Downloads/CombilexTypolistINThistlex.TICCLATingest.DeriveParadigms265.tsv'

# The file is tab-separated; column names are supplied explicitly.
# NOTE(review): 'igore_first_year' looks like a typo for 'ignore_first_year';
# it is kept as-is because the column name is runtime data.
data = pd.read_csv(
    data_file,
    sep='\t',
    names=[
        'wordform',
        'corpus_freq',
        'component_codes',
        'ignore_human_readable_component_code',
        'igore_first_year',
        'ignore_last_year',
        'dict_ids',
        'pos_tags',
        'int_ids',
    ],
).drop([0])  # drop first row (contains empty wordform)
data.head()

Unnamed: 0,wordform,corpus_freq,component_codes,ignore_human_readable_component_code,igore_first_year,ignore_last_year,dict_ids,pos_tags,int_ids
1,%,,Z0001Y0007X2124W00006579V0001_HDU002,Z1Y7X2124W6579V1_HDU2,1872.0,1872.0,WNT_M052997_m#WNT_M052997_s,NOU-C,INT_360068
2,%%%windstreek,,Z0001Y0003X2578W00054310V0001_HDU001,Z1Y3X2578W54310V1_HDU1,1808.0,1808.0,WNT_M086612_s,NOU-C,INT_319930
3,&ldquor,,Z0001Y0007X1400W00000286V0001_HDU059#Z0001Y000...,Z1Y7X1400W286V1_HDU59#Z1Y1X694W4560V6_HDU,1541.0,1541.0,MNW_70757_s#MNW_70680_s#WNT_M084040_s,NOU-C,INT_17349#INT_368858
4,'j,0.0,Z0001Y0015X1874W00000031V0003_HCM,Z1Y15X1874W31V3_HCM,,,,,
5,'k,16860.0,Z0001Y0009X0925W00000016V0006_HCM,Z1Y9X925W16V6_HCM,,,,,


In [95]:
# Keep only the first 100 rows (small sample for quick experiments).
# NOTE(review): this overwrites `data`, so cells run after this one see the
# truncated frame; re-run the loading cell to get the full data back.
data = data.head(100)

In [11]:
# Single-column DataFrame holding just the wordforms (double brackets keep it
# a DataFrame rather than a Series).
wfs = data[['wordform']]
wfs.head()

Unnamed: 0,wordform
1,%
2,%%%windstreek
3,&ldquor
4,'j
5,'k


In [7]:
import re

def split_component_code(code, wf):
    """Parse one morphological component code into its parts.

    The code is searched for the pattern
    ``Z<4 digits>Y<4 digits>X<4 digits>W<8 digits>V<4 digits>_<word type code>``.
    The search is not anchored, so the pattern may start anywhere in ``code``;
    everything after the underscore becomes the word type code.

    Args:
        code: The component code string to parse.
        wf: The wordform the code belongs to; copied into the result.

    Returns:
        A dict with integer values for 'Z', 'Y', 'X', 'W' and 'V', the string
        'word_type_code' and the given 'wordform', or None when ``code`` does
        not contain a complete code.
    """
    regex = r'Z(?P<Z>\d{4})Y(?P<Y>\d{4})X(?P<X>\d{4})W(?P<W>\d{8})V(?P<V>\d{4})_(?P<wt_code>.+)'
    match = re.search(regex, code)
    if match is None:
        # Incomplete code (e.g. the Z/Y/X parts are missing).
        return None

    parts = {name: int(match.group(name)) for name in ('Z', 'Y', 'X', 'W', 'V')}
    parts['word_type_code'] = match.group('wt_code')
    parts['wordform'] = wf
    return parts

# Quick sanity check: the first code is complete and is parsed into a dict;
# the second lacks the Z/Y/X parts, so no match is found and None is printed.
print(split_component_code('Z0001Y0007X2124W00006579V0001_HDU002', 'test'))
print(split_component_code('W00071259V0002_HCM', "'m"))


{'Z': 1, 'Y': 7, 'X': 2124, 'W': 6579, 'V': 1, 'word_type_code': 'HDU002', 'wordform': 'test'}
None


In [49]:
%%time
from collections import defaultdict

result = defaultdict(list)
for row in data.iterrows():
    codes = row[1]['component_codes'].split('#')
    wf = row[1]['wordform']
    for code in codes:
        result[wf].append(split_component_code(code, wf))

CPU times: user 50.3 s, sys: 261 ms, total: 50.5 s
Wall time: 50.6 s


In [9]:
# Show the parsed component codes for the wordform '1'.
# Note: `result` is a defaultdict(list), so looking up a wordform that is not
# present inserts an empty list for it.
result['1']

[{'Z': 1,
  'Y': 1,
  'X': 23,
  'W': 4,
  'V': 1,
  'word_type_code': 'HDU003',
  'wordform': '1'},
 {'Z': 1,
  'Y': 1,
  'X': 22,
  'W': 101,
  'V': 13,
  'word_type_code': 'HDU011',
  'wordform': '1'}]

In [23]:
from ticclat.dbutils import bulk_add_wordforms

# NOTE(review): `data.keys()` only yields wordforms when `data` is the dict
# loaded from the JSON file. If `data` is the DataFrame read from the TSV,
# `keys()` returns the column names instead. Confirm which `data` is live
# before running this cell, because it writes to the database.
# put wordforms in a dataframe
wfs = pd.DataFrame(data.keys(), columns=['wordform'])
# Print a random sample of 10 rows as a sanity check.
print(wfs.sample(10))

# bulk add wordforms
with session_scope(Session) as session:
    bulk_add_wordforms(session, wfs)

2019-06-26 15:58:12,636 [16161] INFO     ticclat.dbutils,80	Bulk adding wordforms.


                    wordform
393629            sijd-ganck
490449        vertwijfeldste
495002             vetranden
260398        magnetiseerder
477869            vergrendel
437716        toebereydinghe
44101                 asemde
438023         toegankelijke
171510  godsdienstsociologen
297997         ondersneeuwen



  0%|          | 0/559827 [00:00<?, ?it/s][A

0it [00:00, ?it/s][A[A

10179it [00:00, 32003.69it/s][A[A
  2%|▏         | 10000/559827 [00:01<01:28, 6239.72it/s][A

0it [00:00, ?it/s][A[A

10179it [00:00, 32752.01it/s][A[A
  4%|▎         | 20000/559827 [00:03<01:23, 6452.34it/s][A

0it [00:00, ?it/s][A[A

10179it [00:00, 33944.95it/s][A[A
  5%|▌         | 30000/559827 [00:04<01:19, 6676.87it/s][A

0it [00:00, ?it/s][A[A

10179it [00:00, 32071.75it/s][A[A
  7%|▋         | 40000/559827 [00:05<01:16, 6804.93it/s][A

0it [00:00, ?it/s][A[A

10179it [00:00, 31479.31it/s][A[A
  9%|▉         | 50000/559827 [00:07<01:13, 6903.03it/s][A

0it [00:00, ?it/s][A[A

10179it [00:00, 31687.96it/s][A[A
 11%|█         | 60000/559827 [00:08<01:11, 6969.16it/s][A

0it [00:00, ?it/s][A[A

10179it [00:00, 31635.60it/s][A[A
 13%|█▎        | 70000/559827 [00:10<01:09, 7003.16it/s][A

0it [00:00, ?it/s][A[A

10179it [00:00, 32055.19it/s][A[A
 14%|█▍        | 80000/559827

In [12]:
%%time
from sqlalchemy import select
from ticclat.utils import chunk_df, anahash_df, write_json_lines, \
    read_json_lines, get_temp_file, json_line

# lookup ids
# Select all Wordform rows whose `wordform` is in `wfs['wordform']` and keep
# the fetched rows in `mapping`. Every wordform is passed as one value of a
# single IN clause.
with session_scope(Session) as session:
    s = select([Wordform]).where(Wordform.wordform.in_(wfs['wordform']))
    mapping = session.execute(s).fetchall()

CPU times: user 9.79 s, sys: 237 ms, total: 10 s
Wall time: 13 s


In [28]:
# Peek at the first fetched row: its wordform and the corresponding database id.
mapping[0].wordform, mapping[0].wordform_id

('1', 338053)

In [77]:
def result_iterator_json(result):
    """Yield one morphological paradigm record per 'morph_par' entry.

    For every row in ``result`` the entry ``data[row.wordform]['morph_par']``
    of the global ``data`` dict is looked up, and each of its items is turned
    into a flat record.

    Args:
        result: Iterable of rows exposing ``wordform`` and ``wordform_id``
            attributes.

    Yields:
        Dicts with 'wordform_id', the integer values 'Z', 'Y', 'X', 'W' and
        'V', the 'word_type_code' (taken from the 'h' field) and the original
        'key' of the entry.

    Entries that cannot be converted (for example a missing field or a
    non-numeric value) are printed for inspection and skipped.
    """
    for wf in result:
        wordform_id = wf.wordform_id
        morph_par = data[wf.wordform]['morph_par']
        for key, components in morph_par.items():
            # Only the conversion is guarded. The yield sits in the else
            # branch so that an exception raised into the generator at the
            # yield point propagates instead of being reported as a data error.
            try:
                record = {'wordform_id': wordform_id,
                          'Z': int(components['Z']),
                          'Y': int(components['Y']),
                          'X': int(components['X']),
                          'W': int(components['W']),
                          'V': int(components['V']),
                          'word_type_code': components['h'],
                          'key': key}
            except Exception as e:
                # Best effort: report the offending entry and continue.
                print(e)
                print(morph_par.keys())
                print(key)
                print(components)
                print()
            else:
                yield record

In [50]:
import copy

def result_iterator(result, mapping):
    """Yield database-ready paradigm records for the wordforms in ``mapping``.

    Args:
        result: Mapping from wordform to a list of parsed component codes
            (dicts) or None for codes that could not be parsed.
        mapping: Iterable of rows that can be indexed with 'wordform' and
            'wordform_id'.

    Yields:
        A shallow copy of each parsed code with 'wordform_id' added and the
        'wordform' key removed. The dicts stored in ``result`` are left
        untouched.
    """
    for row in mapping:
        for parsed in result[row['wordform']]:
            if parsed is None:
                # ignore incomplete codes for now
                continue

            record = copy.copy(parsed)
            record['wordform_id'] = row['wordform_id']
            # The wordform text itself is not needed, only its id.
            record.pop('wordform')
            yield record

In [51]:
# Print the first 10 records produced by result_iterator as a sanity check.
for i, obj in enumerate(result_iterator(result, mapping)):
    if i >= 10:
        break
    print(obj)

{'Z': 1, 'Y': 1, 'X': 23, 'W': 4, 'V': 1, 'word_type_code': 'HDU003', 'wordform_id': 338053}
{'Z': 1, 'Y': 1, 'X': 22, 'W': 101, 'V': 13, 'word_type_code': 'HDU011', 'wordform_id': 338053}
{'Z': 1, 'Y': 3, 'X': 384, 'W': 181, 'V': 10, 'word_type_code': 'HDU061', 'wordform_id': 338054}
{'Z': 1, 'Y': 3, 'X': 555, 'W': 209, 'V': 1, 'word_type_code': 'HDU011', 'wordform_id': 338054}
{'Z': 1, 'Y': 1, 'X': 993, 'W': 275, 'V': 8, 'word_type_code': 'HDU035', 'wordform_id': 338054}
{'Z': 1, 'Y': 3, 'X': 1016, 'W': 267, 'V': 1, 'word_type_code': 'HDU011', 'wordform_id': 338055}
{'Z': 1, 'Y': 3, 'X': 1016, 'W': 267, 'V': 5, 'word_type_code': 'HDU026', 'wordform_id': 338055}
{'Z': 1, 'Y': 3, 'X': 2433, 'W': 1475, 'V': 1, 'word_type_code': 'HDU011', 'wordform_id': 338055}
{'Z': 1, 'Y': 3, 'X': 906, 'W': 439, 'V': 7, 'word_type_code': 'HDU047', 'wordform_id': 338056}
{'Z': 1, 'Y': 1, 'X': 29, 'W': 470, 'V': 1, 'word_type_code': 'HDU016', 'wordform_id': 338056}


In [58]:
# insert the contents of the file with the json lines
from collections import defaultdict

from ticclat.ticclat_schema import MorphologicalParadigm
from ticclat.sacoreutils import sql_insert_batches

# Paradigms keyed on "Z-Y-X-W-V"; when several records share a key the one
# seen last replaces the earlier ones.
filtered = defaultdict(dict)

with session_scope(Session) as session, get_temp_file() as mp_file:
    for paradigm in result_iterator(result, mapping):
        filtered['{}-{}-{}-{}-{}'.format(*(paradigm[part] for part in 'ZYXWV'))] = paradigm

    # Write the de-duplicated records to the temporary file, then insert them
    # into the database in batches read back from that file.
    t = write_json_lines(mp_file, filtered.values())
    print(f'Wrote {t} morphological variants.')
    sql_insert_batches(session, MorphologicalParadigm, read_json_lines(mp_file), batch_size=50000)

0it [00:00, ?it/s]

Wrote 359319 morphological variants.


359319it [00:15, 22817.53it/s]


In [65]:
from ticclat.dbutils import add_morphological_paradigms

# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
in_file = '/Users/jvdzwaan/Downloads/CombilexTypolistINThistlex.TICCLATingest.DeriveParadigms265.tsv'

# Run the library routine on the TSV file inside one session scope.
# NOTE(review): the recorded run of this cell ended in an IntegrityError
# (duplicate primary key '1-1-23-4-1'); presumably the table already held the
# rows inserted manually earlier in this notebook. Confirm before re-running.
with session_scope(Session) as session:
    add_morphological_paradigms(session, in_file)

2019-06-27 13:17:03,392 [965] INFO     ticclat.dbutils,82	Bulk adding wordforms.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  wfs['wordform'] = wfs['wordform'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  wfs.dropna(subset=['wordform'], inplace=True)
 98%|█████████▊| 550000/559827 [00:21<00:00, 26504.11it/s]
2019-06-27 13:17:27,746 [965] INFO     ticclat.dbutils,125	0 wordforms have been added.
2019-06-27 13:18:36,480 [965] INFO     ticclat.dbutils,508	Wrote 359319 morphological variants.
0it [00:00, ?it/s]


IntegrityError: (MySQLdb._exceptions.IntegrityError) (1062, "Duplicate entry '1-1-23-4-1' for key 'PRIMARY'")
[SQL: INSERT INTO morphological_paradigms (`Z`, `Y`, `X`, `W`, `V`, wordform_id, word_type_code) VALUES (%s, %s, %s, %s, %s, %s, %s)]
[parameters: ((1, 1, 23, 4, 1, 897740, 'HDU006'), (1, 1, 22, 101, 13, 897740, 'HDU011'), (1, 3, 384, 181, 10, 874236, 'HDU049'), (1, 3, 555, 209, 1, 874236, 'HDU011'), (1, 1, 993, 275, 8, 874236, 'HDU035'), (1, 3, 1016, 267, 1, 897882, 'HDU010'), (1, 3, 1016, 267, 5, 897404, 'HDU019'), (1, 3, 2433, 1475, 1, 897404, 'HDU011')  ... displaying 10 of 50000 total bound parameter sets ...  (1, 7, 2491, 21885, 2, 401149, 'HCM'), (9, 1, 2616, 73875, 1, 401150, 'HCL'))]
(Background on this error at: http://sqlalche.me/e/gkpj)

In [None]:
# Stray note (not executable Python): path to an ENVVARS.txt file.
# /Users/jvdzwaan/code/ticclat/ticclat/notebooks/ENVVARS.txt