In [None]:
%load_ext autoreload
%autoreload 2

To be able to query the database, make a file ``ENVVARS.txt`` in the notebooks directory, and add the following contents (fill in the blanks):

```
user=...
password=...
dbname=...

```

In [None]:
# Read the database connection settings from ENVVARS.txt and export them as
# environment variables. Every useful line has the form ``key=value``.
import os
with open('ENVVARS.txt') as f:
    for line in f:
        # Split on the first '=' only, so a value that itself contains '='
        # (e.g. a password) is kept intact instead of being silently skipped.
        key, sep, value = line.partition('=')
        key = key.strip()
        # Lines without '=' (blank lines, stray text) or without a key are ignored.
        if sep and key:
            os.environ[key] = value.strip()

In [None]:
import MySQLdb
import textwrap

import pandas as pd


def do_query(query):
    """Run ``query`` on the local MySQL database and return the result.

    A new connection to ``localhost:3306`` is opened for every call, using the
    ``user``, ``password`` and ``dbname`` environment variables.

    Parameters
    ----------
    query : str
        SQL statement passed to ``pandas.read_sql``.

    Returns
    -------
    pandas.DataFrame
        The query result with repeated column names removed (the first
        occurrence of each name is kept).
    """
    cn = MySQLdb.connect(host='localhost',
                         port=3306,
                         user=os.environ.get('user'),
                         passwd=os.environ.get('password'),
                         db=os.environ.get('dbname'))
    try:
        df_mysql = pd.read_sql(query, con=cn)
    finally:
        # Release the connection even when the query fails.
        cn.close()
    # deduplicate columns: joins with ``SELECT *`` repeat the join keys
    return df_mysql.loc[:, ~df_mysql.columns.duplicated()]

# List all tables in the database and display them.
tables = do_query('SHOW TABLES;')
tables

In [None]:
# Number of rows in the analyzed_wordforms table.
do_query('SELECT COUNT(*) FROM analyzed_wordforms;')

In [None]:
# Number of rows in the wordforms table.
do_query('SELECT COUNT(*) FROM wordforms;')

In [None]:
# Number of rows in the documents table.
do_query('SELECT COUNT(*) from documents;')

In [None]:
# Show every row of the corpora table.
do_query('SELECT * from corpora;')

There are more documents in the database than wordforms/analyzed wordforms? That is strange...

In [None]:
# Random sample of 10 rows from the wordforms table.
do_query('SELECT * FROM wordforms ORDER BY RAND() LIMIT 10;')

In [None]:
# Up to 10 wordforms that have a value in has_analysis.
# NULL has to be tested with IS [NOT] NULL: a comparison such as
# `has_analysis != null` is never true, so it can never return any rows.
do_query('SELECT * FROM wordforms WHERE has_analysis IS NOT NULL LIMIT 10;')

In [None]:
# Random sample of 10 rows from the analyzed_wordforms table.
do_query('SELECT * FROM analyzed_wordforms ORDER BY RAND() LIMIT 10;')

In [None]:
# All analyzed wordforms whose verified_by column is NULL
# (no LIMIT, so every matching row is fetched).
do_query('SELECT * FROM analyzed_wordforms WHERE verified_by IS NULL;')

In [None]:
# Up to 10 wordforms whose has_analysis column is not NULL.
do_query('SELECT * FROM wordforms WHERE has_analysis IS NOT NULL LIMIT 10;')

In [None]:
# Random sample of 10 rows from the documents table.
do_query('SELECT * FROM documents ORDER BY RAND() LIMIT 10;')

In [None]:
# Up to 10 documents whose pub_year column is not NULL.
do_query('SELECT * FROM documents WHERE pub_year IS NOT NULL LIMIT 10;')

In [None]:
# Up to 10 documents whose word_count column is not NULL.
do_query('SELECT * FROM documents WHERE word_count IS NOT NULL LIMIT 10;')

In [None]:
# Pair every analyzed wordform with its wordform row (inner join on
# wordform_id); no LIMIT, so the full join result is fetched.
query = """
SELECT * 
FROM analyzed_wordforms
INNER JOIN wordforms ON analyzed_wordforms.wordform_id = wordforms.wordform_id
"""
do_query(query)

In [None]:
# Number of rows in the token_attestations table.
do_query('SELECT COUNT(*) FROM token_attestations;')

In [None]:
# Number of rows in the token_attestation_verifications table.
do_query('SELECT COUNT(*) FROM token_attestation_verifications;')

In [None]:
%%time
# Which words occur in which documents?
# The LEFT JOINs keep every wordform, including those without an analysis,
# attestation or document; the columns of the missing tables are then NULL.

query = """
SELECT * 
FROM wordforms
LEFT JOIN analyzed_wordforms ON analyzed_wordforms.wordform_id = wordforms.wordform_id
LEFT JOIN token_attestations ON token_attestations.analyzed_wordform_id = analyzed_wordforms.analyzed_wordform_id
LEFT JOIN documents ON documents.document_id = token_attestations.document_id
"""
tokens_in_documents = do_query(query)

In [None]:
# Show only the columns that link a wordform to an analysis and a document.
tokens_in_documents[['wordform', 'analyzed_wordform_id', 'document_id']]

In [None]:
# Drop repeated column names, keeping the first occurrence of each.
# do_query() already applies this same filter to its result, so this is
# expected to leave the frame unchanged.
tokens_in_documents = tokens_in_documents.loc[:,~tokens_in_documents.columns.duplicated()]

In [None]:
# List the column names of the joined result.
tokens_in_documents.columns

In [None]:
# Per document_id: the number of non-null values in each other column.
token_counts = tokens_in_documents.groupby('document_id').count()

In [None]:
# Sort the per-document counts by the 'wordform' column, ascending
# (documents with the most wordforms end up at the bottom).
token_counts.sort_values('wordform')

The lexicon seems to contain very short documents only (up to 16 words).

## Paradigmatic expansions

In [None]:
# Load the complete analyzed_wordforms table.
wfs = do_query('SELECT * FROM analyzed_wordforms')

In [None]:
# Per part_of_speech tag: the number of non-null values in each other column.
pos = wfs.groupby('part_of_speech').count()

In [None]:
# Counts for the tag 'CLITIC' (raises KeyError when no row carries that tag).
pos.loc['CLITIC']

In [None]:
# Print the distinct part-of-speech tags in sorted order.
pos_tags = sorted(pos.index)
print(pos_tags)

The database does not seem to follow the documentation, which says that the pos tag `CLITIC` is used for wordforms that consist of multiple words (e.g., 'kzag).

In [None]:
# Analyzed wordforms whose lemma_id is NULL.
do_query('SELECT * FROM analyzed_wordforms WHERE lemma_id is NULL;')

In [None]:
# Analyzed wordforms whose multiple_lemmata_analysis_id is not NULL.
do_query('SELECT * FROM analyzed_wordforms WHERE multiple_lemmata_analysis_id IS NOT NULL;')

In [None]:
# Analyzed wordforms whose multiple_lemmata_analysis_id is neither 0 nor NULL
# (a comparison with NULL is never true, so NULL rows are excluded as well).
do_query('SELECT * FROM analyzed_wordforms WHERE multiple_lemmata_analysis_id != 0;')

Hypothesis: if `multiple_lemmata_analysis_id` is 0 (instead of `NULL`), the wordform does not have clitics.

In [None]:
%%time
# Find all forms of a lemma (ignore wordforms that consist of multiple lemmas)
# The LEFT JOINs keep every wordform, including those without an analysis or
# lemma; the columns of the missing tables are then NULL.

query = """
SELECT * 
FROM wordforms
LEFT JOIN analyzed_wordforms ON analyzed_wordforms.wordform_id = wordforms.wordform_id
LEFT JOIN lemmata ON analyzed_wordforms.lemma_id = lemmata.lemma_id
"""
lemmaforms = do_query(query)

In [None]:
# Display the joined wordform / analysis / lemma table.
lemmaforms

In [None]:
# All rows that belong to one example lemma (id 243876).
lemmaforms.query('lemma_id == 243876')

Issue: when you expand a lemma, you get a lot of wordforms that are only a part of a word. For example, the lemma *aankijken* (id: 219562) contains words like *aan* and *kijken*. The pos tag specifies whether such a wordform is a `prefixPart` or a `mainPart`. This is strange: why put this in the pos tag field?

The lemma also has a pos tag field that specifies that it is a verb (in this case).

Is it possible to determine which parts belong together? I don't think so... It also doesn't seem relevant, because we are mainly interested in token-level things (the parts are treated as different tokens from the perspective of ticcl).

In [None]:
# Collect the distinct lemma ids. The query uses LEFT JOINs, so wordforms
# without a lemma can carry NaN here; every NaN would become its own set
# element (NaN != NaN) and inflate the number of lemmas, so drop them first.
lemma_ids = set(lemmaforms['lemma_id'].dropna())

In [None]:
# Size of the lemma_ids set.
print(len(lemma_ids))

In [None]:
%%time
# Which lemmas are associated with a wordform?
# Per wordform_id: the number of non-null values in each other column.
lemmaforms.groupby('wordform_id').count()

In [None]:
# All rows that belong to one example wordform (id 528978).
lemmaforms.query('wordform_id == 528978')

In [None]:
%%time
# remove parts from lemmaforms
def is_part(row):
    """Return True when the row's ``part_of_speech`` is an all-uppercase tag
    that does not end in ``_CONTRACT``.

    NOTE(review): despite its name, the rows for which this returns True are
    the ones *kept* as ``complete_lemmaforms``; mixed-case tags such as
    ``prefixPart`` and ``mainPart`` yield False. Consider renaming once all
    callers can be updated.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'part_of_speech'`` (e.g. a DataFrame row).

    Returns
    -------
    bool
        False for missing (``None`` / NaN) or non-string tags.
    """
    pos_tag = row['part_of_speech']
    # Missing values may arrive as None or as NaN (a float); neither has the
    # string methods used below.
    if not isinstance(pos_tag, str):
        return False
    return pos_tag.isupper() and not pos_tag.endswith('_CONTRACT')

# Keep only the rows for which is_part() returns True.
complete_lemmaforms = lemmaforms[lemmaforms.apply(is_part, axis=1)]

In [None]:
# Display the filtered rows.
complete_lemmaforms

## Morphological analysis

The lexicon does not seem to contain morphological analyses or rules for automatically expanding words/inflections.

In [None]:
# Show every row of the morphological_operations table.
query = """
SELECT * FROM morphological_operations
"""
do_query(query)

In [None]:
# Show every row of the morphological_analyses table.
query = """
SELECT * FROM morphological_analyses
"""
do_query(query)

In [None]:
# Show every row of the part_morphological_analysis table.
query = """
SELECT * FROM part_morphological_analysis
"""
do_query(query)

## Morphosyntactic conversion

Morphosyntactic conversion: transcategorization

In [None]:
# Show every row of the conversion_rules table.
query = """
SELECT * FROM conversion_rules
"""
do_query(query)

In [None]:
# Show every row of the transcategorizations table.
query = """
SELECT * FROM transcategorizations
"""
do_query(query)

In [None]:
# Show every row of the transformsets table.
query = """
SELECT * FROM transformsets
"""
do_query(query)

## Orthographic information

From Wikipedia: "An orthography is a set of conventions for writing a language. It includes norms of spelling, hyphenation, capitalization, word breaks, emphasis, and punctuation." (https://en.wikipedia.org/wiki/Orthography)

Also not present in the lexicon.

In [None]:
# Show every row of the derivations table.
query = """
SELECT * FROM derivations
"""
do_query(query)

In [None]:
# Show every row of the pattern_applications table.
query = """
SELECT * FROM pattern_applications
"""
do_query(query)

In [None]:
# Show every row of the patterns table.
query = """
SELECT * FROM patterns
"""
do_query(query)