In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before
# running a cell, so edits to imported project code are picked up live.
%load_ext autoreload
%autoreload 2

In [None]:
import logging

# Log line layout: timestamp, [process id], level padded to 8 characters,
# logger name and line number, then a tab and the message.
LOG_FORMAT = "%(asctime)s [%(process)d] %(levelname)-8s %(name)s,%(lineno)s\t%(message)s"

logging.basicConfig(format=LOG_FORMAT)

# The level is set on the root logger directly, so it applies even when
# basicConfig() does nothing because handlers were already installed.
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from tqdm import tqdm_notebook as tqdm

In [None]:
# Read the database connection settings from ENVVARS.txt (one KEY=VALUE pair
# per line) and export them as environment variables.
import os

with open('ENVVARS.txt') as f:
    for line in f:
        # Split on the first '=' only: a value that itself contains '='
        # (e.g. a password) is kept intact instead of being silently dropped.
        key, sep, value = line.partition('=')
        key = key.strip()
        # Lines without '=' (blank lines, stray text) or without a key are ignored.
        if sep and key:
            os.environ[key] = value.strip()

In [None]:
# Database to query. The test database is used here; change the value to
# 'ticclat' to run against the full database.
db_name = 'ticclat_test'
os.environ.update(dbname=db_name)

In [None]:
from ticclat.ticclat_schema import Lexicon, Wordform, Anahash, Document, Corpus, WordformLink, WordformLinkSource, lexical_source_wordform

from ticclat.dbutils import get_session, session_scope

# Build the session factory from the connection settings stored in the
# environment (user, password and database name, in that order).
Session = get_session(*(os.environ[key] for key in ('user', 'password', 'dbname')))

In [None]:
%%time

# List the wordforms that occur in two or more lexica.
from sqlalchemy import select
from sqlalchemy import text
from sqlalchemy.sql import func

with session_scope(Session) as session:
    # Inner query: one row per wordform with the number of joined
    # lexical_source_wordform rows, exposed as 'num_lexicons'.
    lexicon_count = func.count('lexicon_id').label('num_lexicons')
    wordform_lexica = lexical_source_wordform.join(Wordform)
    per_wordform = (
        select([Wordform, lexicon_count])
        .select_from(wordform_lexica)
        .group_by(Wordform.wordform_id)
    )

    # Outer query: keep only the wordforms counted at least twice.
    query = (
        select(['*'])
        .select_from(per_wordform.alias())
        .where(text('num_lexicons >= 2'))
    )
    print(query)

    for row in session.execute(query):
        print(row)
        print(row['wordform'], row['num_lexicons'])
        print()

In [None]:
# Select wordforms that occur in at least 2 lexica that are vocabularies
# (so the lexica only contain correct wordforms).
from sqlalchemy import select
from sqlalchemy import text
from sqlalchemy.sql import func

with session_scope(Session) as session:
    # Inner query: per wordform, count the joined rows that belong to a
    # lexicon flagged as vocabulary.
    # `== True` is intentional: it builds a SQL comparison on the column.
    subq = select([Wordform, func.count('lexicon_id').label('num_lexicons')]) \
        .select_from(lexical_source_wordform.join(Wordform).join(Lexicon)) \
        .where(Lexicon.vocabulary == True) \
        .group_by(Wordform.wordform_id)

    # The textual filter must be wrapped in text(): SQLAlchemy 1.3+ raises
    # ArgumentError when a plain string is passed to where().
    q = select(['*']).select_from(subq.alias()).where(text('num_lexicons > 1'))
    print(q)

    r = session.execute(q)
    for row in r.fetchall():
        print(row)
        print(row['wordform'], row['num_lexicons'])
        print()

In [None]:
%%time
# List every distinct (wordform id, wordform, corpus name) combination.
from sqlalchemy import select
from ticclat.ticclat_schema import Lexicon, Wordform, Anahash, Document, Corpus, \
    WordformLink, WordformLinkSource, lexical_source_wordform, corpusId_x_documentId, \
    TextAttestation

with session_scope(Session) as session:
    # Join path: corpus -> corpus/document link -> document -> attestation -> wordform.
    corpus_to_wordform = (
        Corpus.__table__
        .join(corpusId_x_documentId)
        .join(Document)
        .join(TextAttestation)
        .join(Wordform)
    )
    columns = [Wordform.wordform_id, Wordform.wordform, Corpus.name]
    query = select(columns).select_from(corpus_to_wordform).distinct()

    for row in session.execute(query):
        print(row)
    

In [None]:
%%time
# Count the distinct wordforms attested in the corpus called `name`.
from sqlalchemy import select
from sqlalchemy.sql import func, distinct

from ticclat.ticclat_schema import Lexicon, Wordform, Anahash, Document, Corpus, \
    WordformLink, WordformLinkSource, lexical_source_wordform, corpusId_x_documentId, \
    TextAttestation

name = 'SoNaR-500'

with session_scope(Session) as session:
    unique_wordforms = func.count(distinct(Wordform.wordform_id))
    corpus_to_wordform = (
        Corpus.__table__
        .join(corpusId_x_documentId)
        .join(Document)
        .join(TextAttestation)
        .join(Wordform)
    )
    query = (
        select([unique_wordforms])
        .select_from(corpus_to_wordform)
        .where(Corpus.name == name)
    )

    for row in session.execute(query):
        print(row)

In [None]:
# Print the wordforms linked to the lexicon called `name`.
name = 'l2'

with session_scope(Session) as session:
    lexicon_to_wordform = (
        Lexicon.__table__
        .join(lexical_source_wordform)
        .join(Wordform)
    )
    query = (
        select([Wordform.wordform])
        .select_from(lexicon_to_wordform)
        .where(Lexicon.lexicon_name == name)
    )

    for row in session.execute(query):
        print(row)

In [None]:
# Select the wordforms that occur both in a given lexicon and in a given corpus.
from sqlalchemy.sql import intersect, and_

# MySQL does not have INTERSECT, so instead of intersecting two selects the
# lexicon and corpus conditions are combined in one joined query.
with session_scope(Session) as session:
    name1 = 'l2'
    name2 = 'corpus1'

    # Wordform joined to its lexica on one side and, through its text
    # attestations and documents, to its corpora on the other.
    lexicon_and_corpus = (
        Wordform.__table__
        .join(lexical_source_wordform)
        .join(Lexicon)
        .join(TextAttestation, TextAttestation.wordform_id==Wordform.wordform_id)
        .join(Document)
        .join(corpusId_x_documentId)
        .join(Corpus)
    )
    q1 = (
        select([Wordform])
        .select_from(lexicon_and_corpus)
        .where(and_(Lexicon.lexicon_name == name1, Corpus.name == name2))
        .distinct()
    )

    print(q1)

    r = session.execute(q1).fetchall()
    print(r)
    



In [None]:
%%time
# Wordforms per document in a given corpus.

from sqlalchemy import select
from sqlalchemy.sql import func, distinct

from ticclat.ticclat_schema import Lexicon, Wordform, Anahash, Document, Corpus, \
    WordformLink, WordformLinkSource, lexical_source_wordform, corpusId_x_documentId, \
    TextAttestation

corpus_name = 'corpus1'

with session_scope(Session) as session:
    corpus_to_wordform = (
        Corpus.__table__
        .join(corpusId_x_documentId)
        .join(Document)
        .join(TextAttestation)
        .join(Wordform)
    )
    # Grouping on (document title, wordform id) yields one row per pair.
    query = (
        select([Wordform, Document.title])
        .select_from(corpus_to_wordform)
        .where(Corpus.name == corpus_name)
        .group_by(Document.title, Wordform.wordform_id)
    )
    rows = session.execute(query).fetchall()
    print(rows)

In [None]:
%%time
# Number of distinct wordforms per document title in a given corpus,
# loaded into the DataFrame `wf_doc` (columns: title, tot_freq).

from sqlalchemy import select
from sqlalchemy.sql import func, distinct

from ticclat.ticclat_schema import Lexicon, Wordform, Anahash, Document, Corpus, \
    WordformLink, WordformLinkSource, lexical_source_wordform, corpusId_x_documentId, \
    TextAttestation

corpus_name = 'SoNaR-500'

with session_scope(Session) as session:
    total_wordforms = func.count(distinct(Wordform.wordform_id)).label('tot_freq')
    corpus_to_wordform = (
        Corpus.__table__
        .join(corpusId_x_documentId)
        .join(Document)
        .join(TextAttestation)
        .join(Wordform)
    )
    query = (
        select([Document.title, total_wordforms])
        .select_from(corpus_to_wordform)
        .where(Corpus.name == corpus_name)
        .group_by(Document.title)
    )
    print(query)
    # Let pandas run the query on the session's connection source.
    wf_doc = pd.read_sql(query, session.bind)
print(wf_doc)

In [None]:
# Index the per-document counts by title. The guard keeps this cell
# re-runnable: after the first run 'title' is the index, not a column, and an
# unconditional set_index('title') would raise KeyError.
if 'title' in wf_doc.columns:
    wf_doc = wf_doc.set_index('title')
wf_doc

In [None]:
%%time
# Number of distinct wordforms per document title in a given corpus that are
# also linked to a given lexicon, loaded into the DataFrame `wf_l_doc`
# (columns: title, lexicon_freq).

from sqlalchemy import select
from sqlalchemy.sql import func, distinct, and_

from ticclat.ticclat_schema import Lexicon, Wordform, Anahash, Document, Corpus, \
    WordformLink, WordformLinkSource, lexical_source_wordform, corpusId_x_documentId, \
    TextAttestation

corpus_name = 'SoNaR-500'
lexicon_name = 'GB95-05_002.csv.alltokens.utf8.nopunct'

with session_scope(Session) as session:
    lexicon_wordforms = func.count(distinct(Wordform.wordform_id)).label('lexicon_freq')
    corpus_to_lexicon = (
        Corpus.__table__
        .join(corpusId_x_documentId)
        .join(Document)
        .join(TextAttestation)
        .join(Wordform)
        .join(lexical_source_wordform)
        .join(Lexicon)
    )
    in_corpus_and_lexicon = and_(Corpus.name == corpus_name, Lexicon.lexicon_name == lexicon_name)
    query = (
        select([Document.title, lexicon_wordforms])
        .select_from(corpus_to_lexicon)
        .where(in_corpus_and_lexicon)
        .group_by(Document.title)
    )
    print(query)
    # Let pandas run the query on the session's connection source.
    wf_l_doc = pd.read_sql(query, session.bind)
print(wf_l_doc)

In [None]:
# Index the in-lexicon counts by title. The guard keeps this cell
# re-runnable: after the first run 'title' is the index, not a column, and an
# unconditional set_index('title') would raise KeyError.
if 'title' in wf_l_doc.columns:
    wf_l_doc = wf_l_doc.set_index('title')
wf_l_doc

In [None]:
# Put the total counts and the in-lexicon counts side by side as columns.
count_frames = [wf_doc, wf_l_doc]
data = pd.concat(count_frames, axis='columns')
data

In [None]:
# Share of each document's wordforms that also occur in the lexicon, in percent.
data['%_lexicon_wordforms'] = data['lexicon_freq'].div(data['tot_freq']).mul(100)

In [None]:
# Display the combined table, including the '%_lexicon_wordforms' column.
data

In [None]:
from tabulate import tabulate

# Print the overlap table in tabulate's "github" table format.
column_headers = ['text_type', '#wordforms', '#wordforms in GB1995/2005', '%overlap']
markdown_table = tabulate(data, headers=column_headers, tablefmt="github")
print(markdown_table)

In [None]:
%%time
# List the lexicons with the number of distinct wordforms in each lexicon.
from sqlalchemy import select
from sqlalchemy.sql import func, distinct, and_

from ticclat.ticclat_schema import Lexicon, Wordform, Anahash, Document, Corpus, \
    WordformLink, WordformLinkSource, lexical_source_wordform, corpusId_x_documentId, \
    TextAttestation

with session_scope(Session) as session:
    wordform_count = func.count(distinct(Wordform.wordform_id)).label('num_wordforms')
    wordform_to_lexicon = (
        Wordform.__table__
        .join(lexical_source_wordform)
        .join(Lexicon)
    )
    query = (
        select([Lexicon.lexicon_name, wordform_count])
        .select_from(wordform_to_lexicon)
        .group_by(Lexicon.lexicon_name)
    )
    print(query)
    rows = session.execute(query).fetchall()
    print(rows)

In [None]:
%%time
# Anahashes shared by more than one wordform, ordered by the number of
# wordforms (descending); only the top row is printed.
from sqlalchemy import select
from sqlalchemy import text
from sqlalchemy.sql import func, desc

with session_scope(Session) as session:
    # Inner query: one row per anahash with the number of joined wordform rows.
    subq = select([Anahash, func.count('wordform_id').label('num_wf')]) \
        .select_from(Anahash.__table__.join(Wordform)) \
        .group_by(Anahash.anahash_id)
    q = select(['*']).select_from(subq.alias()).where(text('num_wf > 1')).order_by(desc('num_wf'))
    print(q)

    # first() fetches a single row and closes the result, instead of
    # materialising every row with fetchall() just to print the first one.
    top_row = session.execute(q).first()
    if top_row is not None:
        print(top_row)
       

In [None]:
%%time
# Load the integer confusion values (the part of each line before '#') from
# the confusion file into the list `cfs`.
import os

# The default is a machine-specific path; set the TICCLAT_CONFUSION_FILE
# environment variable to read the file from another location.
cf_file = os.environ.get(
    'TICCLAT_CONFUSION_FILE',
    '/home/jvdzwaan/data/ticclat/ticcl/nld.aspell.dict.c20.d2.confusion',
)

cfs = []

with open(cf_file) as f:
    for line in f:
        # Tolerate blank lines (e.g. an empty last line) instead of crashing.
        if not line.strip():
            continue
        # Cut at the first '#' only, so a line whose remainder contains
        # another '#' no longer fails on tuple unpacking.
        cf = line.split('#', 1)[0]
        cfs.append(int(cf))
print(len(cfs))

In [None]:
%%time
# Given a word, find all words in the db that are 'close' (1 character
# confusion difference). This cell is the first step: it looks up the word
# and stores its anahash value in `anahash`.

from sqlalchemy import select
from sqlalchemy import text
from sqlalchemy.sql import func, desc

word = 'koelkast'

with session_scope(Session) as session:
    q = select([Wordform, Anahash]).select_from(Wordform.__table__.join(Anahash)).where(Wordform.wordform == word)

    r = session.execute(q)
    wf = r.fetchone()
    # fetchone() returns None when nothing matches; fail with a clear message
    # instead of an opaque TypeError on the subscript below.
    if wf is None:
        raise LookupError('wordform {!r} not found in the database'.format(word))
    anahash = wf['anahash']
    print(wf, anahash)

In [None]:
%%time
# Collect every (wordform, anahash) row whose anahash value equals the
# looked-up `anahash` plus one of the confusion values in `cfs`.
results = []
with session_scope(Session) as session:
    # The select/join part is identical for every confusion value: build it once.
    wordform_with_anahash = select([Wordform, Anahash]).select_from(Wordform.__table__.join(Anahash))
    for confusion_value in cfs:
        candidate = anahash + confusion_value
        matches = session.execute(wordform_with_anahash.where(Anahash.anahash == candidate))
        results.extend(matches.fetchall())

In [None]:
# Number of rows collected in `results`.
print(len(results))

In [None]:
# Show a single collected row as a sample.
# NOTE(review): index 343 looks arbitrary; this raises IndexError when
# `results` holds fewer than 344 rows.
print(results[343])

In [None]:
%%time
# For one wordform in one corpus, print per publication year the document
# frequency (count of documents) and the term frequency (sum of the
# attestation frequencies).
# Seems useful to optionally select a corpus or corpora
from ticclat.ticclat_schema import TextAttestation, corpusId_x_documentId

from sqlalchemy import select
from sqlalchemy import text
from sqlalchemy.sql import func, desc, and_

word = 'wf2'
corpus_name = 'corpus1'

with session_scope(Session) as session:
    document_frequency = func.count(Document.document_id).label('document_frequency')
    term_frequency = func.sum(TextAttestation.frequency).label('term_frequency')

    # The first two joins spell out their ON clauses; the rest are left to
    # SQLAlchemy to derive.
    corpus_to_wordform = (
        Corpus.__table__
        .join(corpusId_x_documentId, Corpus.corpus_id == corpusId_x_documentId.c.corpus_id)
        .join(Document, Document.document_id == corpusId_x_documentId.c.document_id)
        .join(TextAttestation)
        .join(Wordform)
    )
    query = (
        select([Wordform.wordform_id, Wordform.wordform, Document.pub_year, document_frequency, term_frequency])
        .select_from(corpus_to_wordform)
        .where(and_(Wordform.wordform == word, Corpus.name == corpus_name))
        .group_by(Document.pub_year, Wordform.wordform, Wordform.wordform_id)
    )
    print(query)

    for row in session.execute(query):
        print(row)


In [None]:
%%time
# For one wordform, print per corpus in which it occurs the document
# frequency (count of documents) and the term frequency (sum of the
# attestation frequencies).

from sqlalchemy import select
from sqlalchemy import text
from sqlalchemy.sql import func, desc

word = 'wf2'

with session_scope(Session) as session:
    document_frequency = func.count(Document.document_id).label('document_frequency')
    term_frequency = func.sum(TextAttestation.frequency).label('term_frequency')

    # The first two joins spell out their ON clauses; the rest are left to
    # SQLAlchemy to derive.
    corpus_to_wordform = (
        Corpus.__table__
        .join(corpusId_x_documentId, Corpus.corpus_id == corpusId_x_documentId.c.corpus_id)
        .join(Document, Document.document_id == corpusId_x_documentId.c.document_id)
        .join(TextAttestation)
        .join(Wordform)
    )
    query = (
        select([Wordform.wordform_id, Wordform.wordform, Corpus.name, document_frequency, term_frequency])
        .select_from(corpus_to_wordform)
        .where(Wordform.wordform == word)
        .group_by(Corpus.name, Wordform.wordform, Wordform.wordform_id)
    )
    print(query)

    for row in session.execute(query):
        print(row)