In [None]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# reloaded automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm

In [None]:
import logging

# Configure the root logger: timestamp, process id, padded level name,
# logger name and line number, then a tab and the message.
logging.basicConfig(
    format="%(asctime)s [%(process)d] %(levelname)-8s "
           "%(name)s,%(lineno)s\t%(message)s",
)

# Set the level separately from basicConfig so it is applied even when the
# root logger already has handlers (basicConfig is a no-op in that case).
logging.getLogger().setLevel(logging.INFO)

In [None]:
# Read the database connection settings from ../ENVVARS.txt (one KEY=VALUE
# pair per line) and export them as environment variables.
import os

with open('../ENVVARS.txt') as f:
    for line in f:
        # Split on the first '=' only: values that themselves contain '='
        # (e.g. a password) are kept intact instead of being silently dropped.
        key, sep, value = line.partition('=')
        key = key.strip()
        # Lines without '=' or without a key are ignored.
        if sep and key:
            os.environ[key] = value.strip()

# Keep the database name read from the file available as 'lexicon_name'.
os.environ['lexicon_name'] = os.environ['dbname']

In [None]:
# Name of the database to connect to; it is also written to the 'dbname'
# environment variable, replacing any value set there before.
db_name = 'ticclat'
os.environ.update(dbname=db_name)

In [None]:
from ticclat.ticclat_schema import Lexicon, Wordform, Anahash

from ticclat.dbutils import get_session, session_scope

# Create the object that session_scope() uses to open database sessions
# (presumably a SQLAlchemy session factory -- confirm in ticclat.dbutils).
# Credentials and database name come from the environment.
Session = get_session(
    os.environ['user'],
    os.environ['password'],
    os.environ['dbname'],
)

In [None]:
from sqlalchemy import select, text
from sqlalchemy.sql import func, distinct, and_, desc

from ticclat.ticclat_schema import MorphologicalParadigm, TextAttestation, Document, corpusId_x_documentId, Corpus

# Collect the id of every document linked to the 'SoNaR-500' corpus as a
# list of {'doc_id': <id>} dicts.
update_data = []

with session_scope(Session) as session:
    # corpus <-> document link table joined to both sides
    corpus_documents = corpusId_x_documentId.join(Corpus).join(Document)
    doc_id_query = (
        select([Document.document_id])
        .select_from(corpus_documents)
        .where(Corpus.name == 'SoNaR-500')
    )
    update_data.extend(
        {'doc_id': doc_row[0]}
        for doc_row in session.execute(doc_id_query).fetchall()
    )
update_data

In [None]:
from sqlalchemy import update
from sqlalchemy.sql.expression import bindparam

from ticclat.ticclat_schema import Lexicon, Wordform, Anahash, Document, Corpus

# Set year_from/year_to to 1950/2010 for every document in the 'SoNaR-500' corpus.
with session_scope(Session) as session:
    # Ids of all documents linked to the corpus.
    sonar_doc_ids = (
        select([Document.document_id])
        .select_from(corpusId_x_documentId.join(Corpus).join(Document))
        .where(Corpus.name == 'SoNaR-500')
    )
    # The id query is wrapped in an aliased derived table ('test') and
    # selected from again -- presumably to work around MySQL refusing to
    # select from the table that is being updated; TODO confirm.
    wrapped_ids = select('*').select_from(sonar_doc_ids.alias('test')).as_scalar()

    set_years = (
        update(Document)
        .values(year_from=1950, year_to=2010)
        .where(Document.document_id.in_(wrapped_ids))
    )
    print(set_years)
    session.execute(set_years)

In [None]:
# Reset year_from/year_to to NULL for every document in the 'SoNaR-500' corpus.
with session_scope(Session) as session:
    corpus_docs = corpusId_x_documentId.join(Corpus).join(Document)
    ids_in_corpus = select([Document.document_id]).select_from(corpus_docs)
    ids_in_corpus = ids_in_corpus.where(Corpus.name == 'SoNaR-500')

    # Select from an aliased copy ('test') of the id query rather than using
    # it directly -- presumably a workaround for MySQL not allowing a
    # subquery on the table being updated; TODO confirm.
    id_subquery = select('*').select_from(ids_in_corpus.alias('test')).as_scalar()

    clear_years = update(Document).values(year_from=None, year_to=None)
    clear_years = clear_years.where(Document.document_id.in_(id_subquery))
    print(clear_years)
    session.execute(clear_years)

In [None]:
%%time
# replace _ in wordforms with *
import MySQLdb

with session_scope(Session) as session:
    q = select([Wordform]).where(Wordform.wordform.like('%\_%'))
    print(q)
    r = session.execute(q).fetchall()
    for row in r:
        #print(row)
        try:
            q = update(Wordform).values(wordform=row['wordform'].replace('_', '*'),
                                        wordform_lowercase=row['wordform_lowercase'].replace('_', '*')).where(Wordform.wordform_id == row['wordform_id'])
            session.execute(q)
        except (MySQLdb._exceptions.IntegrityError, IntegrityError):
            pass
        

In [None]:
# Replace spaces with '_' in the wordform and wordform_lowercase columns.
# The query is limited to 10 wordforms, so only that many rows are updated
# per run.
with session_scope(Session) as session:
    spaced_query = select([Wordform]).where(Wordform.wordform.like('% %')).limit(10)
    print(spaced_query)
    for wf_row in session.execute(spaced_query).fetchall():
        print(wf_row)
        new_values = {
            'wordform': wf_row['wordform'].replace(' ', '_'),
            'wordform_lowercase': wf_row['wordform_lowercase'].replace(' ', '_'),
        }
        stmt = (
            update(Wordform)
            .values(**new_values)
            .where(Wordform.wordform_id == wf_row['wordform_id'])
        )
        session.execute(stmt)

In [None]:
# Small example frame for trying out the space replacement on a pandas
# column, including a missing value. np.nan is used because the np.NaN
# alias was removed in NumPy 2.0.
wfs = ['space bla', 'an other', np.nan]
df = pd.DataFrame(wfs, columns=['wordform'])
df

In [None]:
# Replace spaces with '_' in the wordform column; missing values stay missing.
# regex=False makes this a literal replacement regardless of the pandas
# version (the default of `regex` changed in pandas 2.0).
df['wordform'] = df['wordform'].str.replace(' ', '_', regex=False)
df

In [None]:
# delete corpus
