In [1]:
import re
import sqlite3

from schema import Schema

from lexupdater.db_handler import (
    DatabaseUpdater,
    INSERT_STMT,
    UPDATE_QUERY,
    WHERE_WORD_IN_STMT,
    ruleset_schema,
)
from lexupdater.dialect_updater import (
    map_rule_exemptions,
    parse_constraints,
    parse_exemptions,
)


In [2]:
# Display the UPDATE template imported from lexupdater.db_handler, for
# comparison with the SELECT template defined in the next cell.
UPDATE_QUERY

'UPDATE {dialect} SET nofabet = REGREPLACE(?,?,nofabet) {where_word_in_stmt};'

In [3]:
# SELECT template with two named placeholders:
#   {table_name}          - the table to read word forms from
#   {where_word_in_stmt}  - extra filter text, inserted verbatim right after
#                           the REGEXP condition (may be an empty string)
# The single `?` is left for the caller to bind when executing the query.
SELECT_QUERY = (
    "SELECT wordform "
    "FROM {table_name} "
    "WHERE REGEXP(?,nofabet) {where_word_in_stmt};"
)

In [10]:
# Lists of the words that match the search patterns of the rulesets in the
# rule files should be printed, one list per dialect.

def construct_select_queries(dialects, filter_string, query_template=None):
    """Yield one SELECT query per dialect for the entries a rule pattern covers.

    Parameters
    ----------
    dialects : iterable of str
        Table names; each one is substituted for ``{table_name}``.
    filter_string : str or None
        Extra filter substituted for ``{where_word_in_stmt}``. The default
        template already opens a WHERE clause, so a filter that starts with
        its own ``WHERE`` keyword (the form the UPDATE template expects) is
        rewritten to start with ``AND``; otherwise the result would contain
        two WHERE keywords and be invalid SQL. ``None`` is treated as "".
    query_template : str, optional
        Template with ``{table_name}`` and ``{where_word_in_stmt}``
        placeholders. Defaults to the module-level ``SELECT_QUERY``.

    Yields
    ------
    str
        The formatted query for each dialect, in input order.
    """
    if query_template is None:
        query_template = SELECT_QUERY

    # Chain the extra filter onto the template's existing WHERE clause.
    extra_filter = re.sub(
        r"^\s*WHERE\b", "AND", filter_string or "", count=1, flags=re.IGNORECASE
    )

    for dialect in dialects:
        yield query_template.format(
            table_name=dialect, where_word_in_stmt=extra_filter
        )
        
def parse_rules(rulesets, exemptions, word_table="words_tmp"):
    """Parse the rulesets and build SELECT queries for every rule.

    Largely a copy of ``construct_update_queries``; refactor so the two
    share their logic later.

    Parameters
    ----------
    rulesets : iterable of dict
        Each ruleset needs the keys "name", "areas" (the dialects) and
        "rules"; each rule needs the key "constraints".
    exemptions : list
        Passed to ``map_rule_exemptions``; the result is looked up by
        ruleset name to find the words exempt from that ruleset.
    word_table : str, optional
        Name substituted for ``{word_table}`` in ``WHERE_WORD_IN_STMT``.
        NOTE(review): the default "words_tmp" is assumed to be the word
        table that DatabaseUpdater uses - confirm against the config.

    Yields
    ------
    generator of str
        For each rule, the generator returned by
        ``construct_select_queries`` (one query per dialect). Rulesets with
        no dialects are skipped.

    Notes
    -----
    Only the SQL text is produced. The values that must be bound to the
    ``?`` placeholders when executing the queries are not yielded yet.
    """
    rule_exemptions = map_rule_exemptions(exemptions)

    for ruleset in rulesets:
        rule_name = ruleset["name"]
        rule_dialects = ruleset["areas"]
        if not rule_dialects:
            # No dialects to query for this ruleset.
            continue

        exempt_words = rule_exemptions.get(rule_name, [])
        exempt_str = parse_exemptions(exempt_words)

        for rule in ruleset["rules"]:
            constraints = rule["constraints"]
            is_constrained = bool(constraints)
            # Only the SQL fragment is needed; the bind values are unused here.
            constraint_str, _ = parse_constraints(constraints)

            if not is_constrained and not exempt_words:
                where_word_in_stmt = ""
            else:
                # Join the exemption clause with AND only when a constraint
                # clause precedes it.
                exemption_clause = (
                    f" AND {exempt_str}"
                    if is_constrained and exempt_str
                    else exempt_str
                )
                where_word_in_stmt = WHERE_WORD_IN_STMT.format(
                    word_table=word_table,
                    constraints=constraint_str,
                    exemptions=exemption_clause,
                )

            yield construct_select_queries(rule_dialects, where_word_in_stmt)

In [11]:
def create_new_ruleset(ruleset_name: str, rule_dialects: list, rule_patterns: list):
    """Bundle a name, dialects and rules into a ruleset and validate it.

    The dict has the keys "name", "areas" and "rules"; the return value is
    whatever ``ruleset_schema.validate`` returns for that dict.
    """
    candidate = {
        "name": ruleset_name,
        "areas": rule_dialects,
        "rules": rule_patterns,
    }
    return ruleset_schema.validate(candidate)


def create_new_rule_pattern(rule_pattern: str, replacement: str, constraints: list):
    """Build a single rule dict with the keys 'pattern', 'repl', 'constraints'.

    ``constraints`` is validated as a list of dicts, each holding 'field'
    (str), 'pattern' (str) and 'is_regex' (bool); the validated value is
    stored in the rule.
    """
    constraint_schema = Schema([
        {'field': str, 'pattern': str, 'is_regex': bool},
    ])
    checked_constraints = constraint_schema.validate(constraints)
    return {
        'pattern': rule_pattern,
        'repl': replacement,
        'constraints': checked_constraints,
    }

## Eksempel 
Tatt fra feilsøkingsdokumentet om [/M RNX0/-sekvensen](https://docs.google.com/document/d/1zEX49CwOC8oo8oaVLAB1s3L1wM3vFoTp7vkaRN1v3n4/edit?usp=sharing) (takk til Marie for et illustrativt eksempel!): 

    Vi må 
    * Endre den feilaktige transkripsjonen /M RNX0 (AX0)/ til /M AX0 RN (AX0)/.
    * Legge til /M AX0 RN (AX0)/-varianten for ordene som kun har /M R AX0 N (AX0)/-varianten.
    * Endre den feilaktige transkripsjonen /M AX0 RNX0 (AX0)/ til /M AX0 RN (AX0)/. 

---
TIPS:
Jeg anbefaler å bruke regex101.com til selve utviklingen av regex-mønstrene, og å tikke av `Flavor` "Python2.7" og `Function` "substitution" i menyen til venstre. F.eks. https://regex101.com/r/Dyhrob/1

In [12]:
# --- Corrections -----------------------------------------------------------

# Change the erroneous transcription /M RNX0 (AX0)/ to /M AX0 RN (AX0)/.
correction_rule = create_new_rule_pattern(
    r"\bM RNX0( AX0|\b)?", r"M AX0 RN\1", []
)

# Change the erroneous transcription /M AX0 RNX0 (AX0)/ to /M AX0 RN (AX0)/.
correction_rule2 = create_new_rule_pattern(
    r"\bM AX0 RNX0( AX0|\b)?", r"M AX0 RN\1", []
)

# Create a new ruleset for the corrections.
corrections_ruleset = create_new_ruleset(
    "corrections",
    ["e_spoken", "e_written"],
    [correction_rule, correction_rule2],
)


# --- Additions -------------------------------------------------------------

# Add the /M AX0 RN (AX0)/ variant for the words that only have the
# /M R AX0 N (AX0)/ variant.
addition_rule = create_new_rule_pattern(
    r"\bM R AX0 N( AX0|\b)?", r"M AX0 RN\1", []
)

# Create a new ruleset for the additions.
additions_ruleset = create_new_ruleset(
    "additions",
    ["e_spoken", "e_written"],
    [addition_rule],
)

In [14]:
# parse_rules yields one batch of queries per rule; each batch holds one
# SELECT statement per dialect. Print them all, one per line.
queries = parse_rules([corrections_ruleset, additions_ruleset], [])
for dialect_queries in queries:
    for sql in dialect_queries:
        print(sql)

SELECT wordform FROM e_spoken WHERE REGEXP(?,nofabet) ;
SELECT wordform FROM e_written WHERE REGEXP(?,nofabet) ;
SELECT wordform FROM e_spoken WHERE REGEXP(?,nofabet) ;
SELECT wordform FROM e_written WHERE REGEXP(?,nofabet) ;
SELECT wordform FROM e_spoken WHERE REGEXP(?,nofabet) ;
SELECT wordform FROM e_written WHERE REGEXP(?,nofabet) ;


In [15]:
# TODO: split up the parse_rules function 
# TODO: apply queries on the database and look at the results 