# Solução simbólica

## Preparação do dicionário

In [None]:
import csv
from typing import TypedDict
from tqdm import tqdm

In [None]:
class WordData(TypedDict):
    """Lexicon information attached to one word stored in the dictionary trie."""
    # Root form(s) of the word; the loader fills this from the 2nd TSV column.
    raiz: set[str]
    # Tag(s) of the word; the loader fills this from the 3rd TSV column.
    tag: set[str]
    # Non-empty '|'-separated entries of the 4th TSV column.
    features: set[str]


class Node():
    def __init__(self, head: str, data: dict | None = None, prox: dict[str, 'Node'] | None = None):
        self.head = head
        self.data = data
        self.prox = {} if prox is None else prox


class ReGra:
    """Prefix tree (trie) that maps words to their ``WordData``.

    Every node represents one prefix; a node whose ``data`` is not ``None``
    marks a stored word. Lookups of unknown (or empty) words return ``None``.
    """

    # Keys of WordData that are merged when a word is stored more than once.
    _MERGED_KEYS = ('raiz', 'tag', 'features')

    def __init__(self):
        # Top-level nodes, keyed by the first character of the words below them.
        self.nodes: dict[str, Node] = {}

    def dive(self, current_node: Node, word: str, depth: int):
        """Follow ``word`` from ``current_node`` for as long as the trie allows.

        ``depth`` is the index of the next character of ``word`` to consume.
        Returns the deepest node reached: the node of ``word`` itself when the
        whole path exists, otherwise the node of its longest existing prefix.
        """
        # Iterative walk, so long words do not consume one stack frame per character.
        while depth < len(word) and word[depth] in current_node.prox:
            current_node = current_node.prox[word[depth]]
            depth += 1

        return current_node

    def get_parent(self, word: str) -> Node | None:
        """Return the deepest existing node along ``word``.

        Returns ``None`` when ``word`` is empty or no stored word shares its
        first character.
        """
        if not word or word[0] not in self.nodes:
            return None

        return self.dive(self.nodes[word[0]], word, 1)

    def __getitem__(self, word: str):
        """Return the data stored for ``word``, or ``None`` when there is none."""
        node = self.get_parent(word)

        # The walk may have stopped early on a shorter prefix.
        if node is None or node.head != word:
            return None

        return node.data

    def __setitem__(self, idx: str, value: WordData):
        """Store ``value`` under ``idx``, merging with data already stored there.

        Raises:
            ValueError: if ``idx`` is empty (an empty word has no trie path).
        """
        if not idx:
            raise ValueError('cannot store an empty word')

        parent = self.get_parent(idx)

        # No word starts with this letter yet: create the top-level node.
        if parent is None:
            parent = Node(head=idx[0])
            self.nodes[idx[0]] = parent

        # The node for the word already exists: fill it in or merge the sets.
        if parent.head == idx:
            if parent.data is None:
                parent.data = value
            else:
                for key in self._MERGED_KEYS:
                    parent.data[key] = parent.data[key].union(value[key])

            return

        # Otherwise create any missing intermediate nodes (without data)...
        for i in range(len(parent.head), len(idx) - 1):
            child = Node(head=idx[:i + 1])
            parent.prox[idx[i]] = child
            parent = child

        # ...and finally the node that carries the word's data.
        parent.prox[idx[-1]] = Node(head=idx, data=value)

    def __repr__(self) -> str:
        """Return the indented listing of every top-level subtree."""
        return ''.join(self.build_tree(node) for node in self.nodes.values())

    def build_tree(self, current: Node, depth: int = 0):
        """Render ``current`` and its descendants, one line per node.

        Each line is prefixed by ``depth`` dashes; nodes that hold data are
        suffixed with ``' OK'``.
        """
        lines = [f"{'-' * depth} {current.head}{' OK' if current.data is not None else ''}\n"]
        lines.extend(self.build_tree(node, depth + 1) for node in current.prox.values())

        return ''.join(lines)

    def get_children(self, current: Node, max_depth: int = 10, depth: int = 0) -> list[str]:
        """List the stored words found below ``current``.

        Only descendants at most ``max_depth`` levels below ``current`` are
        visited; ``current`` itself is never included in the result.
        """
        if depth >= max_depth:
            return []

        children = []

        for node in current.prox.values():
            if node.data is not None:
                children.append(node.head)

            children += self.get_children(node, max_depth=max_depth, depth=depth + 1)

        return children

In [None]:
dicionary_path = "./portilexicon-ud.tsv"
dictionary = ReGra()

# newline='' is what the csv module asks for so quoted fields are split correctly.
# NOTE(review): the lexicon is assumed to be UTF-8 encoded - confirm against the file.
with open(dicionary_path, encoding='utf-8', newline='') as dic:
    # First pass only counts the rows so the progress bar knows its total.
    qtd_lines = sum(1 for _ in csv.reader(dic, delimiter='\t', quotechar='"'))
    dic.seek(0)

    rd = csv.reader(dic, delimiter='\t', quotechar='"')

    # tqdm wraps the reader, so the bar advances and closes on its own.
    for row in tqdm(rd, total=qtd_lines, desc='Carregando dicionário'):
        # Blank or truncated lines would otherwise fail with IndexError below.
        if len(row) < 4:
            continue

        word = row[0]
        raiz = [row[1]]
        tag = [row[2]]
        # Drop empty pieces produced by an empty features column.
        features = [x for x in row[3].split('|') if x]

        dictionary[word] = WordData(raiz=set(raiz), tag=set(tag), features=set(features))
        

## Parsing da Frase

In [None]:
def clean_text(text: str):
    """Lower-case ``text`` and delete every ',', '.', '!', '?' and ';' from it."""
    # A single translate pass removes all five punctuation marks.
    removal_table = str.maketrans('', '', ',.!?;')
    return text.lower().translate(removal_table)

In [None]:
def parsing(phrase: str, dictionary: ReGra):
    """Look up every word of ``phrase`` in ``dictionary``.

    The phrase is cleaned with ``clean_text`` and split on whitespace. Returns
    a dict keyed by word, so a word that occurs several times yields a single
    entry. Words missing from the dictionary get a placeholder entry whose
    three sets contain only 'Unknown'.
    """
    out = {}

    for word in clean_text(phrase).split():
        entry = dictionary[word]

        if entry is None:
            # A fresh placeholder per unknown word, so entries never share sets.
            entry = {'raiz': {'Unknown'}, 'tag': {'Unknown'}, 'features': {'Unknown'}}

        out[word] = entry

    return out

## Geração da Árvore Sintática

A árvore sintática é gerada através do LX-Parser. Para obter uma chave da API, siga as instruções em:

[https://portulanclarin.net/workbench/lx-parser/](https://portulanclarin.net/workbench/lx-parser/)

In [None]:
import requests

In [None]:
# Endpoint of the LX-Parser web service.
LXPARSER_WS_API_URL = 'https://portulanclarin.net/workbench/lx-parser/api/'

# The access key is kept out of the notebook, in a local '.lxparser-key' file.
LXPARSER_WS_API_KEY = ''
with open('.lxparser-key', 'r') as f:
    # Strip the trailing newline editors usually add, so it is not sent as part of the key.
    LXPARSER_WS_API_KEY = f.read().strip()

In [None]:
# Função fornecida pelos desenvolvedores do LX-Parser

class WSException(Exception):
    """Error reported by the web service.

    Built from the ``error`` object of the response: ``message`` holds the
    service's message and, for server errors, the extra detail found in
    ``errordata["data"]``.
    """

    def __init__(self, errordata):
        """``errordata`` is a dict returned by the webservice with details about the error."""
        assert isinstance(errordata, dict)
        message = errordata["message"]
        # see https://json-rpc.readthedocs.io/en/latest/exceptions.html for more info
        # about JSON-RPC error codes
        if -32099 <= errordata["code"] <= -32000:  # Server Error
            if errordata["data"]["type"] == "WebServiceException":
                message += f": {errordata['data']['message']}"
            else:
                message += f": {errordata['data']!r}"
        # Hand the text (not the exception object itself) to Exception, so that
        # ``args`` and ``repr()`` are usable and do not refer back to ``self``.
        super().__init__(message)
        self.message = message

    def __str__(self):
        return self.message

In [None]:
# Função fornecida pelos desenvolvedores do LX-Parser

def parse(text, format):
    '''
    Send ``text`` to the LX-Parser web service and return its answer.

    Arguments
        text: a string with a maximum of 2000 characters, Portuguese text, with
             the input to be processed
        format: either 'parentheses', 'table' or 'JSON'

    Returns a string or JSON object with the output according to specification in
       https://portulanclarin.net/workbench/lx-parser/

    Raises a WSException if the response carries an "error" member.
    '''

    # JSON-RPC 2.0 envelope expected by the service.
    params = {
        'text': text,
        'format': format,
        'key': LXPARSER_WS_API_KEY,
    }
    payload = {
        'method': 'parse',
        'jsonrpc': '2.0',
        'id': 0,
        'params': params,
    }

    response = requests.post(LXPARSER_WS_API_URL, json=payload)
    body = response.json()

    if "error" not in body:
        return body["result"]

    raise WSException(body["error"])

In [None]:
# Sample sentence that the following cells send through the pipeline.
frase_teste = "Cachorros quentes quentinhus da aqui"

In [None]:
# Request the parse of the sample sentence in 'parentheses' format and show the raw answer.
result_table = parse(frase_teste, format="parentheses")
print(result_table)

In [None]:
import IPython
import svgling
from nltk import Tree

# Draw one tree per line of the parser output; `tree` ends up holding the
# tree of the last line (or stays an empty list when there are no lines).
tree: Tree = []
for sentence in result_table.splitlines():
    tree = Tree.fromstring(sentence)
    drawing = svgling.draw_tree(tree)
    IPython.display.display(drawing)

### Alinhando o dicionário e o LX-Parser

Como eles utilizam símbolos diferentes, é necessário alinhar as classes antes de prosseguir

In [None]:
# Conversion table: LX-Parser symbol -> list of the dictionary tags it may correspond to.
# Symbols that are not listed here are dropped when the parse is converted.
symbol_convertion = {
    'A': ['ADJ'],
    'ART': ['DET', 'PRON'],
    'N': ['NOUN'],
    'QNT': ['PRON'],
    'ADV': ['ADV'],
    'P': ['PRON'],
    'REL': ['CCONJ', 'SCONJ'],
    'V': ['VERB'],
    'PRS': ['PRON'],
}

In [None]:
# Keep only the leaves whose LX-Parser symbol has a conversion, pairing each
# token with the list of dictionary tags it may carry.
result = []
for token, symbol in tree.pos():
    if symbol in symbol_convertion:
        result.append([token, symbol_convertion[symbol]])

lx_parsed = []

for entry in result:
    # Drop a single trailing underscore from the token, when present.
    if entry[0][-1] == '_':
        entry[0] = entry[0][:-1]

    # Tokens that are not purely alphabetic are left out.
    if not entry[0].isalpha():
        continue

    lx_parsed.append(entry)

In [None]:
# Look the LX-Parser tokens up in the dictionary and keep, per word, its tag set.
lowered_phrase = ' '.join([entry[0].lower() for entry in lx_parsed])
dict_parsed = [
    [word, data['tag']]
    for word, data in parsing(lowered_phrase, dictionary).items()
]

In [None]:
def aligning(dict_parsed, lx_parsed):
    """Merge, word by word, the dictionary tags with the LX-Parser tags.

    Both arguments are sequences of ``[word, tags]`` pairs in the same order
    (a set of tags on the dictionary side, a list on the LX-Parser side).
    Returns a list of ``[word, classes]`` where ``classes`` is:
      * the tags both sources agree on, when there are any;
      * ``{'ERROR', first LX-Parser tag}`` when the dictionary entry is 'Unknown';
      * the dictionary tags otherwise.
    Prints a message and returns ``[]`` when the lengths or the words differ.
    """
    if len(dict_parsed) != len(lx_parsed):
        print(f"Lengths don't match: {len(dict_parsed)} != {len(lx_parsed)}")
        return []

    aligned_classes = []

    for dict_entry, lx_entry in zip(dict_parsed, lx_parsed):
        dict_word, dict_tags = dict_entry[0], dict_entry[1]
        lx_word, lx_tags = lx_entry[0], lx_entry[1]

        if dict_word != lx_word.lower():
            print(f"Words don't match: {dict_word} != {lx_word}")
            return []

        classes = dict_tags.intersection(set(lx_tags))

        if len(classes) == 0:
            # No agreement: flag unknown words, otherwise trust the dictionary.
            if 'Unknown' in dict_tags:
                classes = set(['ERROR', lx_tags[0]])
            else:
                classes = dict_tags

        aligned_classes.append([lx_word, classes])

    return aligned_classes

In [None]:
# Combine the dictionary tags with the LX-Parser tags for the sample sentence.
aligned_tree = aligning(dict_parsed, lx_parsed)

In [None]:
# Show each [word, classes] pair on its own line.
for aligned_entry in aligned_tree:
    print(aligned_entry)

## Sugere as mudanças

### Correção gramatical


In [None]:
def levenshtein_distance(word1, word2):
    """Return the Levenshtein (edit) distance between ``word1`` and ``word2``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one word into the other.
    """
    # The distance is symmetric; keeping the shorter word on the columns
    # keeps the two working rows as small as possible.
    if len(word1) < len(word2):
        word1, word2 = word2, word1

    # previous[j] = distance between the processed prefix of word1 and word2[:j].
    previous = list(range(len(word2) + 1))

    for i, char1 in enumerate(word1, start=1):
        current = [i]  # turning word1[:i] into the empty string costs i deletions

        for j, char2 in enumerate(word2, start=1):
            current.append(min(
                previous[j] + 1,                      # delete char1
                current[j - 1] + 1,                   # insert char2
                previous[j - 1] + (char1 != char2),   # substitute (free on a match)
            ))

        previous = current

    return previous[-1]

In [None]:
def get_corrections(word, tag, dictionary):
    """Suggest up to ten dictionary words close to ``word``.

    Candidates are the stored words below the longest prefix of ``word`` found
    in ``dictionary``, after backing that prefix off by two characters when it
    is longer than two. Only candidates carrying ``tag`` are kept, unless
    ``tag`` is 'ANY'. The result is ordered by edit distance to ``word``.
    Returns ``[]`` when ``word`` is empty after cleaning or when no stored
    word shares its first letter.
    """
    word = clean_text(word)
    # An empty word has no trie path at all, so do not even look it up.
    p = dictionary.get_parent(word) if word else None

    print(tag)

    # Nothing in the dictionary starts like this word: no suggestions.
    if p is None:
        return []

    # Search words with up to two characters more or fewer
    if len(p.head) > 2:
        p = dictionary.get_parent(p.head[:-2])

    max_depth = max(len(word) - len(p.head) + 2, 4)
    children = dictionary.get_children(p, max_depth=max_depth)

    corrections = [c for c in children if tag == 'ANY' or tag in dictionary[c]['tag']]

    corrections.sort(key=lambda x: levenshtein_distance(word, x))
    return corrections[:10]

In [None]:
# For every word flagged with 'ERROR', collect candidate corrections: first
# restricted to the tag LX-Parser assigned, then with any tag as a fallback.
possible_corrections = {}

for token, classes in aligned_tree:
    if 'ERROR' not in classes:
        continue

    tag = list(classes - {'ERROR'})[0]
    suggestions = get_corrections(token, tag, dictionary)

    if len(suggestions) == 0:
        suggestions = get_corrections(token, 'ANY', dictionary)

    possible_corrections[token] = suggestions

In [None]:
# Show the candidate corrections gathered for each flagged word.
print(possible_corrections)

### Recorte das melhores soluções

In [None]:
def extract_details(details):
    """Group 'name=value' strings by name.

    Returns a dict mapping each name to the list of its values, in iteration
    order. Items equal to '_' are skipped.
    """
    grouped = {}

    for item in details:
        if item != '_':
            name, value = item.split('=')
            grouped.setdefault(name, []).append(value)

    return grouped

def prune_corrections(corrections, tree: Tree):
    """Keep, for each root, the corrections that best agree with the sentence.

    The features of every leaf of ``tree`` that is found in the module-level
    ``dictionary`` are pooled. Corrections are grouped by root (a word with
    several roots takes part in each of its groups). A group with a single
    word is kept as is; otherwise each word scores one point per feature
    whose first value also appears among the sentence's values for that
    feature, and only the top scorers of the group are kept.
    """
    # Pool the features of the sentence's known words.
    details = set()
    for leaf in tree.leaves():
        entry = dictionary[leaf]

        if entry is not None:
            details = details.union(entry['features'])

    # Group the candidates by root.
    same_root = {}
    for candidate in corrections:
        for root in dictionary[candidate]['raiz']:
            same_root.setdefault(root, []).append(candidate)

    characteristics = extract_details(details)
    pruned_corrections = []
    for similar_words in same_root.values():
        if len(similar_words) == 1:
            pruned_corrections.append(similar_words[0])
            continue

        score = {}
        for word in similar_words:
            word_details = extract_details(dictionary[word]['features'])

            # A feature that no sentence word carries simply scores nothing
            # (indexing characteristics directly would raise KeyError here).
            score[word] = sum(
                1
                for char, values in word_details.items()
                if values[0] in characteristics.get(char, ())
            )

        best_combination = max(score.values())

        for word in similar_words:
            if score[word] == best_combination:
                pruned_corrections.append(word)

    return pruned_corrections

In [None]:
# Show the pruned candidate list of every flagged word.
for word, corrections in possible_corrections.items():
    pruned = prune_corrections(corrections, tree)
    print(pruned)

## Define uma classe para correção

In [None]:
from corretor import Corretor

In [None]:
# Lexicon file handed to the corrector.
dicionary_path = "./portilexicon-ud.tsv"

# The LX-Parser access key lives in a local '.lxparser-key' file.
LXPARSER_WS_API_KEY = ''
with open('.lxparser-key', 'r') as f:
    # Strip the trailing newline editors usually add, so it is not sent as part of the key.
    LXPARSER_WS_API_KEY = f.read().strip()

# LX-Parser symbol -> list of the dictionary tags it may correspond to.
symbol_convertion = {
    'A': ['ADJ'],
    'ART': ['DET', 'PRON'],
    'N': ['NOUN'],
    'QNT': ['PRON'],
    'ADV': ['ADV'],
    'P': ['PRON'],
    'REL': ['CCONJ', 'SCONJ'],
    'V': ['VERB'],
    'PRS': ['PRON'],
}

In [None]:
# Build the corrector from the lexicon path, then give it the API key and the
# symbol-conversion table. (Corretor comes from the external `corretor` module.)
teste_corretor = Corretor(dicionary_path)
teste_corretor.setup_key(LXPARSER_WS_API_KEY)
teste_corretor.setup_symbols(symbol_convertion)

In [None]:
# Run the corrector on the sample sentence; the notebook displays the returned value.
teste_corretor.corrigir_texto("Cachorros quentes quentinhus da aqui")

## Avaliação da solução

In [None]:
import pandas as pd
import random
import string

In [None]:
# Load the news CSV and drop the 'id', 'title' and 'uri' columns;
# the bare `df` makes the notebook display the resulting frame.
df = pd.read_csv('./noticias.brwac.csv').drop(['id', 'title', 'uri'], axis=1)
df

In [None]:
def array(l, *args, **kwargs):
  """Return ``l`` unchanged, ignoring every other positional or keyword argument.

  NOTE(review): presumably exists so that the ``eval`` call in ``prepare_text``
  can resolve ``array(...)`` expressions embedded in the stored text - confirm.
  """
  return l

def prepare_text(df, seed=42, sample_size=150):
    """Draw a reproducible random sample of phrases from ``df['text']``.

    Each cell of the 'text' column is evaluated into a mapping with a
    'paragraphs' entry; paragraphs that are lists contribute their phrases,
    and only phrases longer than 15 characters are kept.

    Arguments
        df: object whose ``df['text'].tolist()`` yields the stored strings
        seed: value passed to ``random.seed`` (re-seeds the global generator)
        sample_size: how many phrases to draw (defaults to 150)

    Returns a list with ``sample_size`` phrases.

    Raises ValueError (from ``random.sample``) when fewer than ``sample_size``
    phrases are available.
    """
    random.seed(seed)

    # SECURITY NOTE(review): eval() executes whatever is stored in the column.
    # Only use this with a trusted file; consider a real parser for the format.
    sites_text = [eval(x) for x in df['text'].tolist()]

    texts = [
        phrase
        for site in sites_text
        for paragraph in site['paragraphs']
        if isinstance(paragraph, list)  # non-list paragraphs are ignored
        for phrase in paragraph
        if len(phrase) > 15
    ]

    return random.sample(texts, sample_size)

In [None]:
# Build the evaluation sample and display its first phrase.
texts = prepare_text(df)
texts[0]

In [None]:
def check_validity(words):
    """Return True when every word has at least 4 characters and no digit."""
    def is_invalid(word):
        return any(char.isdigit() for char in word) or len(word) < 4

    return not any(is_invalid(word) for word in words)


def corrupt_text(text, number=1, seed=42):
    """Pick ``number`` words of ``text`` and replace one character in each.

    Punctuation other than '-' is removed before the text is split into words.
    Words are drawn with ``random.choices`` until ``check_validity`` accepts
    the draw. Returns a list of ``[original word, corrupted word]`` pairs; the
    replacement is a random ASCII letter, so it may equal the original one.

    Raises RuntimeError when no valid draw is found after the retry limit.
    """
    random.seed(seed)

    # Strip punctuation, keeping hyphens
    punctuation_but_hyphen = string.punctuation.replace('-', '')
    words = text.translate(str.maketrans('', '', punctuation_but_hyphen)).split()

    attempts = 0
    while True:
        original_words = random.choices(words, k=number)
        if check_validity(original_words):
            break
        if attempts > 100:
            raise RuntimeError('Não há palavras válidas o suficiente')
        attempts += 1

    corruptions = []
    for word in original_words:
        letters = list(word)

        # The replacement letter is drawn before the position, which keeps the
        # random stream in the same order as a one-line `a[randint] = choice`.
        replacement = random.choice(string.ascii_letters)
        position = random.randint(0, len(letters) - 1)
        letters[position] = replacement

        corruptions.append([word, ''.join(letters)])

    return corruptions

In [None]:
# One corruption (a list with a single [original, corrupted] pair) per sampled text.
corruptions = {text: corrupt_text(text, number=1) for text in texts}

In [None]:
from corretor import Corretor

# Lexicon file handed to the corrector.
dicionary_path = "./portilexicon-ud.tsv"

# The LX-Parser access key lives in a local '.lxparser-key' file.
LXPARSER_WS_API_KEY = ''
with open('.lxparser-key', 'r') as f:
    # Strip the trailing newline editors usually add, so it is not sent as part of the key.
    LXPARSER_WS_API_KEY = f.read().strip()

# LX-Parser symbol -> list of the dictionary tags it may correspond to.
symbol_convertion = {
    'A': ['ADJ'],
    'ART': ['DET', 'PRON'],
    'N': ['NOUN'],
    'QNT': ['PRON'],
    'ADV': ['ADV'],
    'P': ['PRON'],
    'REL': ['CCONJ', 'SCONJ'],
    'V': ['VERB'],
    'PRS': ['PRON'],
}

In [None]:
# Corrector used for the evaluation, configured in one call with the API key and symbol table.
teste_corretor = Corretor(dicionary_path, key=LXPARSER_WS_API_KEY, symbols=symbol_convertion)

In [None]:
# Punctuation to delete before correcting: everything except '-', ',' and '.'.
removable_punctuation = string.punctuation.replace('-', '').replace(',', '').replace('.', '')
punctuation_table = str.maketrans('', '', removable_punctuation)

result = []
for text in texts:
    for original_word, corrupted_word in corruptions[text]:
        # Swap in the corrupted form at its first occurrence only.
        text_corrupt = text.replace(original_word, corrupted_word, 1)
        text_corrupt = text_corrupt.translate(punctuation_table)

        # Only the part before the first '.' is sent to the corrector.
        first_sentence = text_corrupt.split('.')[0]
        result.append(teste_corretor.corrigir_texto(first_sentence))

In [None]:
# Number of corrector answers collected.
print(len(result))

In [None]:
# Score the answers: a sample is discarded when the corrector did not report
# the corrupted word (or reported None for it); it earns a point when the
# original word is among the suggestions for the corrupted one.
total = len(result)
points = 0
for text, answer in zip(texts, result):
    print(text)
    print(answer)
    print(corruptions[text])

    original_word, corrupted_word = corruptions[text][0]

    if corrupted_word in answer.keys() and answer[corrupted_word] is not None:
        if original_word in answer[corrupted_word]:
            points += 1
    else:
        total -= 1

    print()

In [None]:
# When every sample was discarded there is nothing to average over:
# report NaN instead of failing with ZeroDivisionError.
score = points / total if total else float('nan')
print("Score:", score)
print("Total:", total)
print("Points:", points)