# Solução simbólica

## Preparação do dicionário

In [None]:
import csv
from typing import TypedDict
from tqdm import tqdm

In [None]:
class WordData(TypedDict):
    """Lexicon information attached to one word stored in the dictionary tree."""
    # Values read from the second lexicon column (presumably the lemma/root — confirm)
    raiz: set[str]
    # Values read from the third lexicon column (grammatical class tags)
    grammar_class: set[str]
    # Non-empty items of the '|'-separated fourth lexicon column
    morphemes: set[str]


class Node():
    def __init__(self, head: str, data: dict | None = None, prox: dict[str, 'Node'] | None = None):
        self.head = head
        self.data = data
        self.prox = {} if prox is None else prox


class ReGra:
    """Prefix tree (trie) of words, one character per level.

    Every node's ``head`` is the prefix spelled by the path from the top level
    down to it. Nodes that correspond to a stored word carry that word's data;
    nodes that exist only as part of a longer word have ``data`` set to None.
    """

    def __init__(self):
        # Top-level nodes, keyed by the first character of the stored words
        self.nodes: dict[str, Node] = {}

    def dive(self, current_node: Node, word: str, depth: int) -> Node:
        """Follow ``word`` downwards from ``current_node`` as far as the tree goes.

        Args:
            current_node: node to start from.
            word: word whose characters select the children to follow.
            depth: index in ``word`` of the next character to follow.

        Returns:
            The deepest node reached: the node of ``word`` itself when the
            whole path exists, otherwise the last node before the path breaks.
        """
        # Iterative on purpose: a very long input cannot hit the recursion limit
        while depth < len(word):
            child = current_node.prox.get(word[depth])
            if child is None:
                break
            current_node = child
            depth += 1

        return current_node

    def get_parent(self, word: str) -> Node | None:
        """Return the deepest existing node on the path of ``word``.

        The result is the node of ``word`` itself when that path exists.
        Returns None when ``word`` is empty or its first character has no
        top-level node.
        """
        if not word:
            return None

        root = self.nodes.get(word[0])
        if root is None:
            return None

        return self.dive(root, word, 1)

    def __getitem__(self, word: str):
        """Return the data stored for ``word``, or None when it is not stored.

        None is also returned for the empty string and for prefixes that exist
        in the tree only as part of longer words.
        """
        node = self.get_parent(word)

        if node is None or node.head != word:
            return None

        return node.data

    def __setitem__(self, idx: str, value: WordData):
        """Store ``value`` for the word ``idx``, merging with existing data.

        Missing nodes on the path are created without data. If the word
        already holds data, each of its three sets ('raiz', 'grammar_class',
        'morphemes') is replaced by the union with the matching set in
        ``value``. The first ``value`` stored for a word is kept by reference,
        so later merges rebind the keys of that same dict.

        Raises:
            IndexError: if ``idx`` is empty (same exception type as before,
                now raised with an explicit message).
        """
        if not idx:
            raise IndexError("cannot store an empty word")

        root = self.nodes.get(idx[0])
        if root is None:
            root = Node(head=idx[0])
            self.nodes[idx[0]] = root

        # Deepest node that already exists on the path of idx
        node = self.dive(root, idx, 1)

        # Create whatever is still missing; the range is empty when the node
        # for idx already exists
        for i in range(len(node.head), len(idx)):
            child = Node(head=idx[:i + 1])
            node.prox[idx[i]] = child
            node = child

        if node.data is None:
            node.data = value
            return

        for key in ('raiz', 'grammar_class', 'morphemes'):
            node.data[key] = node.data[key].union(value[key])

    def __repr__(self) -> str:
        """Return the text rendering of every top-level subtree, concatenated."""
        return ''.join(self.build_tree(node) for node in self.nodes.values())

    def build_tree(self, current: Node, depth: int = 0):
        """Render ``current`` and its descendants, one node per line.

        Each line is ``depth`` dashes, a space and the node's head, followed
        by ' OK' when the node holds data.
        """
        lines = [f"{'-' * depth} {current.head}{' OK' if current.data is not None else ''}\n"]
        lines.extend(self.build_tree(node, depth + 1) for node in current.prox.values())

        return ''.join(lines)

    def get_children(self, current: Node, max_depth: int = 10, depth: int = 0) -> list[str]:
        """List the heads of the descendants of ``current`` that hold data.

        Descendants are visited depth-first, at most ``max_depth`` levels
        below ``current``; ``current`` itself is never included.

        Args:
            current: node whose descendants are listed.
            max_depth: number of levels to descend.
            depth: current recursion level; callers normally leave it at 0.
        """
        if depth >= max_depth:
            return []

        children = []

        for node in current.prox.values():
            if node.data is not None:
                children.append(node.head)

            children.extend(self.get_children(node, max_depth=max_depth, depth=depth + 1))

        return children

In [None]:
dicionary_path = "./portilexicon-ud.tsv"
dictionary = ReGra()

# Decode explicitly as UTF-8 rather than the platform default so accented
# words load the same on every OS (assumes the lexicon file is UTF-8 — confirm).
# newline='' is what the csv module asks for when it is given a file object.
with open(dicionary_path, encoding='utf-8', newline='') as dic:
    # First pass only counts the rows so the progress bar can show a total
    qtd_lines = sum(1 for _ in csv.reader(dic, delimiter='\t', quotechar='"'))
    dic.seek(0)

    # Second pass uses a fresh reader instead of re-using the exhausted one
    rd = csv.reader(dic, delimiter='\t', quotechar='"')

    # The context manager closes the bar even when a row fails to load
    with tqdm(total=qtd_lines, desc='Carregando dicionário') as progress:
        for row in rd:
            progress.update(1)

            # csv.reader yields [] for blank lines; there is nothing to store
            if not row:
                continue

            word = row[0]
            raiz = [row[1]]
            grammar_class = [row[2]]
            morphemes = [x for x in row[3].split('|') if x]

            dictionary[word] = WordData(raiz=set(raiz), grammar_class=set(grammar_class), morphemes=set(morphemes))
        

## Parsing da Frase

In [None]:
def clean_text(text: str):
    """Return ``text`` in lower case with the marks , . ! ? ; removed."""
    # Deletion table: a single pass drops all five punctuation characters
    dropped_marks = str.maketrans('', '', ',.!?;')
    return text.lower().translate(dropped_marks)

In [None]:
def parsing(phrase: str, dictionary: ReGra):
    """Look up every word of ``phrase`` in ``dictionary``.

    The phrase is normalised with ``clean_text`` and split on whitespace.

    Returns:
        A dict keyed by word (a repeated word therefore appears only once)
        whose values are the dictionary entries; words the dictionary does not
        know get an entry whose three sets contain only 'Unknown'.
    """
    tagged = {}

    for token in clean_text(phrase).split():
        entry = dictionary[token]

        if entry is None:
            entry = {'raiz': {'Unknown'}, 'grammar_class': {'Unknown'}, 'morphemes': {'Unknown'}}

        tagged[token] = entry

    return tagged

## Geração da Árvore Sintática

A árvore sintática é gerada através do LX-Parser. Para obter uma chave da API, siga as instruções em:

[https://portulanclarin.net/workbench/lx-parser/](https://portulanclarin.net/workbench/lx-parser/)

In [None]:
import requests

In [None]:
# Endpoint of the LX-Parser web service
LXPARSER_WS_API_URL = 'https://portulanclarin.net/workbench/lx-parser/api/'

# The access key lives outside the notebook, in a local '.lxparser-key' file.
# strip() removes the trailing newline that editors usually append, which
# would otherwise be sent to the service as part of the key.
LXPARSER_WS_API_KEY = ''
with open('.lxparser-key', 'r') as f:
    LXPARSER_WS_API_KEY = f.read().strip()

In [None]:
# Classe de exceção fornecida pelos desenvolvedores do LX-Parser

class WSException(Exception):
    """Error reported by the web service.

    ``message`` holds the human-readable description; it is also what
    ``str(exc)`` returns and the single item of ``exc.args``.
    """

    def __init__(self, errordata):
        """Build the exception from the error object returned by the webservice.

        Args:
            errordata: dict with the details of the error. ``message`` and
                ``code`` are required; ``data`` is optional.

        Raises:
            TypeError: if ``errordata`` is not a dict.
        """
        # Explicit check instead of `assert`, which is stripped under -O
        if not isinstance(errordata, dict):
            raise TypeError(f"errordata must be a dict, got {type(errordata).__name__}")

        message = errordata["message"]
        # see https://json-rpc.readthedocs.io/en/latest/exceptions.html for more info
        # about JSON-RPC error codes
        if -32099 <= errordata["code"] <= -32000:  # Server Error
            # `data` is an optional member of a JSON-RPC error object, so its
            # absence (or an unexpected shape) must not raise while the
            # exception is being built
            data = errordata.get("data")
            if (isinstance(data, dict)
                    and data.get("type") == "WebServiceException"
                    and "message" in data):
                message += f": {data['message']}"
            elif data is not None:
                message += f": {data!r}"

        self.message = message
        # Pass the text, not `self`: a self-referential `args` tuple cannot be
        # rendered by repr() or logging
        super().__init__(message)

    def __str__(self):
        return self.message

In [None]:
# Função fornecida pelos desenvolvedores do LX-Parser

def parse(text, format, timeout=60):
    '''
    Send a text to the LX-Parser web service and return its analysis.

    Arguments
        text: a string with a maximum of 2000 characters, Portuguese text, with
             the input to be processed
        format: either 'parentheses', 'table' or 'JSON' (the name shadows the
             builtin but is kept because callers pass it by keyword)
        timeout: seconds to wait for the web service before giving up; pass
             None to wait indefinitely

    Returns a string or JSON object with the output according to specification in
       https://portulanclarin.net/workbench/lx-parser/

    Raises a WSException if the web service reports an error, and the
    `requests` timeout exception if it does not answer within `timeout`.
    '''

    request_data = {
        'method': 'parse',
        'jsonrpc': '2.0',
        'id': 0,
        'params': {
            'text': text,
            'format': format,
            'key': LXPARSER_WS_API_KEY,
        },
    }
    # Without a timeout an unreachable server would block this call forever
    response = requests.post(LXPARSER_WS_API_URL, json=request_data, timeout=timeout)
    response_data = response.json()
    if "error" in response_data:
        raise WSException(response_data["error"])
    return response_data["result"]

In [None]:
# Sample sentence used by the cells below; the trailing "dasdasd" is not a real
# word (presumably there to exercise the unknown-word path — confirm)
frase_teste = "O que você quer dar a ela, Pedro? dasdasd"

In [None]:
# Ask the web service for the analysis of the sample sentence in 'table' format and show it
result_table = parse(frase_teste, format="table")
print(result_table)

### Alinhando o dicionário e o LX-Parser

Como o dicionário e o LX-Parser utilizam conjuntos de símbolos diferentes para as classes gramaticais, é necessário alinhar as classes antes de prosseguir.

In [None]:
# Maps each LX-Parser tag (key) to the dictionary class tags (value) accepted as equivalent.
# A tag that is missing here makes the `symbol_corvertion[tag]` lookup raise KeyError.
# NOTE(review): 'P' -> 'PRON' looks suspicious if 'P' is the LX-Parser tag for
# prepositions — confirm against the LX-Parser tagset.
symbol_corvertion = {
    'A': ['ADJ'],
    'ART': ['DET', 'PRON'],
    'N': ['NOUN'],
    'QNT': ['PRON'],
    'ADV': ['ADV'],
    'P': ['PRON'],
    'REL': ['CCONJ', 'SCONJ'],
    'V': ['VERB'],
    'PRS': ['PRON'],
}

In [None]:
# One line of the table per token: "<word>\t<tree fragment holding the tag>"
result = result_table.split('\n')

predicted = []
for r in result:
    r = r.split('\t')

    # Blank or malformed lines have no second column; the check must come
    # before r[1] is read, otherwise they raise IndexError
    if len(r) < 2:
        continue

    # Keep only the innermost tag of the fragment, without brackets or '*'
    r[1] = r[1].replace(')', '').split('(')[-1].replace('*', '')

    # Punctuation tokens are dropped
    if r[1] != 'PNT':
        predicted.append((r[0], symbol_corvertion[r[1]]))

print(predicted)

In [None]:
# [word, grammatical classes] pairs for the sample sentence, taken from the dictionary
parsed = [
    [token, entry['grammar_class']]
    for token, entry in parsing(frase_teste, dictionary).items()
]

In [None]:
def aligning(parsed, predicted):
    """Combine, word by word, the dictionary classes with the parser classes.

    Args:
        parsed: sequence of ``[word, set_of_classes]`` items.
        predicted: sequence of ``(word, list_of_classes)`` items, in the same
            order as ``parsed``.

    Returns:
        A list of ``[word, classes]`` items, using the spelling found in
        ``predicted``. ``classes`` is the intersection of both sources. When
        the intersection is empty it is either all predicted classes (the
        dictionary says 'Unknown' and the prediction includes 'NOUN') or
        ``{'ERROR', first predicted class}``. An empty list is returned, after
        printing a message, when the lengths or the words do not match.
    """
    if len(parsed) != len(predicted):
        print(f"Lengths don't match: {len(parsed)} != {len(predicted)}")
        return []

    aligned_classes = []

    for lexicon_item, parser_item in zip(parsed, predicted):
        lexicon_word = lexicon_item[0]
        parser_word = parser_item[0]

        if lexicon_word != parser_word.lower():
            print(f"Words don't match: {lexicon_word} != {parser_word}")
            return []

        lexicon_classes = lexicon_item[1]
        parser_classes = parser_item[1]
        classes = lexicon_classes.intersection(parser_classes)

        if not classes:
            # A word missing from the dictionary that was tagged as a noun is
            # accepted as it is (covers names)
            if 'Unknown' in lexicon_classes and 'NOUN' in parser_classes:
                classes = set(parser_classes)
            else:
                classes = {'ERROR', parser_classes[0]}

        aligned_classes.append([parser_word, classes])

    return aligned_classes

In [None]:
# Merge the dictionary classes with the classes predicted for the sample sentence
aligned_tree = aligning(parsed, predicted)

In [None]:
# Show one [word, classes] item per line
for t in aligned_tree:
    print(t)

## Sugere as mudanças

### Correção gramatical

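Calcula a distância de edição (Levenshtein) entre a palavra escrita e as palavras candidatas do dicionário, para ordenar as sugestões de correção.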

In [None]:
def levenshtein_distance(word1, word2):
    """Return the Levenshtein (edit) distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn ``word1`` into ``word2``.

    The previous version added the edit cost only on a mismatch (so insertions
    and deletions next to a matching character were free) and padded both
    words to the same length, which returned 2 for "ab" vs "b" instead of 1.
    """
    # Keep the shorter word on the inner axis so the rows stay small
    if len(word1) < len(word2):
        word1, word2 = word2, word1

    # previous[j] is the distance between the processed prefix of word1 and word2[:j]
    previous = list(range(len(word2) + 1))

    for i, char1 in enumerate(word1, start=1):
        current = [i]
        for j, char2 in enumerate(word2, start=1):
            current.append(min(
                previous[j] + 1,                       # deletion
                current[j - 1] + 1,                    # insertion
                previous[j - 1] + (char1 != char2),    # substitution or match
            ))
        previous = current

    return previous[-1]

In [None]:
def get_corrections(word, grammar_class, dictionary):
    """Suggest up to ten dictionary words of ``grammar_class`` close to ``word``.

    The search starts at the deepest node of the dictionary tree that matches
    a prefix of ``word``, moves two characters back when that prefix is longer
    than two characters, and collects the stored words up to four levels below
    that point.

    Args:
        word: the word to correct; it is normalised with ``clean_text``.
        grammar_class: class tag the suggestions must have.
        dictionary: word tree providing ``get_parent``, ``get_children`` and
            item lookup.

    Returns:
        At most ten candidate words, ordered by edit distance to ``word``.
        The list is empty when ``word`` is empty after cleaning or when no
        dictionary word starts with its first character.
    """
    word = clean_text(word)
    if not word:
        return []

    p = dictionary.get_parent(word)
    # No word in the dictionary starts with this character: nothing to suggest
    if p is None:
        return []

    # Step two characters back so words that differ near the end of the
    # matched prefix are reachable too
    if len(p.head) > 2:
        p = dictionary.get_parent(p.head[:-2])

    children = dictionary.get_children(p, max_depth=4)

    corrections = [c for c in children if grammar_class in dictionary[c]['grammar_class']]
    corrections.sort(key=lambda x: levenshtein_distance(word, x))
    return corrections[:10]

In [None]:
# Suggestions for every word that the alignment flagged with 'ERROR'
possible_corrections = {}

for w in aligned_tree:
    if 'ERROR' in w[1]:
        # The class left after removing 'ERROR' is the one the parser predicted
        possible_corrections[w[0]] = get_corrections(w[0], list(w[1] - {'ERROR'})[0], dictionary)

## Define uma classe para correção

In [None]:
from corretor import Corretor

In [None]:
# Path of the lexicon file handed to the corrector
dicionary_path = "./portilexicon-ud.tsv"

# The access key is read from a local '.lxparser-key' file. strip() removes
# the trailing newline that editors usually append, which would otherwise be
# sent to the service as part of the key.
LXPARSER_WS_API_KEY = ''
with open('.lxparser-key', 'r') as f:
    LXPARSER_WS_API_KEY = f.read().strip()

# Maps each LX-Parser tag (key) to the dictionary class tags (value) accepted as equivalent
symbol_corvertion = {
    'A': ['ADJ'],
    'ART': ['DET', 'PRON'],
    'N': ['NOUN'],
    'QNT': ['PRON'],
    'ADV': ['ADV'],
    'P': ['PRON'],
    'REL': ['CCONJ', 'SCONJ'],
    'V': ['VERB'],
    'PRS': ['PRON'],
}

In [None]:
# Create the corrector from the lexicon path, then hand it the API key and the tag-conversion table
# NOTE(review): another cell of this notebook passes all three values to the constructor instead — confirm which Corretor API is current
teste_corretor = Corretor(dicionary_path)
teste_corretor.setup_key(LXPARSER_WS_API_KEY)
teste_corretor.setup_symbols(symbol_corvertion)

In [None]:
# Try the corrector on a sentence whose last word is misspelled
teste_corretor.corrigir_texto("Um cachorro quente bem fresquinhu")

## Utilização de LLM + Marcação para reescrever mensagens

In [None]:
from llama_cpp import Llama

# Load the local GGUF model file; this is the `llm` object that rewrite() calls
llm = Llama(
    model_path='./sabia-7b.Q4_0.gguf',
    seed=42,
    chat_format="llama-2",
)

In [None]:
def rewrite(question: str, context: list[str] | str | None, full_output=False, seed=42):
    """Ask the language model to rewrite ``question``.

    The prompt is built from ``context`` followed by ``'Q: ' + question`` and
    an open ``'A:'`` line, joined with newlines.

    Args:
        question: the text to be rewritten.
        context: prompt lines placed before the question (a list of strings,
            or a single string taken as one line). None selects the built-in
            instruction and examples. The argument is never modified.
        full_output: when true, return the raw model output instead of only
            the generated text.
        seed: accepted for compatibility; it is currently NOT forwarded to the
            model.

    Returns:
        The text generated after the prompt, or the raw output object when
        ``full_output`` is true.
    """
    if context is None:
        prompt_lines = [
            'Você um excelente assistente que reescreve textos segundo bons padrões. Abaixo estão os deve reescrever com mais seriedade:',
            'Q: Onde que a gente pode comprar ingressos pro show?',
            'A: Onde podemos comprar ingressos para o show?',
            'Q: Por que que o céu é azul?',
            'A: Por que o céu é azul?',
        ]
    elif isinstance(context, str):
        prompt_lines = [context]
    else:
        # Work on a copy: appending to the caller's list would make every
        # later call with the same list carry the earlier questions
        prompt_lines = list(context)

    prompt_lines.append('Q: ' + question)
    prompt_lines.append('A:')

    input_text = '\n'.join(prompt_lines)

    # The token budget is fixed at 32. A budget based on the question length
    # (len(question.split()) * 3 + 1) was considered but is not in use.
    output = llm(
        input_text,
        max_tokens=32,
        stop=["Q:"],
        echo=True,
    )

    if full_output:
        return output

    text = output['choices'][0]['text']

    # With echo=True the text is expected to begin with the prompt; cutting
    # the prompt off keeps answers that contain ':' in one piece
    if text.startswith(input_text):
        return text[len(input_text):]

    # Fallback for an unexpected echo format: previous behaviour
    return text.split(':')[-1]

In [None]:
# Rewrite a sample message with the built-in prompt (context=None)
output = rewrite("Cachorro quente fresquinhão", None)

In [None]:
# Show what rewrite() returned
print(output)

In [None]:
from corretor import Corretor

# Path of the lexicon file handed to the corrector
dicionary_path = "./portilexicon-ud.tsv"

# The access key is read from a local '.lxparser-key' file. strip() removes
# the trailing newline that editors usually append, which would otherwise be
# sent to the service as part of the key.
LXPARSER_WS_API_KEY = ''
with open('.lxparser-key', 'r') as f:
    LXPARSER_WS_API_KEY = f.read().strip()

# Maps each LX-Parser tag (key) to the dictionary class tags (value) accepted as equivalent
symbol_corvertion = {
    'A': ['ADJ'],
    'ART': ['DET', 'PRON'],
    'N': ['NOUN'],
    'QNT': ['PRON'],
    'ADV': ['ADV'],
    'P': ['PRON'],
    'REL': ['CCONJ', 'SCONJ'],
    'V': ['VERB'],
    'PRS': ['PRON'],
}

# Here the lexicon path, the key and the tag table all go to the constructor
corretor = Corretor(dicionary_path, LXPARSER_WS_API_KEY, symbol_corvertion)

In [None]:
# Sample message with a misspelled word
test_message = 'Cachorro quente fresquinhu aqui'

# Correction applied directly to the original message
out = corretor.corrigir_texto(test_message)

# Message rewritten by the language model with the built-in prompt
rew = rewrite(test_message, None)

# Correction applied to the rewritten message
out2 = corretor.corrigir_texto(rew)

In [None]:
# Show the three results separated by blank lines: direct correction,
# rewritten message, correction of the rewritten message
print(out)
print()
print(rew)
print()
print(out2)