# Procesamiento del Lenguaje Natural - Trabajo Práctico Final
### Alumno: Simón Revello

## Importar librerías

In [170]:
import wikipediaapi as wiki
import pandas as pd
import bs4
import requests as req
import chromadb
from chromadb.utils import embedding_functions
import spacy
from IPython.core.display import HTML
from IPython.display import display
import networkx as nx
import uuid
import spacy
from SPARQLWrapper import SPARQLWrapper, JSON
import re
import arrow

### Obtener información - Hooks

In [171]:
# React hook names grouped by category. get_info_hooks() appends each name to
# the docs base URL and stores the category key as the row's "type" column.
hooksTypes = {
    "State": ["useState", "useReducer"],
    "Context": ["useContext"],
    "Ref": ["useRef", "useImperativeHandle"],
    "Effect": ["useEffect", "useLayoutEffect", "useInsertionEffect"],
    "Performance": ["useMemo", "useCallback", "useTransition", "useDeferredValue"],
    "Other": ["useDebugValue", "useId", "useSyncExternalStore"]
}

In [172]:
def get_info_hooks():
    """Scrape the Spanish React reference page of every hook in ``hooksTypes``.

    For each hook, ``BASE_URL + hook`` is downloaded and four pieces of text
    are extracted: the intro paragraph, the "reference" section, the
    "returns" section and the parameter list.

    Returns:
        pandas.DataFrame indexed by a random UUID string (index name "id")
        with the columns ``hook``, ``desc``, ``reference``, ``parameters``,
        ``returns`` and ``type``.

    Raises:
        AttributeError: if an element the scraper expects is missing from a
            page (``find``/``find_next_sibling`` returned ``None``).
    """
    data = []
    BASE_URL = "https://es.react.dev/reference/react/"
    for key, hooks in hooksTypes.items():
        for hook in hooks:
            row_id = str(uuid.uuid4())
            res = req.get(BASE_URL + hook)
            bs = bs4.BeautifulSoup(res.text, "html.parser")

            # Description: first paragraph carrying the intro CSS classes.
            descHtml = bs.find("p", {"class": "whitespace-pre-wrap my-4"})
            desc = descHtml.text

            # Reference: skip the sub-title after the <h2>, then keep the
            # description element and the code example that follows it.
            referenceTitleHtml = bs.find("h2", {"id": "reference"})
            subTitleHtml = referenceTitleHtml.find_next_sibling()
            referenceDescHtml = subTitleHtml.find_next_sibling()
            codeExampleHtml = referenceDescHtml.find_next_sibling()
            reference = referenceDescHtml.text + "\n" + codeExampleHtml.text

            # Returns: every sibling after the heading up to the next <h4>.
            returnsTitleHtml = bs.find("h4", {"id": "returns"})
            nextSiblingHtml = returnsTitleHtml.find_next_sibling()
            returns = ""
            while nextSiblingHtml is not None and nextSiblingHtml.name != "h4":
                returns += nextSiblingHtml.text + "\n"
                nextSiblingHtml = nextSiblingHtml.find_next_sibling()

            # Parameters: one "name:description" line per direct child of the
            # element following the heading. Lines are accumulated; the
            # previous code reassigned the string on each iteration and so
            # kept only the last parameter.
            parametersTitleHtml = bs.find("h4", {"id": "parameters"})
            listParameters = parametersTitleHtml.find_next_sibling()
            paramLines = []
            for parameter in listParameters.find_all(recursive=False):
                codeTag = parameter.find("code")
                if codeTag is None:
                    # A child without a <code> tag is treated as "this hook
                    # takes no parameters".
                    paramLines = ["No acepta ningún parámetro"]
                    break
                name = codeTag.text.strip()
                # Empty the <code> tag so the name is not repeated in the
                # description text read from the parent element.
                codeTag.clear()
                description = parameter.text.replace(":", "").strip()
                paramLines.append(f"{name}:{description}")
            paramStr = "\n".join(paramLines)

            data.append({
                "id": row_id,
                "hook": hook,
                "desc": desc,
                "reference": reference,
                "parameters": paramStr,
                "returns": returns,
                "type": key
            })
    return pd.DataFrame(data).set_index("id")

### Obtener información - Funcionamiento general

In [173]:
def get_funcionamiento_react():
    """Scrape four sections of the Spanish React "learn" page.

    Each section contributes one row: the ``<h2>`` heading text plus the text
    of selected following siblings, joined with newlines, together with a
    fixed tag string.

    Returns:
        pandas.DataFrame indexed by a random UUID string (index name "id")
        with the columns ``desc`` and ``tag``.
    """
    page = req.get("https://es.react.dev/learn")
    soup = bs4.BeautifulSoup(page.text, "html.parser")

    # (heading id, 1-based positions of the siblings to keep, tag string)
    sections = (
        ("components", (1, 2), "Componente;UI;Interfaz;Grafica"),
        ("writing-markup-with-jsx", (1, 2), "JSX Markup"),
        ("adding-styles", (1, 4), "CSS Style Estilos Clases"),
        ("rendering-lists", (1, 2, 4, 6), "Listas;Key;Arreglo"),
    )

    rows = []
    for heading_id, wanted_positions, tag in sections:
        heading = soup.find("h2", {"id": heading_id})
        pieces = [heading.text]
        node = heading
        # Walk sibling by sibling up to the furthest wanted position and keep
        # only the text of the positions listed for this section.
        for position in range(1, wanted_positions[-1] + 1):
            node = node.find_next_sibling()
            if position in wanted_positions:
                pieces.append(node.text)
        rows.append({
            "id": str(uuid.uuid4()),
            "desc": "\n".join(pieces),
            "tag": tag
        })

    return pd.DataFrame(rows).set_index("id")

### Generación de Base de Datos

In [174]:
def process_txt(txt):
    """Return ``txt`` with spaCy stop-word tokens removed.

    The remaining tokens are joined with single spaces.

    The "es_core_news_md" pipeline is loaded on the first call and cached on
    the function object. It was previously reloaded on every call, and this
    function is called several times per row while indexing.
    """
    nlp = getattr(process_txt, "_nlp", None)
    if nlp is None:
        nlp = spacy.load("es_core_news_md")
        process_txt._nlp = nlp
    return " ".join(str(token) for token in nlp(txt) if not token.is_stop)

In [175]:
def get_db():
    """Return a ``chromadb.PersistentClient`` for the local "reactdb" path."""
    client = chromadb.PersistentClient(path="reactdb")
    return client
def get_coll(db):
    """Return the "data" collection of ``db``, creating it when absent.

    The collection is requested with a SentenceTransformer embedding function
    (Spanish sentence-similarity model) and cosine distance.
    """
    get_or_create = db.get_or_create_collection
    embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="hiiamsid/sentence_similarity_spanish_es"
    )
    return get_or_create(
        "data",
        embedding_function=embedder,
        metadata={"hnsw:space": "cosine"},
    )

def crear_o_cargar_db(hooks, funcReact):
    """Add the scraped hook and general-concept texts to the Chroma collection.

    Every text field is passed through ``process_txt`` before being stored.
    The index label of each DataFrame row is used as the document id.

    Args:
        hooks: DataFrame with the columns ``hook``, ``desc``, ``reference``,
            ``returns`` and ``type``.
        funcReact: DataFrame with the columns ``desc`` and ``tag``.

    NOTE(review): despite the name, this always calls ``add``. Running it
    again with freshly generated ids against the same persistent path
    presumably leaves the earlier entries in place as well; confirm whether
    that is intended.
    """
    dataColl = get_coll(get_db())
    documents = []
    ids = []
    metadatas = []

    for row_id, hookRow in hooks.iterrows():
        fields = (
            hookRow["hook"],
            hookRow["desc"],
            hookRow["reference"],
            hookRow["returns"],
            hookRow["type"],
        )
        # Separate every field with a newline. Previously only the hook name
        # was followed by one, so the last token of a field was fused with
        # the first token of the next.
        documents.append("\n".join(process_txt(field) for field in fields))
        # NOTE(review): the metadata key is "hook" but the value is the hook
        # *type*; kept unchanged for compatibility with stored data.
        metadatas.append({"hook": hookRow["type"]})
        ids.append(row_id)

    for row_id, func in funcReact.iterrows():
        documents.append(process_txt(func["desc"]))
        metadatas.append({"tag": func["tag"]})
        ids.append(row_id)

    if not ids:
        # Nothing to index; avoid sending an empty batch to the collection.
        return

    dataColl.add(
        documents=documents,
        metadatas=metadatas,
        ids=ids
    )

def query_db(question):
    """Query the vector collection and attach the original texts of the hits.

    Args:
        question: text used as the single query for the collection.

    Returns:
        The result of the collection's ``query`` call (``n_results=2``) with
        an extra ``"docsTxt"`` key. That key holds a list of the un-processed
        texts rebuilt from the module-level ``hooksDf`` / ``funcReactDf`` for
        each returned id. Ids found in neither DataFrame are skipped.
    """
    res = get_coll(get_db()).query(
        query_texts=[question],
        n_results=2
    )
    docsTxt = []
    for doc_id in res["ids"][0]:
        if doc_id in hooksDf.index:
            row = hooksDf.loc[doc_id]
            # Newline-separate the fields so they are not fused together in
            # the text later handed to the language model.
            docsTxt.append("\n".join([
                row["hook"],
                row["desc"],
                row["reference"],
                row["returns"],
                row["type"],
            ]))
        elif doc_id in funcReactDf.index:
            docsTxt.append(funcReactDf.loc[doc_id, "desc"])
    res["docsTxt"] = docsTxt
    return res
    

## Base de datos de Grafos

In [176]:
def get_wiki_ids(question):
    """Search Wikidata for every noun / proper noun found in ``question``.

    Args:
        question: text to analyse with the spaCy "es_core_news_md" pipeline.

    Returns:
        dict mapping entity id to its description, in the order the search
        results were received. The description is ``""`` when the result
        carries none.
    """
    nlp = spacy.load("es_core_news_md")
    API_ENDPOINT = "https://wikidata.org/w/api.php"
    res_val = {}
    for token in nlp(question):
        if token.pos_ not in ("NOUN", "PROPN"):
            continue
        params = {
            "action": "wbsearchentities",
            "format": "json",
            "language": "en",
            "uselang": "es",
            "search": str(token)
        }
        data = req.get(API_ENDPOINT, params=params).json()
        # An error payload has no "search" key and an entity may have no
        # "description"; neither case should abort the whole lookup.
        for item in data.get("search", []):
            res_val[item["id"]] = item.get("description", "")
    return res_val

def get_graph_data(id):
    """Fetch the direct claims of a Wikidata entity as "property:value" lines.

    Args:
        id: entity id interpolated into the SPARQL query after ``wd:``.

    Returns:
        One ``"<property label>:<value>\\n"`` line per result binding,
        concatenated into a single string. Values whose datatype is
        xsd:dateTime are reformatted as ``YYYY-MM-DD``.
    """
    DATETIME_TYPE = "http://www.w3.org/2001/XMLSchema#dateTime"
    endpoint = SPARQLWrapper("https://query.wikidata.org/sparql", returnFormat="json")
    query = """
        SELECT ?wdLabel ?ooLabel
        WHERE {{
         VALUES (?s) {{(wd:{0})}}
         ?s ?wdt ?o .
         ?wd wikibase:directClaim ?wdt .
         ?wd rdfs:label ?wdLabel .
         OPTIONAL {{
         ?o rdfs:label ?oLabel .
         FILTER (lang(?oLabel) = "es")
         }}
         FILTER (lang(?wdLabel) = "es")
         BIND (COALESCE(?oLabel, ?o) AS ?ooLabel)
         }} ORDER BY xsd:integer(STRAFTER(STR(?wd), "http://www.wikidata.org/entity/P"))
        """.format(str(id))
    endpoint.setQuery(query)
    rows = endpoint.query().convert()["results"]["bindings"]

    lines = []
    for row in rows:
        prop_label = row["wdLabel"]["value"]
        target = row["ooLabel"]
        value = target["value"]
        if target.get("datatype") == DATETIME_TYPE:
            # Keep only the calendar date of timestamp values.
            value = arrow.get(value).format("YYYY-MM-DD")
        lines.append(prop_label + ":" + value + "\n")
    return "".join(lines)

def search_in_graph(question):
    """Return the graph data of the first Wikidata entity matching ``question``.

    Returns:
        The string produced by ``get_graph_data`` for the first id returned
        by ``get_wiki_ids``, or ``None`` when no entity was found. Indexing
        the empty result used to raise ``IndexError``; ``ask`` already treats
        a ``None`` context as "could not answer".
    """
    ids = get_wiki_ids(question)
    first_id = next(iter(ids), None)
    if first_id is None:
        return None
    return get_graph_data(first_id)

In [177]:
# Scrape both documentation sources and index them in the vector store.
# hooksDf and funcReactDf stay at module level because query_db() reads both
# and get_data_df() reads hooksDf directly.
hooksDf = get_info_hooks()
funcReactDf = get_funcionamiento_react()
crear_o_cargar_db(hooksDf, funcReactDf)

### Datos Tabulares

In [234]:
# hooksDf columns kept by get_data_df() and rendered by get_context_from_df().
COLUMNS = ["type", "hook"]
def tab_template(pregunta):
    """Build the few-shot prompt that asks the model to extract attributes.

    The prompt holds a system instruction, six worked examples whose answers
    have the form ``-key=value,...-``, and a final user turn containing
    ``pregunta`` followed by an open assistant turn.
    """
    few_shot_prefix = """
    <|system|>Eres un asistente útil que se encarga de extraer los atributos relacionados al prompt siguiendo las siguientes claves:
    hook: Nombre de la función
    type: Tipo de función
    Sólo responde lo que se te solicita. No uses información externa</s>
    <|user|> Extrae los atributos de este prompt: 'Qué retorna el hook useState?' </s>
    <|assistant|>-hook=useState-</s>
    <|user|> Extrae los atributos de este prompt: 'Qué parámetros requiere la función useEffect?' </s>
    <|assistant|>-hook=useEffect-</s>
    <|user|> Extrae los atributos de este prompt: 'useEffect es de tipo Ref?' </s>
    <|assistant|>-hook=useEffect,type=Ref-</s>
    <|user|> Extrae los atributos de este prompt: 'Tienes información sobre los hooks de tipo Ref?' </s>
    <|assistant|>-type=Ref-</s>
    <|user|> Extrae los atributos de este prompt: 'De qué tipo son useEffect y useState?' </s>
    <|assistant|>-hook=useEffect,hook=useState-</s>
    <|user|>Extrae los atributos de este prompt: 'Cuáles son los hooks que son de tipo State?' </s>
    <|assistant|>-type=State-</s>
    """
    final_turn = "<|user|> Extrae los atributos de este prompt: '{}' </s>\n    <|assistant|>".format(pregunta)
    return few_shot_prefix + final_turn

def extract_attr(template):
    """Ask the model for attributes and parse its ``-key=value,...-`` answer.

    Args:
        template: full prompt, as built by ``tab_template``.

    Returns:
        dict mapping each key to the list of values given for it, or ``None``
        when the answer contains no ``-`` or no well-formed ``key=value``
        pair.
    """
    res = query_hugging_face(template, {
        "max_new_tokens": 20,
        "top_p": 0.95,
        "temperature": 0.2
    })
    arrayRes = res.split("-")
    if len(arrayRes) == 1:
        return None

    data = {}
    for pair in arrayRes[1].split(","):
        # The model output is free text: skip fragments that are not
        # "key=value" instead of failing on tuple unpacking.
        key, sep, value = pair.partition("=")
        key, value = key.strip(), value.strip()
        if not sep or not key or not value:
            continue
        data.setdefault(key, []).append(value)
    return data or None

def get_data_df(filters):
    """Select the rows of ``hooksDf`` matching any of the given filters.

    Args:
        filters: dict mapping a column name to a list of accepted values, as
            returned by ``extract_attr``.

    Returns:
        DataFrame restricted to the columns listed in ``COLUMNS``. Rows come
        in the order the filters matched them and each row appears at most
        once. Keys that are not columns of ``hooksDf`` are ignored.
    """
    remaining = hooksDf.copy()
    wanted = [col for col in hooksDf.columns if col in COLUMNS]
    matches = []
    for key, values in filters.items():
        if key not in remaining.columns:
            # Attribute names come from a language model and may not exist.
            continue
        for value in values:
            hit = remaining[remaining[key] == value]
            if hit.empty:
                continue
            matches.append(hit)
            # Remove matched rows so a later filter cannot select them again.
            remaining = remaining.drop(index=hit.index)
    if not matches:
        return hooksDf.iloc[0:0][wanted]
    return pd.concat(matches)[wanted]

def get_context_from_df(df):
    """Render ``df`` as text, one line per row.

    Each line is the concatenation of ``"<col>: <value> - "`` for every
    column in ``COLUMNS``, followed by a newline.
    """
    rendered_rows = []
    for _, row in df.iterrows():
        cells = [col + ": " + row[col] + " - " for col in COLUMNS]
        rendered_rows.append("".join(cells) + "\n")
    return "".join(rendered_rows)

def query_in_df(prompt):
    """Answer ``prompt`` from the hooks table.

    The attributes are extracted with the model, then the matching rows are
    filtered and rendered as text.

    Returns:
        The rendered context string, or ``None`` when no attributes could be
        extracted.
    """
    attrs = extract_attr(tab_template(prompt))
    if attrs is None:
        return None
    return get_context_from_df(get_data_df(attrs))

### Conexión con Hugging Face

In [204]:
def create_template(contexto, pregunta):
    """Build the answering prompt: system rules, context, question, open turn.

    Args:
        contexto: context to embed; it is formatted into the user turn.
        pregunta: question text (must be a string).

    Returns:
        The complete prompt string, ending with an open assistant turn.
    """
    system_turn = (
        "<|system|>Eres un asistente útil que siempre responde con respuestas veraces, útiles y basadas en hechos. "
        "Sólo debes responder con la información que recibes en el contexto. "
        "Sólo responde una única pregunta. "
        "No debese mencionar que obtienes la información de un contexto. "
        "Si la información no está en el contexto o no está relacionada, responde: 'No tengo información suficiente para responder eso'. "
        "Sólo debes responder en Español</s>"
    )
    context_line = "<|user|>La información de contexto es la siguiente: {}.\n".format(contexto)
    instruction = "Dada la información de contexto anterior y sin utilizar conocimiento previo, responde lo siguiente:"
    return "".join((
        system_turn,
        context_line,
        instruction,
        "Pregunta: ",
        pregunta,
        "</s>",
        "<|assistant|>",
    ))

In [183]:
def query_hugging_face(query, parameters=None):
    """Send ``query`` to the hosted zephyr-7b-beta model and return its output.

    Args:
        query: full prompt text.
        parameters: generation parameters forwarded to the API. When omitted,
            ``max_new_tokens=500``, ``top_p=0.95`` and ``temperature=0.2``
            are used.

    Returns:
        The generated text with the leading ``len(query)`` characters
        removed (the response is expected to start with the prompt).

    Raises:
        Exception: when the API answers with an ``"error"`` payload.
    """
    import os

    if parameters is None:
        # Built per call rather than as a shared mutable default argument.
        parameters = {
            "max_new_tokens": 500,
            "top_p": 0.95,
            "temperature": 0.2
        }
    API_URL = "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta"
    # SECURITY: this token is committed in the source and should be treated
    # as leaked. Revoke it and supply a fresh one through the HF_TOKEN
    # environment variable. The literal is kept only as a fallback so that
    # existing setups keep running.
    token = os.environ.get("HF_TOKEN", "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")
    headers = {
        "Authorization": "Bearer " + token,
        "Content-type": "application/json"
    }
    data = {
        "inputs": query,
        "parameters": parameters
    }
    res = req.post(API_URL, headers=headers, json=data)
    dataJson = res.json()
    if isinstance(dataJson, dict) and "error" in dataJson:
        # The error payload is not guaranteed to be a string.
        raise Exception(f"Ha ocurrido un error en el servidor: {dataJson['error']}")
    return dataJson[0]["generated_text"][len(query):]

In [184]:
def create_class_template(question):
    """Build the few-shot prompt that classifies ``question`` as 1, 2 or 3.

    Categories: 1 = how ReactJS works, 2 = ReactJS history, 3 = attributes of
    ReactJS hooks. The system instruction used to allow only '1' or '2',
    although category 3 is defined and used in the examples; it now lists all
    three.

    Returns:
        The prompt string, ending with an open assistant turn.
    """
    template = """
    <|system|>
    Tu rol es el de una Inteligencia Artificial que clasifica el prompt en las siguientes categorías:
    1-Pregunta sobre el funcionamiento de ReactJS.
    2-Información sobre la historia de ReactJS.
    3-Pregunta sobre los atributos de hooks de ReactJS
    Sólo puedes responder '1', '2' o '3'. No debes responder texto. </s>
    <|user|>
    +Cómo funciona useEffect? </s>
    <|assistant|>
    1 </s>
    <|user|>
    +Cómo crear un componente funcional? </s>
    <|assistant|>
    1 </s>
    <|user|>
    +Qué es un componente de clase? </s>
    <|assistant|>
    1 </s>
    <|user|>
    +Quién creó ReactJS? </s>
    <|assistant|>
    2 </s>
    <|user|>
    +ReactJS es de codigo abierto? </s>
    <|assistant|>
    2 </s>
    <|user|>
    +Qué retorna el hook useState? </s>
    <|assistant|>
    3 </s>
    <|user|>
    +Qué tipo de hook es useEffect? </s>
    <|assistant|>
    3 </s>
    <|user|>
    +Qué tipo de hook es useContext? </s>
    <|assistant|>
    3 </s>
    <|user|>
    Categoriza lo siguiente:
    """ + f"'{question}' </s> <|assistant|>"
    return template

def classify_question(question):
    """Classify ``question`` into category 1, 2 or 3 using the model.

    Returns:
        The first digit 1, 2 or 3 found in the model output, or 1 when the
        output contains none of them. Previously every digit in the output
        was concatenated, so an answer such as "1 2" became category 12, and
        a bare ``except`` hid any other failure during parsing.
    """
    template = create_class_template(question)
    res = query_hugging_face(template, {
        "max_new_tokens": 5,
        "top_p": 0.95,
        "temperature": 0.1
    })
    match = re.search(r"[123]", res)
    return int(match.group()) if match else 1

In [236]:
def ask(question):
    """Answer ``question`` using the data source chosen by the classifier.

    Category 1 uses the vector store, 2 the Wikidata graph and 3 the hooks
    table. The retrieved context and the question are then sent to the model.

    Returns:
        The model's answer text.

    Raises:
        Exception: when no context could be built (unknown category, or the
            chosen source returned ``None``).
    """
    category = classify_question(question)
    contexto = None
    if category == 1:
        # Pass only the retrieved texts. The full query result (ids,
        # distances, pre-processed documents and so on) used to be formatted
        # into the prompt as a dict.
        contexto = "\n".join(query_db(question)["docsTxt"])
    elif category == 2:
        contexto = search_in_graph(question)
    elif category == 3:
        contexto = query_in_df(question)
    if contexto is None:
        raise Exception("No se ha podido generar una respuesta. Prueba haciendo la pregunta de una forma diferente.")
    template = create_template(contexto, question)
    return query_hugging_face(template)

## EJEMPLOS

### Base de datos vectorial

In [207]:
# Example listed under the vector-database heading; ask() routes on the result
# of classify_question(), so the path actually taken is decided by the model.
print(ask("Cómo funciona useState?"))


Respuesta: El Hook de React llamado useState es una función que permite agregar una variable de estado a tu componente. Al utilizar este Hook, se puede declarar una variable y su valor inicial, y luego se puede actualizar este valor en respuesta a eventos o acciones en el componente. La variable de estado se actualiza utilizando una nueva función que se devuelve al llamar a useState, y esta función se puede llamar para actualizar el valor de la variable de estado.


In [208]:
# Second example listed under the vector-database heading (general concept
# question rather than a hook).
print(ask("Qué es un componente funcional?"))


Respuesta: Un componente funcional en React es una función JavaScript que devuelve markup (marcado) como resultado de su ejecución. No se define como un objeto clase, sino como una función pura que toma propiedades como entrada y devuelve el markup correspondiente. Los componentes funcionales se utilizan para crear pequeños pedazos de interfaz de usuario reutilizables, que se pueden anidar y combinar para crear aplicaciones más complejas.


### Base de datos de grafos (Wikidata)

In [209]:
# Example listed under the graph-database heading; category 2 in ask() is
# served by search_in_graph().
print(ask("Quién creó ReactJS? En qué fecha?"))


Respuesta: ReactJS fue creado por Meta Platforms (anteriormente conocida como Facebook) en el año 2013, con una fecha de creación específica del 29 de mayo de ese año.


In [211]:
# Second example listed under the graph-database heading.
print(ask("ReactJS es de uso libre?"))


Sí, ReactJS está bajo la licencia MIT, lo que significa que es de uso libre.


### Datos tabulares (DataFrame)

In [238]:
# Example listed under the tabular-data heading; category 3 in ask() is served
# by query_in_df().
print(ask("Qué hooks son de tipo Ref?"))


Respuesta: Los hooks de tipo Ref son `useRef` y `useImperativeHandle`. Se pueden identificar por el tipo `Ref` que aparece en la documentación de los hooks.


In [237]:
# Second example listed under the tabular-data heading (two hooks in one
# question).
print(ask("De qué tipo son useEffect y useState?"))


Respuesta: UseEffect y useState son dos hooks de React, más específicamente son efectos y estados respectivamente, que se utilizan dentro de componentes funcionales para agregar efectos secundarios y mantener el estado local, respectivamente.
