En este cuaderno filtraremos las respuestas ya generadas y procesadas.

### 1) Filtramos los pares que no estén en el contexto deseado, esto lo hacemos con un modelo de clasificación que nos proporciona Jónathan Heras

In [15]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from transformers import pipeline

# Build the text-classification pipeline used to decide whether a
# question/answer pair belongs to the target context. top_k=None makes the
# pipeline report a score for every label instead of only the best one.
classifier = pipeline(task="text-classification", model="joheras/bertin-roberta-base-spanish-spanish-suicide-intent", top_k=None)

# Smoke test with a pair that is unrelated to the target topic.
sentences = ["¿Cuál es la capital de Francia? La capital de Francia es París"]

# The pipeline returns one result per input sentence; show the first one.
model_outputs = classifier(sentences)
print(model_outputs[0])

Downloading (…)lve/main/config.json:   0%|          | 0.00/992 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/509k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.21M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

[{'label': 'LABEL_0', 'score': 0.9999986886978149}, {'label': 'LABEL_1', 'score': 1.287733198296337e-06}, {'label': 'LABEL_3', 'score': 1.4278081827967526e-08}, {'label': 'LABEL_2', 'score': 1.3204688897872074e-08}]


In [16]:
sentences = ["¿Qué es el suicidio? Es el acto de quitarse la vida voluntariamente."]

model_outputs = classifier(sentences)
print(model_outputs[0])

[{'label': 'LABEL_1', 'score': 0.9999984502792358}, {'label': 'LABEL_0', 'score': 1.5477806982744369e-06}, {'label': 'LABEL_3', 'score': 3.3319256687036614e-08}, {'label': 'LABEL_2', 'score': 3.128896608473042e-08}]


In [20]:
sentences = ["Me quiero suicidar, ¿cómo puedo hacerlo? Si estás pensando en suicidarte no seas gaznápido y ve a buscar ayuda profesional."]

model_outputs = classifier(sentences)
print(model_outputs[0])

[{'label': 'LABEL_1', 'score': 0.9999980926513672}, {'label': 'LABEL_0', 'score': 1.863220973064017e-06}, {'label': 'LABEL_3', 'score': 3.1781910436734506e-09}, {'label': 'LABEL_2', 'score': 2.8164839349642534e-09}]


Creamos las funciones necesarias para realizar el filtrado

In [69]:
from transformers import pipeline
import pandas as pd

def juntar_pares(corpus, columnas):
    """Pair every row label of ``corpus`` with that row's selected values.

    Args:
        corpus: DataFrame holding the data.
        columnas: List of column labels to extract, in the desired order.

    Returns:
        A list of ``(index, values)`` tuples, one per row, where ``index`` is
        the row label and ``values`` is a list with the row's entries for
        ``columnas``.
    """
    seleccion = corpus[columnas]
    # The fields are kept as separate list items; nothing is concatenated here.
    return [(etiqueta, list(fila)) for etiqueta, fila in seleccion.iterrows()]

def es_suicidio(par, classifier):
    """Return the best-scoring label that ``classifier`` gives to a pair.

    Despite the name, the result is the label string, not a boolean.

    Args:
        par: Indexable whose first two items (question and answer) are
            converted to ``str`` and joined with a single space.
        classifier: Callable that receives the joined text and returns a
            sequence whose first item is an iterable of dicts with the keys
            ``'label'`` and ``'score'``.

    Returns:
        The ``'label'`` of the entry with the highest ``'score'``.
    """
    texto = " ".join((str(par[0]), str(par[1])))
    puntuaciones = classifier(texto)[0]
    mejor = max(puntuaciones, key=lambda salida: salida['score'])
    return mejor["label"]

def cargar_modelo_clasificacion(nombre):
    """Create a text-classification pipeline for the model called ``nombre``.

    The pipeline is configured to return the scores of every label
    (``top_k=None``) and to truncate inputs to at most 512 tokens.
    """
    opciones = {"top_k": None, "max_length": 512, "truncation": True}
    return pipeline(task="text-classification", model=nombre, **opciones)

def filtrar_contexto(pares):
    """Collect the indices of the pairs that the classifier labels 'LABEL_0'.

    Args:
        pares: List of ``(index, pair)`` tuples as built by ``juntar_pares``.

    Returns:
        The list of ``index`` values whose pair got 'LABEL_0' as its
        best-scoring label, in input order.
    """
    clasificador = cargar_modelo_clasificacion("joheras/bertin-roberta-base-spanish-spanish-suicide-intent")
    fuera_de_contexto = []

    for indice, par in pares:
        # Progress indicator, overwritten in place thanks to the carriage return.
        print(f"Fase {indice} de {len(pares)}", end="\r")
        etiqueta = es_suicidio(par, clasificador)
        if etiqueta != 'LABEL_0':
            continue
        fuera_de_contexto.append(indice)

    return fuera_de_contexto

In [35]:
es_suicidio(("¿Qué es el suicidio?", "Es el acto de quitarse la vida voluntariamente."), classifier)

'LABEL_1'

In [98]:
# Load the corpus produced by the previous steps.
corpus = pd.read_csv("Corpus.csv", lineterminator='\n')

# Row labels of the question/answer pairs the classifier tags as 'LABEL_0'.
eliminar = filtrar_contexto(juntar_pares(corpus, ["Pregunta", "Respuesta"]))

Fase 13358 de 13359

In [99]:
# `eliminar` holds row labels (as yielded by iterrows), so drop by label
# instead of reinterpreting those labels as positions via corpus.index[...].
# Both forms coincide only while the index is a fresh 0..n-1 range.
corpus.drop(index=eliminar, inplace=True)

In [100]:
corpus.shape

(5219, 5)

In [101]:
corpus.to_csv("Corpus.csv", sep=',', index=False)

Terminamos este filtrado eliminando los pares cuya respuesta tenga menos de 10 caracteres.

In [104]:
def filtrar_preguntas_muy_cortas(corpus, columna="Respuesta", minimo=10):
    """Return the labels of the rows whose text in ``columna`` is too short.

    Note that, despite the function name, the column checked by default is
    "Respuesta".

    Args:
        corpus: DataFrame to inspect.
        columna: Name of the column whose text length is checked.
        minimo: Minimum number of characters a value must have to be kept;
            rows with fewer characters are reported.

    Returns:
        A list with the index labels of the rows to remove. Missing values
        (None/NaN) count as empty text and are therefore reported as well.
    """
    a_eliminar = []
    total = corpus.shape[0]

    # Iterate only the column of interest instead of building a Series per row.
    for index, valor in corpus[columna].items():
        print(f"Fase {index} de {total}", end="\r")
        # Empty CSV cells are read as NaN; len() would raise TypeError on them.
        texto = "" if pd.isna(valor) else str(valor)
        if len(texto) < minimo:
            a_eliminar.append(index)

    return a_eliminar

In [106]:
corpus = corpus.reset_index(drop=True)

In [107]:
a_eliminar = filtrar_preguntas_muy_cortas(corpus)

Fase 5218 de 5219

In [108]:
# `a_eliminar` holds row labels (as yielded by iterrows), so drop by label
# instead of reinterpreting those labels as positions via corpus.index[...].
corpus.drop(index=a_eliminar, inplace=True)

In [109]:
corpus.shape

(4903, 5)

In [110]:
corpus.to_csv("Corpus.csv", sep=',', index=False)

### 2) Eliminamos los pares similares, no las preguntas o respuestas individualmente sino los pares en su conjunto.

In [3]:
import pandas as pd
from sentence_transformers import util
from evaluate import load
import numpy as np
from sentence_transformers import util
import spacy

def juntar_pares(corpus, columnas):
    """Pair every row label of ``corpus`` with that row's selected values.

    Args:
        corpus: DataFrame holding the data.
        columnas: List of column labels to extract, in the desired order.

    Returns:
        A list of ``(index, values)`` tuples, one per row, where ``index`` is
        the row label and ``values`` is a list with the row's entries for
        ``columnas``.
    """
    seleccion = corpus[columnas]
    # The fields are kept as separate list items; nothing is concatenated here.
    return [(etiqueta, list(fila)) for etiqueta, fila in seleccion.iterrows()]

def calcular_similitud(par1, par2, modelo):
    """Score ``par1`` against ``par2`` with ``modelo`` and return the F1 part.

    ``modelo`` is expected to expose ``compute(predictions=..., references=...,
    lang=...)`` returning a mapping with an ``"f1"`` entry (the notebook uses
    the BERTScore metric). The language is fixed to Spanish.
    """
    puntuaciones = modelo.compute(predictions=par1, references=par2, lang="es")
    return puntuaciones["f1"]

def lista_similitud(referencia, lista, modelo, fase):
    """Compare ``referencia`` with every pair in ``lista``.

    Args:
        referencia: Pair used as the first argument of each comparison.
        lista: Sequence of ``(index, pair)`` items to compare against.
        modelo: Scoring model forwarded to ``calcular_similitud``.
        fase: Value shown in the progress message to identify the outer step.

    Returns:
        A list of ``(index, mean_f1)`` tuples, one per item of ``lista``,
        where ``mean_f1`` is the mean of the scores returned by
        ``calcular_similitud``.
    """
    resultado = []
    for posicion, candidato in enumerate(lista):
        print(f"Fase {posicion} de {len(lista)} parte {fase}", end="\r")
        puntuaciones = calcular_similitud(referencia, candidato[1], modelo)
        resultado.append((candidato[0], np.mean(puntuaciones)))
    return resultado

def calcular_similitud_spacy(par1, par2, modelo):
    """Return the similarity between two pairs according to ``modelo``.

    Each pair is an iterable of strings that is joined with single spaces,
    passed through ``modelo`` (a callable such as a spaCy pipeline) and the
    two resulting documents are compared with ``similarity``.
    """
    primero, segundo = (modelo(' '.join(par)) for par in (par1, par2))
    return primero.similarity(segundo)

def diferencia_coseno(par1, par2):
    """Return the cosine similarity between ``par1`` and ``par2``.

    Despite the name ("diferencia"), the value returned is a similarity.

    Args:
        par1: First embedding, in a form accepted by
            ``util.pytorch_cos_sim`` (presumably a tensor or array; verify
            against the caller).
        par2: Second embedding, same form as ``par1``.

    Returns:
        The first entry of the similarity matrix as a Python float.
    """
    # The previous version ignored both arguments and compared an undefined
    # global (`prueba`) with itself.
    return util.pytorch_cos_sim(par1, par2)[0][0].item()

def filtrar_similares(pares, umbral=0.8):
    """Find the pairs that are near-duplicates of an earlier pair.

    Pairs are visited in order. Each pair that has not been discarded yet is
    compared (through ``lista_similitud``) with every pair still pending, and
    those scoring above ``umbral`` are marked for removal. After every step
    the partial result is written to "Checkpoint.txt".

    Args:
        pares: List of ``(index, pair)`` tuples as built by ``juntar_pares``.
            Index values must be hashable.
        umbral: Mean similarity above which a pair counts as a duplicate.

    Returns:
        The list of index values to remove, in the order they were found.
    """
    a_eliminar = []
    # Mirror of a_eliminar that allows O(1) membership tests.
    eliminados = set()
    # NOTE(review): BERTScore is asked to run on the first CUDA device —
    # confirm that `load` honours this keyword.
    modelo = load("bertscore", device="cuda:0")
    restantes = pares.copy()

    for index, par in pares:
        if index in eliminados or len(restantes) <= 1:
            continue

        distancias = lista_similitud(par, restantes, modelo, index)
        similares = [i for i, elem in distancias if i != index and elem > umbral]
        a_eliminar.extend(similares)
        eliminados.update(similares)

        # Drop the duplicates and the pair just processed in a single pass
        # instead of rebuilding the list once per removed element.
        descartados = {index, *similares}
        restantes = [x for x in restantes if x[0] not in descartados]

        # Persist partial progress so a long run can be recovered.
        with open("Checkpoint.txt", "w") as output:
            output.write(str(a_eliminar))
    return a_eliminar

Pruebas

In [17]:
from sentence_transformers import SentenceTransformer

pruebas = pd.DataFrame([["Hola me llaman Palbito", "y Ascorbe"], ["Hola me llamo pablo", "Ascorbe"], ["Hasta luego no me llamo", "de ningún modo"]], columns=['A', 'B'])

pares = juntar_pares(pruebas, ["A", "B"])

In [18]:
pares

[(0, ['Hola me llaman Palbito', 'y Ascorbe']),
 (1, ['Hola me llamo pablo', 'Ascorbe']),
 (2, ['Hasta luego no me llamo', 'de ningún modo'])]

In [36]:
%%timeit
filtrar_similares(pares)

6.19 s ± 195 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


Calculamos sobre el corpus

In [None]:
# Load the corpus to deduplicate.
corpus = pd.read_csv("Corpus.csv", lineterminator='\n')

# Row labels of the question/answer pairs considered near-duplicates.
eliminar = filtrar_similares(juntar_pares(corpus, ["Pregunta", "Respuesta"]))
# Display the result as the cell output.
eliminar

Fase 4602 de 18322 parte 887

In [None]:
import json

# Serialise as JSON so the file can be parsed back with json.loads;
# file.write() only accepts str and raises TypeError when given a list.
with open("Checkpoint.txt", 'w') as file:
    json.dump(eliminar, file)

In [9]:
from pathlib import Path
import json

# Read the checkpoint written during the similarity filtering and parse it
# back into a Python list.
eliminar = json.loads(Path("Checkpoint.txt").read_text())

In [11]:
corpus.shape

(22920, 5)

In [12]:
# `eliminar` holds the row labels collected by filtrar_similares, so drop by
# label instead of reinterpreting them as positions via corpus.index[...].
corpus.drop(index=eliminar, inplace=True)

In [13]:
corpus.shape

(13359, 5)

In [14]:
corpus.to_csv("Corpus.csv", sep=',', index=False)

### 3) Filtramos los resultados en base al contexto; para ello aplicamos clusterización con DBSCAN, que está pensado para encontrar espurios.

In [78]:
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
import plotly.express as px
import plotly.io as pio
from sentence_transformers import SentenceTransformer

def cargar_modelo_embedding(nombre):
    """Instantiate the SentenceTransformer model identified by ``nombre``."""
    modelo = SentenceTransformer(nombre)
    return modelo
    
def cargar_corpus(ruta):
    """Read the CSV at ``ruta`` into a DataFrame, splitting lines on '\\n' only."""
    opciones_lectura = {"lineterminator": '\n'}
    return pd.read_csv(ruta, **opciones_lectura)

def calcular_embeddings(corpus, modelo, columnas=None):
    """Compute one embedding per row of ``corpus``.

    All the selected fields of a row are converted to ``str`` and joined with
    single spaces, and the resulting text is encoded as one sentence. The
    previous version encoded each field separately and kept only the first
    result, so every column but the first was silently ignored.

    Args:
        corpus: DataFrame with the texts.
        modelo: Object exposing ``encode(text)``.
        columnas: Optional list of columns to use; when empty or None every
            column of ``corpus`` is used.

    Returns:
        A list with the value returned by ``modelo.encode`` for each row, in
        row order.
    """
    datos = corpus[columnas] if columnas else corpus
    total = datos.shape[0]

    embs = []
    for index, row in datos.iterrows():
        print(f"Fase {index} de {total}", end="\r")
        # Join every field of the row into a single string before encoding.
        texto = " ".join(str(elem) for elem in row)
        embs.append(modelo.encode(texto))

    return embs

def encontrar_clusteres(embs, metrica="cosine", eps=0.5):
    """Fit DBSCAN on ``embs`` and return the fitted estimator.

    Args:
        embs: Embeddings to cluster.
        metrica: Distance metric handed to DBSCAN.
        eps: Neighbourhood radius handed to DBSCAN.
    """
    agrupador = DBSCAN(metric=metrica, eps=eps)
    return agrupador.fit(embs)
    
def proyectar_puntos(embs, dimension=2):
    """Project ``embs`` with PCA onto ``dimension`` components.

    The lower-dimensional result is meant for visualising the embeddings.
    """
    reductor = PCA(n_components=dimension)
    return reductor.fit_transform(embs)

def mostrar_espurios(texto, clusteres, proyeccion):
    """Show a scatter plot of the projected points coloured by cluster label.

    Args:
        texto: Texts shown when hovering over each point, in the same order
            as the rows of ``proyeccion``.
        clusteres: Fitted clustering object exposing ``labels_``.
        proyeccion: 2-D array whose first two columns are used as the
            plotting coordinates.

    Side effects:
        Sets plotly's default renderer to 'iframe' and displays the figure.
    """
    # Auxiliary DataFrame with the data to plot.
    aux = pd.DataFrame(
        {
            "y": clusteres.labels_,
            "comp-1": proyeccion[:, 0],
            "comp-2": proyeccion[:, 1],
            # list() pairs the texts with the points by position; assigning a
            # Series would align on its index labels and misplace (or blank
            # out) texts whenever the corpus index is not 0..n-1.
            "text": list(texto),
        }
    )

    # Build and display the chart.
    pio.renderers.default = 'iframe'
    fig = px.scatter(
        aux, x='comp-1', y='comp-2',
        color='y',
        # Pass the hover columns as a list of column names.
        hover_data=['text'])
    fig.show()

def limpiar_espurios(corpus, clusteres, grupo):
    """Keep only the rows of ``corpus`` assigned to cluster ``grupo``.

    ``grupo`` is the main group to preserve; rows with any other label are
    discarded. The returned DataFrame has a fresh 0..n-1 index.
    """
    pertenece_al_grupo = clusteres.labels_ == grupo
    return corpus.loc[pertenece_al_grupo].reset_index(drop=True)

def limpiar_grupo(corpus, clusteres, grupo):
    """Remove from ``corpus`` the rows assigned to cluster ``grupo``.

    ``grupo`` is the group to discard (e.g. a large set of redundant
    questions); every other row is kept. The returned DataFrame has a fresh
    0..n-1 index.
    """
    fuera_del_grupo = clusteres.labels_ != grupo
    return corpus.loc[fuera_del_grupo].reset_index(drop=True)

In [2]:
# Load the questions-only corpus and compute one embedding per row with a
# Spanish sentence-similarity model.
corpus = cargar_corpus("Corpus preguntas.csv")
embs = calcular_embeddings(corpus, cargar_modelo_embedding('hiiamsid/sentence_similarity_spanish_es'))

Fase 22919 de 22920

In [51]:
df_emb = pd.DataFrame(embs)
df_emb.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-1.408472,-0.284516,-0.988199,-0.991739,-0.688388,-0.583975,-0.010682,-0.782642,0.8152,0.058693,...,-0.307022,-0.671821,0.512422,-0.173581,0.121643,-1.160892,0.168453,-0.235679,0.560121,-0.002953
1,-1.06976,-0.659508,-0.800598,-1.181654,-0.515965,-0.837448,0.418703,-1.308217,0.126495,-0.394154,...,0.9363,-0.620395,0.337323,0.235471,1.315079,-1.661706,-0.706733,0.136573,0.543989,0.514949
2,-0.333933,-0.150734,0.168753,-0.742309,-0.459517,-0.382396,0.058969,-0.195954,0.487164,-0.474504,...,1.074201,-1.140938,1.145919,-1.200268,0.073237,0.355085,0.005564,0.110881,1.224088,-0.557225
3,-0.691843,-0.970027,0.987329,-0.628927,-1.122727,0.084831,0.063809,0.829716,0.313569,-0.135334,...,-0.034081,-0.241485,0.530372,-0.72319,-0.512412,0.532074,0.150592,0.262125,0.909787,-0.612894
4,-0.17296,0.744163,0.876301,-0.204251,0.195257,-0.310812,0.091865,0.260345,-0.705332,0.39724,...,-0.102905,-0.075649,0.283769,0.016766,0.104198,0.419002,0.004514,-1.726378,0.560395,0.177829


In [71]:
clusteres = encontrar_clusteres(df_emb, eps=0.1)

In [69]:
proyeccion = proyectar_puntos(df_emb)

In [72]:
mostrar_espurios(corpus["Pregunta"], clusteres, proyeccion)

In [73]:
df_emb['Pregunta'] = corpus['Pregunta']
df_emb['Grupo'] = clusteres.labels_
df_emb.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,760,761,762,763,764,765,766,767,Pregunta,Grupo
0,-1.408472,-0.284516,-0.988199,-0.991739,-0.688388,-0.583975,-0.010682,-0.782642,0.8152,0.058693,...,0.512422,-0.173581,0.121643,-1.160892,0.168453,-0.235679,0.560121,-0.002953,¿Cuál es el propósito de la protección al menor?,-1
1,-1.06976,-0.659508,-0.800598,-1.181654,-0.515965,-0.837448,0.418703,-1.308217,0.126495,-0.394154,...,0.337323,0.235471,1.315079,-1.661706,-0.706733,0.136573,0.543989,0.514949,¿Qué medidas puede tomar el médico para ayudar...,-1
2,-0.333933,-0.150734,0.168753,-0.742309,-0.459517,-0.382396,0.058969,-0.195954,0.487164,-0.474504,...,1.145919,-1.200268,0.073237,0.355085,0.005564,0.110881,1.224088,-0.557225,¿Cuál es el propósito de la cláusula de concie...,-1
3,-0.691843,-0.970027,0.987329,-0.628927,-1.122727,0.084831,0.063809,0.829716,0.313569,-0.135334,...,0.530372,-0.72319,-0.512412,0.532074,0.150592,0.262125,0.909787,-0.612894,¿Cuál es el derecho a la intimidad?,-1
4,-0.17296,0.744163,0.876301,-0.204251,0.195257,-0.310812,0.091865,0.260345,-0.705332,0.39724,...,0.283769,0.016766,0.104198,0.419002,0.004514,-1.726378,0.560395,0.177829,¿Cuál es la Constitución de España?,-1


In [74]:
import numpy as np

unique, counts = np.unique(clusteres.labels_, return_counts=True)
unique

array([ -1,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,
        12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
        25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,
        38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,
        51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
        64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,
        77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
        90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102,
       103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
       116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128,
       129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141,
       142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154,
       155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
       168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 17

In [75]:
# Most frequent cluster label (the mode): count every distinct label in the
# list and keep the one with the highest count.
# NOTE(review): scikit-learn's DBSCAN documents -1 as the noise label, so a
# mode of -1 would mean most points were left unclustered — confirm this is
# the intended "main" group.
moda = list(clusteres.labels_)
max(set(moda), key=moda.count)

-1

In [76]:
corpus_principal = limpiar_espurios(corpus, clusteres, -1)

In [79]:
corpus_espurios = limpiar_grupo(corpus, clusteres, -1)

In [80]:
corpus_principal.to_csv("Corpus Preguntas Limpio.csv", sep=',', index=False)
corpus_espurios.to_csv("Corpus Preguntas Espurios.csv", sep=',', index=False)

In [81]:
corpus_principal.shape

(18697, 1)

In [82]:
corpus_espurios.shape

(4223, 1)