## K-MEANS CLUSTERING


In [9]:
import pandas as pd
from rouge import Rouge
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances_argmin_min
import numpy as np
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oscarandres.pinilla\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
# Carga del dataset
df_ref = pd.read_csv("df_ref.csv")
df_ref

Unnamed: 0,article,abstract,article_length,abstract_length,top_words,similarity,similarity_percentage
0,direct central nervous system ( cns ) involvem...,a 25-year - old male patient presented to our ...,1394,293,"['of', 'and', 'the', 'in', 'with', 'a', 's', '...",0.955112,95.511218
1,primary nocturnal enuresis is intermittent noc...,several therapeutic options have been describe...,2014,247,"['the', 'in', 'of', 'and', 'desmopressin', 'to...",0.882804,88.280379
2,guillain - barr syndrome ( gbs ) is an immune ...,psychiatric symptoms in guillain - barr syndro...,4195,108,"['and', 'of', 'the', 'her', 'to', 'with', 'was...",0.877994,87.799449
3,progress in three - dimensional ( 3d ) echocar...,this review covers the role of three - dimensi...,4951,121,"['the', 'and', 'of', 'in', 'with', 'a', 'valve...",0.905861,90.586089
4,municipal solid waste workers ( mswws ) or ref...,background : solid waste management has emerge...,6167,375,"['and', 'the', 'of', 'to', 'waste', 'in', 'wor...",0.872562,87.256238
5,stroke is a clinical emergency that commonly r...,to assess the stroke workload of italian neuro...,7252,220,"['the', 'of', 'and', 'stroke', 'in', 'nu', 'fo...",0.748964,74.896378
6,chronic kidney disease ( ckd ) is one of the m...,glomerular filtration rate ( gfr ) is still a ...,4526,227,"['the', 'of', 'and', 'a', 'to', 'in', 'gfr', '...",0.866245,86.62448
7,"over the past decade , it has increasingly bec...",study backgroundpatients who experience a recu...,1344,268,"['the', 'of', 'a', 'patients', 'to', 'and', 'v...",0.936864,93.686421
8,prostate cancer is one of the most prevalent t...,"purposein clinical practice , atypical small a...",2279,279,"['the', 'in', 'of', 'cancer', 'and', 'prostate...",0.954418,95.441754
9,congenital cervical swelling constitutes a het...,an 18-year - old male patient presented with a...,841,146,"['the', 'of', 'in', 'was', 'a', 'and', 'cyst',...",0.926442,92.644246


In [11]:
# objeto Rouge para calcular la métrica ROUGE
rouge = Rouge()

In [12]:
# listas para almacenar los resultados de la métrica ROUGE
rouge_1_scores = []
rouge_2_scores = []
rouge_l_scores = []

In [16]:
# Función para generar un resumen utilizando K-means Clustering
def generate_summary_kmeans(article, num_clusters):
    # Dividir el artículo en oraciones
    sentences = nltk.sent_tokenize(article)
    
    # Verificar si hay suficientes oraciones para el número de clusters especificado
    if num_clusters > len(sentences):
        return article  # Si no hay suficientes oraciones, simplemente devolvemos el artículo completo
    
    # Vectorizar las oraciones utilizando TF-IDF
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sentences)
    
    # Aplicar K-means para clusterizar las oraciones
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(X)
    
    # Obtener los índices de las oraciones más cercanas a los centroides de los clusters
    centroid_indices = pairwise_distances_argmin_min(kmeans.cluster_centers_, X, metric='euclidean')[0]
    
    # Ordenar y seleccionar las oraciones más cercanas a los centroides como resumen
    summary_sentences = [sentences[i] for i in centroid_indices]
    summary = ' '.join(summary_sentences)
    
    return summary

In [18]:
# Iteración sobre cada fila del DataFrame
for index, row in df_ref.iterrows():
    # Obtener el artículo y el resumen original
    article = row['article']
    original_summary = row['abstract']
    
    # Generar el resumen utilizando K-means Clustering
    num_clusters = 3  # Número de clusters deseado
    generated_summary = generate_summary_kmeans(article, num_clusters)
    
    # Calcular ROUGE para el resumen generado y el resumen original
    scores = rouge.get_scores(generated_summary, original_summary)
    
    # Extraer los valores específicos de ROUGE
    rouge_1_scores.append(scores[0]['rouge-1']['f'])
    rouge_2_scores.append(scores[0]['rouge-2']['f'])
    rouge_l_scores.append(scores[0]['rouge-l']['f'])

In [19]:
# Crear un DataFrame para almacenar los resultados de ROUGE
rouge_scores_df = pd.DataFrame({
    'ROUGE-1': rouge_1_scores,
    'ROUGE-2': rouge_2_scores,
    'ROUGE-L': rouge_l_scores
})

In [21]:
rouge_scores_df

Unnamed: 0,ROUGE-1,ROUGE-2,ROUGE-L
0,0.0,0.0,0.0
1,0.346369,0.103321,0.301676
2,0.239521,0.080357,0.179641
3,0.235294,0.050761,0.205882
4,0.0,0.0,0.0
5,0.251497,0.119718,0.233533
6,0.419048,0.208333,0.361905
7,0.253659,0.037152,0.234146
8,0.358696,0.164038,0.336957
9,0.313253,0.119048,0.26506


In [22]:
# Para posteriores cálculos se extrae el dataframe 'rouge_scores_KMEANS_df'
rouge_scores_df.to_csv('rouge_scores_KMEANS_df.csv', index=False)

In [20]:
average_rouge_scores = rouge_scores_df.mean()
print("Promedio de métricas ROUGE para todo el dataset:")
print(average_rouge_scores)

Promedio de métricas ROUGE para todo el dataset:
ROUGE-1    0.211707
ROUGE-2    0.068363
ROUGE-L    0.179567
dtype: float64
