## TextRank

En este apartado de código se usará la técnica TextRank para generar el resumen y posteriormente comparar la calidad de extracción en contraste con el resumen original.

In [2]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from rouge import Rouge

In [3]:
# Recursos necesarios de NLTK
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oscarandres.pinilla\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\oscarandres.pinilla\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
 
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
 
    return 1 - cosine_distance(vector1, vector2)

In [7]:
def build_similarity_matrix(sentences, stop_words):
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
 
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: 
                continue 
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

    return similarity_matrix

In [10]:
# Función para generar el resumen
def generate_summary(article, num_sentences):
    stop_words = stopwords.words('english') 
    summarize_text = []

    sentences = sent_tokenize(article)

    sentence_similarity_matrix = build_similarity_matrix(sentences, stop_words)

    scores = np.array([sentence_similarity_matrix.sum(axis=1)]).T
    scores = scores / scores.sum(axis=0)

    ranked_sentence_indexes = [item[0] for item in sorted(enumerate(scores), key=lambda item: -item[1])]

    for i in range(num_sentences):
        summarize_text.append(sentences[ranked_sentence_indexes[i]])
        
    summary = ' '.join(summarize_text)
    return summary

#### Cálculo de la métrica ROUGE

In [11]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [12]:
# Carga del dataset
df_ref = pd.read_csv("df_ref.csv")
df_ref

Unnamed: 0,article,abstract,article_length,abstract_length,top_words,similarity,similarity_percentage
0,direct central nervous system ( cns ) involvem...,a 25-year - old male patient presented to our ...,1394,293,"['of', 'and', 'the', 'in', 'with', 'a', 's', '...",0.955112,95.511218
1,primary nocturnal enuresis is intermittent noc...,several therapeutic options have been describe...,2014,247,"['the', 'in', 'of', 'and', 'desmopressin', 'to...",0.882804,88.280379
2,guillain - barr syndrome ( gbs ) is an immune ...,psychiatric symptoms in guillain - barr syndro...,4195,108,"['and', 'of', 'the', 'her', 'to', 'with', 'was...",0.877994,87.799449
3,progress in three - dimensional ( 3d ) echocar...,this review covers the role of three - dimensi...,4951,121,"['the', 'and', 'of', 'in', 'with', 'a', 'valve...",0.905861,90.586089
4,municipal solid waste workers ( mswws ) or ref...,background : solid waste management has emerge...,6167,375,"['and', 'the', 'of', 'to', 'waste', 'in', 'wor...",0.872562,87.256238
5,stroke is a clinical emergency that commonly r...,to assess the stroke workload of italian neuro...,7252,220,"['the', 'of', 'and', 'stroke', 'in', 'nu', 'fo...",0.748964,74.896378
6,chronic kidney disease ( ckd ) is one of the m...,glomerular filtration rate ( gfr ) is still a ...,4526,227,"['the', 'of', 'and', 'a', 'to', 'in', 'gfr', '...",0.866245,86.62448
7,"over the past decade , it has increasingly bec...",study backgroundpatients who experience a recu...,1344,268,"['the', 'of', 'a', 'patients', 'to', 'and', 'v...",0.936864,93.686421
8,prostate cancer is one of the most prevalent t...,"purposein clinical practice , atypical small a...",2279,279,"['the', 'in', 'of', 'cancer', 'and', 'prostate...",0.954418,95.441754
9,congenital cervical swelling constitutes a het...,an 18-year - old male patient presented with a...,841,146,"['the', 'of', 'in', 'was', 'a', 'and', 'cyst',...",0.926442,92.644246


In [23]:
# Se crea un objeto Rouge para calcular la métrica ROUGE
rouge = Rouge()

In [24]:
# Listas para almacenar los resultados de la métrica ROUGE
rouge_1_scores = []
rouge_2_scores = []
rouge_l_scores = []

In [25]:
# Se itera sobre cada fila del DataFrame
for index, row in df_ref.iterrows():
    article = row['article']
    original_summary = row['abstract']
    
    generated_summary = generate_summary(article, num_sentences=3)  
    
    scores = rouge.get_scores(generated_summary, original_summary)
    
    rouge_1_scores.append(scores[0]['rouge-1']['f'])
    rouge_2_scores.append(scores[0]['rouge-2']['f'])
    rouge_l_scores.append(scores[0]['rouge-l']['f'])

In [26]:
rouge_scores_df = pd.DataFrame({
    'ROUGE-1': rouge_1_scores,
    'ROUGE-2': rouge_2_scores,
    'ROUGE-L': rouge_l_scores
})

In [28]:
rouge_scores_df

Unnamed: 0,ROUGE-1,ROUGE-2,ROUGE-L
0,0.33463,0.102041,0.280156
1,0.269663,0.035211,0.258427
2,0.330935,0.088398,0.258993
3,0.240601,0.011429,0.195489
4,0.212454,0.04157,0.197802
5,0.301205,0.080586,0.289157
6,0.269058,0.052786,0.215247
7,0.338308,0.062893,0.288557
8,0.341969,0.123894,0.290155
9,0.294416,0.107023,0.243655


In [30]:
# Para posteriores cálculos se extrae el dataframe 'rouge_scores_TextRank_df'
rouge_scores_df.to_csv('rouge_scores_TextRank_df.csv', index=False)

In [3]:
import pandas as pd
dataset_rouge_scores = pd.read_csv("rouge_scores_TextRank_df.csv")
dataset_rouge_scores

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,ROUGE-1,ROUGE-2,ROUGE-L
0,0.33463,0.102041,0.280156
1,0.269663,0.035211,0.258427
2,0.330935,0.088398,0.258993
3,0.240601,0.011429,0.195489
4,0.212454,0.04157,0.197802
5,0.301205,0.080586,0.289157
6,0.269058,0.052786,0.215247
7,0.338308,0.062893,0.288557
8,0.341969,0.123894,0.290155
9,0.294416,0.107023,0.243655


In [4]:
dataset_rouge_scores.mean()

ROUGE-1    0.327824
ROUGE-2    0.093777
ROUGE-L    0.279922
dtype: float64

In [9]:
dataset_rouge_scores.std()

ROUGE-1    0.070101
ROUGE-2    0.050272
ROUGE-L    0.059092
dtype: float64

In [8]:
import pandas as pd

data = {'ROUGE-1': [0.327824],
        'ROUGE-2': [0.093777],
        'ROUGE-L': [0.279922]}

dataset_rouge_scores_mean = pd.DataFrame(data, index=['media'])

dataset_rouge_scores_mean

Unnamed: 0,ROUGE-1,ROUGE-2,ROUGE-L
media,0.327824,0.093777,0.279922


In [12]:
import pandas as pd

dataset_rouge_scores = pd.read_csv("rouge_scores_TextRank_df.csv")

rouge_1_data = dataset_rouge_scores["ROUGE-1"]
rouge_2_data = dataset_rouge_scores["ROUGE-2"]
rouge_l_data = dataset_rouge_scores["ROUGE-L"]

rouge_1_stats = rouge_1_data.describe()

rouge_2_stats = rouge_2_data.describe()

rouge_l_stats = rouge_l_data.describe()

stats_df = pd.DataFrame({
    "ROUGE-1": rouge_1_stats,
    "ROUGE-2": rouge_2_stats,
    "ROUGE-L": rouge_l_stats
})

stats_df

Unnamed: 0,ROUGE-1,ROUGE-2,ROUGE-L
count,20.0,20.0,20.0
mean,0.327824,0.093777,0.279922
std,0.070101,0.050272,0.059092
min,0.212454,0.011429,0.195489
25%,0.283518,0.054053,0.240174
50%,0.318068,0.083784,0.278606
75%,0.349334,0.126106,0.294684
max,0.482759,0.193548,0.388571
