## LSA (Latent Semantic Analysis) 

In [1]:
import pandas as pd
from rouge import Rouge
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.tokenize import sent_tokenize

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Carga del dataset
df_ref = pd.read_csv("df_ref.csv")
df_ref

Unnamed: 0,article,abstract,article_length,abstract_length,top_words,similarity,similarity_percentage
0,direct central nervous system ( cns ) involvem...,a 25-year - old male patient presented to our ...,1394,293,"['of', 'and', 'the', 'in', 'with', 'a', 's', '...",0.955112,95.511218
1,primary nocturnal enuresis is intermittent noc...,several therapeutic options have been describe...,2014,247,"['the', 'in', 'of', 'and', 'desmopressin', 'to...",0.882804,88.280379
2,guillain - barr syndrome ( gbs ) is an immune ...,psychiatric symptoms in guillain - barr syndro...,4195,108,"['and', 'of', 'the', 'her', 'to', 'with', 'was...",0.877994,87.799449
3,progress in three - dimensional ( 3d ) echocar...,this review covers the role of three - dimensi...,4951,121,"['the', 'and', 'of', 'in', 'with', 'a', 'valve...",0.905861,90.586089
4,municipal solid waste workers ( mswws ) or ref...,background : solid waste management has emerge...,6167,375,"['and', 'the', 'of', 'to', 'waste', 'in', 'wor...",0.872562,87.256238
5,stroke is a clinical emergency that commonly r...,to assess the stroke workload of italian neuro...,7252,220,"['the', 'of', 'and', 'stroke', 'in', 'nu', 'fo...",0.748964,74.896378
6,chronic kidney disease ( ckd ) is one of the m...,glomerular filtration rate ( gfr ) is still a ...,4526,227,"['the', 'of', 'and', 'a', 'to', 'in', 'gfr', '...",0.866245,86.62448
7,"over the past decade , it has increasingly bec...",study backgroundpatients who experience a recu...,1344,268,"['the', 'of', 'a', 'patients', 'to', 'and', 'v...",0.936864,93.686421
8,prostate cancer is one of the most prevalent t...,"purposein clinical practice , atypical small a...",2279,279,"['the', 'in', 'of', 'cancer', 'and', 'prostate...",0.954418,95.441754
9,congenital cervical swelling constitutes a het...,an 18-year - old male patient presented with a...,841,146,"['the', 'of', 'in', 'was', 'a', 'and', 'cyst',...",0.926442,92.644246


In [3]:
# Objeto Rouge para calcular la métrica ROUGE
rouge = Rouge()

In [4]:
# Listas para almacenar los resultados de la métrica ROUGE
rouge_1_scores = []
rouge_2_scores = []
rouge_l_scores = []

In [5]:
# Función para generar el resumen utilizando LSA
def generate_summary(article, num_sentences):
    # TF-IDF
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform([article])

    lsa_model = TruncatedSVD(n_components=100)
    lsa_matrix = lsa_model.fit_transform(tfidf_matrix)

    article_sentences = sent_tokenize(article)
    article_scores = lsa_matrix.dot(lsa_matrix.T)
    top_sentence_indices = np.argsort(article_scores[0])[-num_sentences:]
    summary = ' '.join([article_sentences[i] for i in top_sentence_indices])
    
    return summary

In [6]:
# Se itera sobre cada fila del DataFrame
for index, row in df_ref.iterrows():
    article = row['article']
    original_summary = row['abstract']
    
    generated_summary = generate_summary(article, num_sentences=3) 
    
    scores = rouge.get_scores(generated_summary, original_summary)
    
    rouge_1_scores.append(scores[0]['rouge-1']['f'])
    rouge_2_scores.append(scores[0]['rouge-2']['f'])
    rouge_l_scores.append(scores[0]['rouge-l']['f'])

  self.explained_variance_ratio_ = exp_var / full_var


In [7]:
# DataFrame para almacenar los resultados de ROUGE
rouge_scores_df = pd.DataFrame({
    'ROUGE-1': rouge_1_scores,
    'ROUGE-2': rouge_2_scores,
    'ROUGE-L': rouge_l_scores
})

In [9]:
rouge_scores_df

Unnamed: 0,ROUGE-1,ROUGE-2,ROUGE-L
0,0.211055,0.066445,0.190955
1,0.177778,0.018957,0.133333
2,0.213592,0.111111,0.194175
3,0.22,0.104478,0.2
4,0.205128,0.068111,0.184615
5,0.253521,0.061404,0.239437
6,0.173913,0.02439,0.136646
7,0.232044,0.056939,0.165746
8,0.205128,0.052239,0.153846
9,0.302521,0.05988,0.252101


In [10]:
# Para posteriores cálculos se extrae el dataframe 'rouge_scores_LSA_df'
rouge_scores_df.to_csv('rouge_scores_LSA_df.csv', index=False)

In [8]:
average_rouge_scores = rouge_scores_df.mean()
print("Promedio de métricas ROUGE para todo el dataset:")
print(average_rouge_scores)

Promedio de métricas ROUGE para todo el dataset:
ROUGE-1    0.209074
ROUGE-2    0.058214
ROUGE-L    0.182587
dtype: float64
