In [1]:
import pandas as pd
import nltk
import numpy as np
from rouge import Rouge
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.translate.bleu_score import sentence_bleu

nltk.download('punkt')

# Carga del dataset
df_ref = pd.read_csv("df_ref.csv")

rouge = Rouge()

bleu_scores = []

def generate_summary(article, num_sentences):
    # TF-IDF
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform([article])

    lsa_model = TruncatedSVD(n_components=100)
    lsa_matrix = lsa_model.fit_transform(tfidf_matrix)

    article_sentences = sent_tokenize(article)
    article_scores = lsa_matrix.dot(lsa_matrix.T)
    top_sentence_indices = np.argsort(article_scores[0])[-num_sentences:]
    summary = ' '.join([article_sentences[i] for i in top_sentence_indices])
    
    return summary

for index, row in df_ref.iterrows():
    article = row['article']
    original_summary = row['abstract']
    
    generated_summary = generate_summary(article, num_sentences=3) 
    
    hypothesis_tokens = word_tokenize(generated_summary)
    reference_tokens = word_tokenize(original_summary)
    
    bleu = sentence_bleu([reference_tokens], hypothesis_tokens)
    
    bleu_scores.append(bleu)

bleu_scores_df = pd.DataFrame({
    'BLEU': bleu_scores
})

pd.options.display.float_format = '{:.10f}'.format

print(bleu_scores_df)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oscarandres.pinilla\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package punkt is already up-to-date!
  self.explained_variance_ratio_ = exp_var / full_var
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how man

           BLEU
0  0.0007277250
1  0.0000000000
2  0.0141682980
3  0.0042923562
4  0.0000091974
5  0.0015113701
6  0.0000000000
7  0.0012691017
8  0.0000000000
9  0.0125157877
10 0.0000000000
11 0.0070182728
12 0.0000000000
13 0.0000001804
14 0.0019937322
15 0.0000000000
16 0.0000000000
17 0.0480140617
18 0.0000000000
19 0.0000000000


In [2]:
bleu_scores_df

Unnamed: 0,BLEU
0,0.000727725
1,0.0
2,0.014168298
3,0.0042923562
4,9.1974e-06
5,0.0015113701
6,0.0
7,0.0012691017
8,0.0
9,0.0125157877


In [3]:
# Para posteriores cálculos se extrae el dataframe 
bleu_scores_df.to_csv('BLEU_scores_LSA_df.csv', index=False)

In [1]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.translate.bleu_score import sentence_bleu

nltk.download('punkt')
nltk.download('stopwords')

# Carga del dataset
df_ref = pd.read_csv("df_ref.csv")

bleu_scores = []

def generate_summary_lsa(article, num_sentences):
    sentences = sent_tokenize(article)
    stop_words = set(stopwords.words('english'))
    word_tokens = [word_tokenize(sentence) for sentence in sentences]
    filtered_sentences = [' '.join([word for word in words if word.lower() not in stop_words]) for words in word_tokens]

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(filtered_sentences)

    # Aplicamos LSA
    lsa_model = TruncatedSVD(n_components=num_sentences, random_state=42)
    lsa_vectors = lsa_model.fit_transform(X)

    # Obtenemos los índices de las oraciones más relevantes según LSA
    top_sentence_indices = lsa_vectors.argsort()[:, -num_sentences:]

    summary = ' '.join([sentences[i] for i in top_sentence_indices.ravel()])

    return summary

for index, row in df_ref.iterrows():
    article = row['article']
    original_summary = row['abstract']
    
    generated_summary = generate_summary_lsa(article, num_sentences=3) 
    
    hypothesis_tokens = word_tokenize(generated_summary)
    reference_tokens = word_tokenize(original_summary)
    
    bleu = sentence_bleu([reference_tokens], hypothesis_tokens)
    
    bleu_scores.append(bleu)

bleu_scores_df = pd.DataFrame({
    'BLEU': bleu_scores
})

pd.options.display.float_format = '{:.10f}'.format

print(bleu_scores_df)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oscarandres.pinilla\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\oscarandres.pinilla\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package stopwords is already up-to-date!
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use S

           BLEU
0  0.0035135469
1  0.0000000000
2  0.0004276488
3  0.0015172178
4  0.0011233799
5  0.0011298705
6  0.0006139775
7  0.0048476089
8  0.0027525824
9  0.0030455333
10 0.0000000000
11 0.0043701194
12 0.0006217237
13 0.0030515849
14 0.0048884096
15 0.0008728348
16 0.0005401870
17 0.0013288313
18 0.0005875167
19 0.0000000000


In [3]:
bleu_scores_df

Unnamed: 0,BLEU
0,0.0035135469
1,0.0
2,0.0004276488
3,0.0015172178
4,0.0011233799
5,0.0011298705
6,0.0006139775
7,0.0048476089
8,0.0027525824
9,0.0030455333


In [2]:
# Para posteriores cálculos se extrae el dataframe 
bleu_scores_df.to_csv('BLEU_scores_LSA_df.csv', index=False)