In [19]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.translate.bleu_score import sentence_bleu

nltk.download('punkt')
nltk.download('stopwords')

# Carga del dataset
df_ref = pd.read_csv("df_ref.csv")

bleu_scores = []

def generate_summary(article, num_sentences):
    sentences = sent_tokenize(article)
    stop_words = set(stopwords.words('english'))
    word_tokens = [word_tokenize(sentence) for sentence in sentences]
    filtered_sentences = [' '.join([word for word in words if word.lower() not in stop_words]) for words in word_tokens]

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(filtered_sentences)

    similarity_matrix = X.dot(X.T)

    scores = similarity_matrix.sum(axis=0)
    top_sentence_indices = sorted(range(len(scores)), key=lambda i: scores[i])[-num_sentences:]
    
    summary = ' '.join([sentences[i] for i in top_sentence_indices])
    
    return summary

for index, row in df_ref.iterrows():
    article = row['article']
    original_summary = row['abstract']
    
    generated_summary = generate_summary(article, num_sentences=3) 
    
    hypothesis_tokens = word_tokenize(generated_summary)
    reference_tokens = word_tokenize(original_summary)
    
    bleu = sentence_bleu([reference_tokens], hypothesis_tokens)
    
    bleu_scores.append(bleu)

bleu_scores_df = pd.DataFrame({
    'BLEU': bleu_scores
})

pd.options.display.float_format = '{:.10f}'.format

print(bleu_scores_df)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oscarandres.pinilla\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\oscarandres.pinilla\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package stopwords is already up-to-date!
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider usi

           BLEU
0  0.0007277250
1  0.0000000000
2  0.0141682980
3  0.0042923562
4  0.0000091974
5  0.0015113701
6  0.0000000000
7  0.0012691017
8  0.0000000000
9  0.0125157877
10 0.0000000000
11 0.0070182728
12 0.0000000000
13 0.0000001804
14 0.0019937322
15 0.0000000000
16 0.0000000000
17 0.0480140617
18 0.0000000000
19 0.0000000000


In [20]:
bleu_scores_df

Unnamed: 0,BLEU
0,0.000727725
1,0.0
2,0.014168298
3,0.0042923562
4,9.1974e-06
5,0.0015113701
6,0.0
7,0.0012691017
8,0.0
9,0.0125157877


In [21]:
# Para posteriores cálculos se extrae el dataframe 
bleu_scores_df.to_csv('BLEU_scores_TextRank_df.csv', index=False)

In [1]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from summa import summarizer

nltk.download('punkt')

# Carga del dataset
df_ref = pd.read_csv("df_ref.csv")

bleu_scores = []

for index, row in df_ref.iterrows():
    article = row['article']
    original_summary = row['abstract']
    
    # Genera el resumen usando TextRank
    generated_summary = summarizer.summarize(article, ratio=0.2)  # Puedes ajustar la proporción según lo desees
    
    # Tokenización del resumen generado y del resumen original
    hypothesis_tokens = word_tokenize(generated_summary)
    reference_tokens = word_tokenize(original_summary)
    
    # Cálculo de BLEU Score
    bleu = sentence_bleu([reference_tokens], hypothesis_tokens)
    
    bleu_scores.append(bleu)

bleu_scores_df = pd.DataFrame({
    'BLEU': bleu_scores
})

pd.options.display.float_format = '{:.10f}'.format

print(bleu_scores_df)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oscarandres.pinilla\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package punkt is already up-to-date!


           BLEU
0  0.0748935423
1  0.1529597873
2  0.0346570319
3  0.0156254953
4  0.0234341730
5  0.0266479114
6  0.0796730365
7  0.1386817566
8  0.2168152423
9  0.0784953590
10 0.0510081553
11 0.0522166009
12 0.0450548765
13 0.1517918626
14 0.1260095406
15 0.1015122026
16 0.0158863685
17 0.0379230741
18 0.0562689669
19 0.0505752050


In [2]:
bleu_scores_df

Unnamed: 0,BLEU
0,0.0748935423
1,0.1529597873
2,0.0346570319
3,0.0156254953
4,0.023434173
5,0.0266479114
6,0.0796730365
7,0.1386817566
8,0.2168152423
9,0.078495359


In [None]:
# Para posteriores cálculos se extrae el dataframe 
bleu_scores_df.to_csv('BLEU_scores_TextRank_df.csv', index=False)