In [1]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from nltk.translate.bleu_score import sentence_bleu

nltk.download('punkt')

# Carga del dataset
df_ref = pd.read_csv("df_ref.csv")

def generate_summary_kmeans(article, num_clusters, reference_summary):
    sentences = nltk.sent_tokenize(article)
    if num_clusters > len(sentences):
        return article, 0.0
    
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sentences)
    
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(X)
    
    centroid_indices = pairwise_distances_argmin_min(kmeans.cluster_centers_, X, metric='euclidean')[0]
    
    summary_sentences = [sentences[i] for i in centroid_indices]
    generated_summary = ' '.join(summary_sentences)
    
    reference_tokens = nltk.word_tokenize(reference_summary)
    hypothesis_tokens = nltk.word_tokenize(generated_summary)
    bleu = sentence_bleu([reference_tokens], hypothesis_tokens)
    
    return generated_summary, bleu

bleu_scores = []

for index, row in df_ref.iterrows():
    article = row['article']
    reference_summary = row['abstract']
    
    generated_summary, bleu = generate_summary_kmeans(article, num_clusters=3, reference_summary=reference_summary)
    
    bleu_scores.append(bleu)

bleu_scores_df = pd.DataFrame({
    'BLEU': bleu_scores
})

pd.options.display.float_format = '{:.10f}'.format

print(bleu_scores_df)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oscarandres.pinilla\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package punkt is already up-to-date!
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider

           BLEU
0  0.0000000000
1  0.0215594725
2  0.0584406286
3  0.0000000000
4  0.0000000000
5  0.0725946472
6  0.1327021398
7  0.0000000000
8  0.0547984782
9  0.0726925145
10 0.0000000000
11 0.0485787411
12 0.0000000000
13 0.0266010962
14 0.0061264444
15 0.0000000000
16 0.0000000000
17 0.0625641325
18 0.0000000000
19 0.0000000000


In [2]:
bleu_scores_df

Unnamed: 0,BLEU
0,0.0
1,0.0215594725
2,0.0584406286
3,0.0
4,0.0
5,0.0725946472
6,0.1327021398
7,0.0
8,0.0547984782
9,0.0726925145


In [3]:
# Para posteriores cálculos se extrae el dataframe 
bleu_scores_df.to_csv('BLEU_scores_Kmeans_df.csv', index=False)