In [2]:
# Import dependencies
import numpy as np
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge
import networkx as nx
from nltk.tokenize import sent_tokenize
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the spaCy "en_core_web_sm" pipeline once; preprocess_text() calls it for every sentence.
nlp = spacy.load("en_core_web_sm")

In [9]:
# Functions used in code

def build_sentence_matrix(preprocessed_sentences):
    """Vectorise the given sentences with TF-IDF and return the result as a dense array.

    A fresh TfidfVectorizer (default settings) is fitted on exactly the sentences
    passed in, so the vocabulary is local to one article.

    Parameters
    ----------
    preprocessed_sentences : iterable of str
        One whitespace-joined string of terms per sentence.

    Returns
    -------
    numpy.ndarray
        Dense version of the matrix produced by ``fit_transform``.
    """
    tfidf_sparse = TfidfVectorizer().fit_transform(preprocessed_sentences)
    return tfidf_sparse.toarray()


def preprocess_text(text):
    """Split an article into sentences and build a content-word version of each one.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(preprocessed, sentences)`` where ``sentences`` are the sentences found by
        ``sent_tokenize`` in the lightly cleaned text and ``preprocessed[i]`` is the
        space-joined lemmas of the alphabetic, non-stop-word nouns, verbs, adjectives
        and adverbs of ``sentences[i]`` (lower-cased before tagging). A sentence
        without any such token yields an empty string.
    """
    # Drop the "(CNN)" marker and "--" dashes; turn a doubled apostrophe ('') into a double quote.
    cleaned_text = text.replace("(CNN)", "").replace("--", "").replace("''", '"')
    # Split the text into sentences.
    sentences = sent_tokenize(cleaned_text)

    content_pos = {"NOUN", "VERB", "ADJ", "ADV"}
    preprocessed = [
        " ".join(
            token.lemma_
            for token in nlp(sentence.lower())
            if token.is_alpha and not token.is_stop and token.pos_ in content_pos
        )
        for sentence in sentences
    ]

    return preprocessed, sentences

def textrank_with_embedding(article_text , embedding_type: str = "tfidf", num_sentences=5,
                            damping_factor=0.85, similarity_threshold=0.1,
                            max_iter=100, tol=1e-6, max_input_sentences=None):
    """Build an extractive summary of ``article_text`` with TextRank over TF-IDF vectors.

    Sentences are vectorised with ``build_sentence_matrix``, linked by cosine
    similarity, ranked with PageRank, and the best ones are returned in their
    original document order.

    Parameters
    ----------
    article_text : str
        Raw article text.
    embedding_type : str
        Currently ignored: the sentence matrix always comes from
        ``build_sentence_matrix`` (TF-IDF). Kept for interface compatibility.
    num_sentences : int
        Maximum number of sentences in the summary.
    damping_factor : float
        Passed to ``nx.pagerank`` as ``alpha``.
    similarity_threshold : float
        Similarities below this value are set to 0 before the graph is built.
    max_iter, tol :
        Passed through to ``nx.pagerank``.
    max_input_sentences : int or None
        If given, only the first ``max_input_sentences`` sentences are considered.

    Returns
    -------
    str
        The selected original sentences joined by single spaces. An empty string
        when the article contains no sentences. When the sentences cannot be
        vectorised (``build_sentence_matrix`` raises ``ValueError``, e.g. because
        no content words survive preprocessing) the leading sentences are returned.

    Raises
    ------
    networkx.PowerIterationFailedConvergence
        Propagated from ``nx.pagerank`` if it does not converge within ``max_iter``.
    """
    # 1. Preprocess
    preprocessed, original_sentences = preprocess_text(article_text)

    # 2. Truncate the input (slicing past the end is a no-op, so no length check is needed)
    if max_input_sentences is not None:
        preprocessed = preprocessed[:max_input_sentences]
        original_sentences = original_sentences[:max_input_sentences]

    # Nothing to rank: avoid handing an empty corpus to the vectoriser.
    if not original_sentences:
        return ""

    top_count = min(num_sentences, len(original_sentences))

    # 3. Build the embedding / TF-IDF matrix
    try:
        sentence_matrix = build_sentence_matrix(preprocessed)
    except ValueError:
        # The vectoriser found no usable terms, so there is no similarity signal
        # to rank on; fall back to the leading sentences.
        return " ".join(original_sentences[:top_count])

    # 4. Cosine similarity matrix; drop self-similarity and weak links
    sim_matrix = cosine_similarity(sentence_matrix, sentence_matrix)
    np.fill_diagonal(sim_matrix, 0.0)
    sim_matrix[sim_matrix < similarity_threshold] = 0.0

    # 5. Build the graph and run PageRank
    graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(graph, alpha=damping_factor, max_iter=max_iter, tol=tol)

    # 6. Sort sentences by PageRank score and keep the top ones.
    #    Tuples compare on idx after score, so equal scores favour the later sentence.
    ranked = sorted(((score, idx) for idx, score in scores.items()), reverse=True)
    top_idxs = [idx for (_score, idx) in ranked[:top_count]]
    top_idxs.sort()  # restore document order

    # 7. Join the original sentences and return
    summary = " ".join([original_sentences[i] for i in top_idxs])
    return summary

def evaluate_embeddings(articles,highlights, num_sentences=5,
                        damping_factor=0.85, similarity_threshold=0.1,
                        max_iter=100, tol=1e-6):
    """Summarise every article and score it against its reference highlight with ROUGE.

    Parameters
    ----------
    articles, highlights :
        Index-aligned sequences; ``highlights[i]`` is the reference for ``articles[i]``.
    num_sentences, damping_factor, similarity_threshold, max_iter, tol :
        Forwarded unchanged to ``textrank_with_embedding``.

    Returns
    -------
    list[dict]
        One dict per article with the keys ``"rouge-1-f"``, ``"rouge-2-f"`` and
        ``"rouge-l"``, each holding the corresponding F-score.
    """
    scorer = Rouge()
    summariser_options = {
        "num_sentences": num_sentences,
        "damping_factor": damping_factor,
        "similarity_threshold": similarity_threshold,
        "max_iter": max_iter,
        "tol": tol,
        "max_input_sentences": None,
    }

    per_article_scores = []
    for position in range(len(articles)):
        source_text, reference = articles[position], highlights[position]
        candidate = textrank_with_embedding(source_text, **summariser_options)

        # get_scores returns a list; its first element is indexed by metric name below.
        rouge_scores = scorer.get_scores(candidate, reference)[0]
        per_article_scores.append({
            "rouge-1-f": rouge_scores["rouge-1"]["f"],
            "rouge-2-f": rouge_scores["rouge-2"]["f"],
            "rouge-l": rouge_scores["rouge-l"]["f"],
        })

    return per_article_scores

## 1. Learn about the dataset

In [5]:
# Download (or load from the local cache) the "cnn_dailymail" dataset, configuration "3.0.0"
dataset = load_dataset("cnn_dailymail", "3.0.0")

In [6]:
# Divide the dataset into articles and highlights. Only the 'train' split is used:
# its 287113 examples are more than enough for these experiments.
train_split = dataset['train']
articles = train_split['article']
highlights = train_split['highlights']
print("Ammount of examples:",len(highlights))

Ammount of examples: 287113


In [7]:
# Show two example (article, highlight) pairs from the dataset
for label, example_idx in (("Example 1", 0), ("Example 2", 1)):
    print('-'*10,label,'-'*10)
    print("Article example: \n",articles[example_idx])
    print("Highlight example: \n",highlights[example_idx])

---------- Example 1 ----------
Article example: 
 LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number o

In [8]:
# Average (mean) number of sentences per reference highlight, estimated on the first
# 1000 examples. The counts are aggregated with the builtin sum() so that no
# notebook-level variable named `sum` shadows the builtin for later cells, and the
# divisor is the real sample size rather than a hard-coded 1000.
highlight_sample = highlights[:1000]
sentence_counts = [len(sent_tokenize(highlight)) for highlight in highlight_sample]
avg = sum(sentence_counts)/len(sentence_counts)
print("Avarage of database highligh tokens is",avg)

Avarage of database highligh tokens is 3.525


In [None]:
# Multispace experiment: grid search over the PageRank settings and the summary length.
#
# The grids are written out explicitly. np.arange with a float step is not reliable
# here: the recorded results contain damping_factor 0.9 even though arange is meant to
# exclude its stop value, and the generated floats are not guaranteed to compare equal
# to literals such as 0.7 in the `==` filters of the analysis cells.
damping_factor_grid = [0.3, 0.5, 0.7, 0.9]
max_iter_grid = [200, 400, 600, 800]
tol_grid = [0.5e-6, 1.0e-6, 1.5e-6]
num_sentences_grid = [3, 4, 5, 6]

# Slice the evaluation subset once instead of at every grid point.
eval_articles = articles[:100]
eval_highlights = highlights[:100]

optimisation_results = []
for damping_factor in damping_factor_grid:
    for max_iter in max_iter_grid:
        for tol in tol_grid:
            for num_sentences in num_sentences_grid:
                optimisation_results.append({
                    'damping_factor': damping_factor,
                    'num_sentences': num_sentences,
                    'max_iter': max_iter,
                    'tol': tol,
                    'results_array': evaluate_embeddings(
                        articles=eval_articles, highlights=eval_highlights,
                        num_sentences=num_sentences, damping_factor=damping_factor,
                        similarity_threshold=0.1, max_iter=max_iter, tol=tol),
                })

In [None]:
# Legacy patch: result tables produced before the experiment recorded 'num_sentences'
# lack that column. Rebuild it from the grid order: num_sentences is the innermost loop,
# so the column is [3, 4, 5, 6] repeated once per outer combination.
n_outer_combinations = (
    len(np.arange(0.3,0.9,0.2))
    * len(np.arange(200,1000,200))
    * len(np.arange(0.5e-6,2.0e-6,0.5e-6))
)
num_sentences_array_temporary = [3,4,5,6] * n_outer_combinations

# Only patch a table that already exists and is missing the column. This keeps the cell
# safe under "Restart & Run All", where `df` is created by a later cell, and avoids
# overwriting a column the experiment already recorded.
if "df" in globals() and "num_sentences" not in df.columns:
    df['num_sentences'] = num_sentences_array_temporary

In [24]:
# Turn the raw grid-search records into a DataFrame and pickle a backup copy to disk
df = pd.DataFrame(optimisation_results)
with open("experiment_3_results_backup.pkl", "wb") as backup_file:
    pickle.dump(df, backup_file)

In [None]:
# Restore the results table from the backup when needed.
# NOTE: unpickling can execute arbitrary code - only load backup files you created yourself.
with open("experiment_3_results_backup.pkl", "rb") as backup_file:
    df_loaded = pickle.load(backup_file)

In [None]:
# Make sure you really want to use the loaded backup as df: this overrides the earlier df
df = df_loaded

In [27]:
# Mean ROUGE F-scores over the per-article results of one experiment
def _mean_metric(results_array, key):
    """Return the arithmetic mean of ``entry[key]`` over all entries of ``results_array``.

    Accumulates manually so the helper does not depend on the builtin ``sum``
    still being bound at notebook scope.

    Raises
    ------
    ZeroDivisionError
        If ``results_array`` is empty.
    """
    total = 0.0
    for entry in results_array:
        total += entry[key]
    return total / len(results_array)


def mean_rouge1(results_array):
    """Mean of the "rouge-1-f" values in a list of per-article score dicts."""
    return _mean_metric(results_array, "rouge-1-f")


def mean_rouge2(results_array):
    """Mean of the "rouge-2-f" values in a list of per-article score dicts."""
    return _mean_metric(results_array, "rouge-2-f")


def mean_rougel(results_array):
    """Mean of the "rouge-l" values in a list of per-article score dicts."""
    return _mean_metric(results_array, "rouge-l")

# Add one mean-score column per ROUGE metric, computed row by row from 'results_array'
for column_name, aggregator in (
    ('mean_rouge-1-f', mean_rouge1),
    ('mean_rouge-2-f', mean_rouge2),
    ('mean_rouge-l-f', mean_rougel),
):
    df[column_name] = df['results_array'].apply(aggregator)

In [57]:
# Display the results table, including the mean ROUGE columns
df

Unnamed: 0,damping_factor,max_iter,tol,results_array,mean_rouge-1-f,mean_rouge-2-f,mean_rouge-l-f,num_sentences
0,0.3,200,5.000000e-07,"[{'rouge-1-f': 0.06896551239001224, 'rouge-2-f...",0.230459,0.067785,0.210916,3
1,0.3,200,5.000000e-07,"[{'rouge-1-f': 0.12844036254860716, 'rouge-2-f...",0.236761,0.071117,0.218482,4
2,0.3,200,5.000000e-07,"[{'rouge-1-f': 0.2857142817660694, 'rouge-2-f'...",0.240283,0.072724,0.220802,5
3,0.3,200,5.000000e-07,"[{'rouge-1-f': 0.3918918882103727, 'rouge-2-f'...",0.238853,0.074140,0.220536,6
4,0.3,200,1.000000e-06,"[{'rouge-1-f': 0.06896551239001224, 'rouge-2-f...",0.230459,0.067785,0.210916,3
...,...,...,...,...,...,...,...,...
187,0.9,800,1.000000e-06,"[{'rouge-1-f': 0.3918918882103727, 'rouge-2-f'...",0.243219,0.078162,0.226318,6
188,0.9,800,1.500000e-06,"[{'rouge-1-f': 0.12371133553831455, 'rouge-2-f...",0.243381,0.071869,0.222474,3
189,0.9,800,1.500000e-06,"[{'rouge-1-f': 0.12844036254860716, 'rouge-2-f...",0.242352,0.072861,0.221538,4
190,0.9,800,1.500000e-06,"[{'rouge-1-f': 0.2857142817660694, 'rouge-2-f'...",0.244297,0.076107,0.225354,5


In [None]:
# Example: select one metric for a fixed combination of hyper-parameters
df.loc[(df['max_iter'] == 200) & (df['damping_factor'] == 0.5), 'mean_rouge-1-f']

48    0.232626
49    0.239452
50    0.241557
51    0.237633
52    0.232626
53    0.239452
54    0.241557
55    0.237633
56    0.232626
57    0.239452
58    0.241557
59    0.237633
Name: mean_rouge-1-f, dtype: float64

In [75]:
!pip install pyqt6
%matplotlib qt

Collecting pyqt6
  Downloading pyqt6-6.9.1-cp39-abi3-win_amd64.whl.metadata (2.2 kB)
Collecting PyQt6-sip<14,>=13.8 (from pyqt6)
  Downloading pyqt6_sip-13.10.2-cp310-cp310-win_amd64.whl.metadata (515 bytes)
Collecting PyQt6-Qt6<6.10.0,>=6.9.0 (from pyqt6)
  Downloading pyqt6_qt6-6.9.1-py3-none-win_amd64.whl.metadata (551 bytes)
Downloading pyqt6-6.9.1-cp39-abi3-win_amd64.whl (25.7 MB)
   ---------------------------------------- 0.0/25.7 MB ? eta -:--:--
   - -------------------------------------- 1.0/25.7 MB 7.1 MB/s eta 0:00:04
   ------ --------------------------------- 3.9/25.7 MB 11.2 MB/s eta 0:00:02
   --------- ------------------------------ 6.3/25.7 MB 11.4 MB/s eta 0:00:02
   ------------- -------------------------- 8.7/25.7 MB 11.4 MB/s eta 0:00:02
   ----------------- ---------------------- 11.0/25.7 MB 11.7 MB/s eta 0:00:02
   -------------------- ------------------- 13.4/25.7 MB 11.7 MB/s eta 0:00:02
   ------------------------ --------------- 16.0/25.7 MB 11.7 MB/s eta 0

In [96]:
# Mean ROUGE-1 for tol = 0.5e-6 and 4-sentence summaries across the remaining parameters
df.loc[(df['tol'] == 0.5e-6) & (df['num_sentences'] == 4), 'mean_rouge-1-f']

1      0.236761
13     0.236761
25     0.236761
37     0.236761
49     0.239452
61     0.239452
73     0.239452
85     0.239452
97     0.237748
109    0.237748
121    0.237748
133    0.237748
145    0.242352
157    0.242352
169    0.242352
181    0.242352
Name: mean_rouge-1-f, dtype: float64

In [110]:
# All experiment rows that produced 4-sentence summaries
df.loc[df['num_sentences'] == 4]

Unnamed: 0,damping_factor,max_iter,tol,results_array,mean_rouge-1-f,mean_rouge-2-f,mean_rouge-l-f,num_sentences
1,0.3,200,5e-07,"[{'rouge-1-f': 0.12844036254860716, 'rouge-2-f...",0.236761,0.071117,0.218482,4
5,0.3,200,1e-06,"[{'rouge-1-f': 0.12844036254860716, 'rouge-2-f...",0.236761,0.071117,0.218482,4
9,0.3,200,1.5e-06,"[{'rouge-1-f': 0.12844036254860716, 'rouge-2-f...",0.236761,0.071117,0.218482,4
13,0.3,400,5e-07,"[{'rouge-1-f': 0.12844036254860716, 'rouge-2-f...",0.236761,0.071117,0.218482,4
17,0.3,400,1e-06,"[{'rouge-1-f': 0.12844036254860716, 'rouge-2-f...",0.236761,0.071117,0.218482,4
21,0.3,400,1.5e-06,"[{'rouge-1-f': 0.12844036254860716, 'rouge-2-f...",0.236761,0.071117,0.218482,4
25,0.3,600,5e-07,"[{'rouge-1-f': 0.12844036254860716, 'rouge-2-f...",0.236761,0.071117,0.218482,4
29,0.3,600,1e-06,"[{'rouge-1-f': 0.12844036254860716, 'rouge-2-f...",0.236761,0.071117,0.218482,4
33,0.3,600,1.5e-06,"[{'rouge-1-f': 0.12844036254860716, 'rouge-2-f...",0.236761,0.071117,0.218482,4
37,0.3,800,5e-07,"[{'rouge-1-f': 0.12844036254860716, 'rouge-2-f...",0.236761,0.071117,0.218482,4


In [155]:
def plot_results_3d(df_filtered,param_X_name,param_Y_name,metric_name):
    """Scatter a metric against two parameters in a 3D axes, coloured by the metric.

    Parameters
    ----------
    df_filtered : DataFrame
        Rows to plot; must contain the three named columns.
    param_X_name, param_Y_name : str
        Columns used for the x and y axes.
    metric_name : str
        Column used for the z axis and for the point colour.
    """
    # Pull the three plotted columns out of the DataFrame
    x_values = df_filtered[param_X_name]
    y_values = df_filtered[param_Y_name]
    metric_values = df_filtered[metric_name]

    fig = plt.figure()
    ax = fig.add_subplot(projection="3d")
    points = ax.scatter(x_values, y_values, metric_values,
                        c=metric_values, cmap='viridis')

    ax.set_xlabel(param_X_name)
    ax.set_ylabel(param_Y_name)
    ax.set_zlabel(metric_name)
    # Initial camera: elevation 90 degrees, i.e. looking straight down the z axis,
    # so the colour carries the metric value.
    ax.view_init(elev=90, azim=0)
    fig.colorbar(points, ax=ax, shrink=0.5, aspect=5)
    plt.show()

def plot_results_2d(df_filtered,param_X_name,metric_name):
    """Draw a line plot of one metric column against one parameter column.

    Parameters
    ----------
    df_filtered : DataFrame
        Rows to plot; must contain both named columns.
    param_X_name : str
        Column used for the x axis.
    metric_name : str
        Column used for the y axis.
    """
    x_values = df_filtered[param_X_name]
    metric_values = df_filtered[metric_name]

    fig, ax = plt.subplots()
    ax.plot(x_values, metric_values)
    ax.set_xlabel(param_X_name)
    ax.set_ylabel(metric_name)
    plt.show()
#(& ['damping_factor'] == 0.5) & (df['max_iter'] == 400) & (df['tol'] == 1.0e-06) & (df['num_sentences'] == 4)]


## Observations

In [156]:
# ROUGE-1 over damping_factor x num_sentences, with max_iter = 400 and tol = 1e-6 fixed
fixed_iter_and_tol = (df['max_iter'] == 400) & (df['tol'] == 1.0e-06)
plot_results_3d(df[fixed_iter_and_tol], 'damping_factor', 'num_sentences', 'mean_rouge-1-f')


In [150]:
# ROUGE-1 over damping_factor x tol, with num_sentences = 4 and max_iter = 400 fixed
fixed_length_and_iter = (df['num_sentences'] == 4) & (df['max_iter'] == 400)
plot_results_3d(df[fixed_length_and_iter], 'damping_factor', 'tol', 'mean_rouge-1-f')

In [167]:
# ROUGE-1 over damping_factor x max_iter, with tol = 1e-6 and num_sentences = 4 fixed
fixed_tol_and_length = (df['tol'] == 1.0e-06) & (df['num_sentences'] == 4)
plot_results_3d(df[fixed_tol_and_length], 'damping_factor', 'max_iter', 'mean_rouge-1-f')

In [None]:
# ROUGE-1 over tol x num_sentences. The two plotted parameters must stay free, so the
# filter fixes the other two (damping_factor and max_iter); filtering on tol and
# num_sentences themselves would collapse every point onto a single (x, y) position.
plot_results_3d(df[(df['damping_factor'] == 0.5) & (df['max_iter'] == 400)],'tol','num_sentences','mean_rouge-1-f')

In [171]:
# ROUGE-1 over damping_factor x num_sentences, with max_iter = 400 and tol = 0.5e-6 fixed
fixed_iter_and_tight_tol = (df['max_iter'] == 400) & (df['tol'] == 0.5e-06)
plot_results_3d(df[fixed_iter_and_tight_tol], 'damping_factor', 'num_sentences', 'mean_rouge-1-f')