# Rouge-L score prediction via regression

The goal is to use spaCy to tag the words of each sentence, and to use the resulting part-of-speech tags to find the sentence that best summarizes the text it comes from.

In [None]:
import os

# Move the working directory one level up (presumably to the project root) so
# that the `src` imports and the relative `data/...` paths used below resolve.
# NOTE: not idempotent - re-running this cell moves up one more level.
os.chdir("..")

In [None]:
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso, Ridge, ElasticNet
import torch
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F

from src.metrics import single_rouge_score

In [None]:
from src.load_data import load_data

# Load the three dataset splits as DataFrames (see src/load_data.py).
train_df, validation_df, test_df = load_data()
# Last expression of the cell: displays summary statistics of the training split.
train_df.describe()

In [None]:
# Running totals of the tags seen in "relevant" (best-scoring) sentences and
# in all the other ("irrelevant") ones, kept so that the per-tag counts of the
# two groups can be balanced against each other.
relevant_tag_count = irrelevant_tag_count = 0

# Per-tag occurrence counts for each of the two groups.
relevant_tag_counts: dict[str, int] = dict()
irrelevant_tag_counts: dict[str, int] = dict()


def text_to_sentences(text: str) -> list[str]:
    """Split *text* on "." and strip surrounding whitespace from each piece.

    Empty pieces are kept, so a text ending with a period yields a trailing
    empty string and the result always contains at least one element.
    """
    pieces = text.split(".")
    return list(map(str.strip, pieces))

## SpaCy POS tagging

Using SpaCy to parse the text and identify the parts of speech in the text. The parts of speech are then used to identify the most important words in the text.

ISSUE: this approach takes too long to run.

In [None]:
# One-time setup: uncomment the next line to download the spaCy pipeline.
#!python -m spacy download fr_dep_news_trf

import spacy

# Load the pipeline once at module level; extract_tags() below reuses it for
# every call.
tagger = spacy.load("fr_dep_news_trf")


def extract_tags(text: str, counter: dict[str, int]) -> int:
    """Tally the POS tag of every token of *text* into *counter*.

    The text is parsed with the module-level spaCy ``tagger``. *counter* is
    updated in place, keyed by each token's ``pos_`` label.

    Returns the number of tokens in the parsed text.
    """
    doc = tagger(text)
    for token in doc:
        label = token.pos_
        counter[label] = counter.get(label, 0) + 1
    return len(doc)

For every text–target pair, we identify the sentence with the best Rouge-L score relative to the target, and we count, for each tag, how often it appears in the best sentences versus the other ones.

In [None]:
nrows = train_df.shape[0]

for _, (text, target) in tqdm(train_df.iterrows(), total=nrows):
    sentences = text_to_sentences(text)

    # Score every sentence of the text against the target.
    rouge_scores = []
    for sentence in sentences:
        rouge_scores.append(single_rouge_score(target, sentence))

    # The first sentence reaching the highest score is the "relevant" one.
    best_sentence_index = max(
        range(len(rouge_scores)), key=rouge_scores.__getitem__
    )

    # Tally the tags of each sentence into the group it belongs to.
    for i, sentence in enumerate(sentences):
        if i != best_sentence_index:
            irrelevant_tag_count += extract_tags(sentence,
                                                 irrelevant_tag_counts)
            continue
        relevant_tag_count += extract_tags(sentence, relevant_tag_counts)

# Not viable: takes too long!

## Sentence & Paragraph Embeddings

We embed paragraphs and sentences using pretrained models. We then use a regressor from `scikit-learn` to predict the Rouge-L score of each sentence, and thus pick the best summarizing one using the max Rouge-L score.

In [None]:
from sentence_transformers import SentenceTransformer

# Pretrained sentence-embedding model; load_or_compute() below calls its
# encode() method through this global name.
# NOTE: `model` is re-bound to the ScoreNN network in a later cell, so any
# embeddings must be computed (or loaded from cache) before that cell runs.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
def load_or_compute(name: str, df: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]:
    """Return per-sentence features and Rouge-L targets for *df*, cached on disk.

    For every (text, target) row of *df* the text is split into sentences.
    Each sentence yields one feature row - its embedding concatenated with the
    embedding of the whole text - and one score, ``single_rouge_score(target,
    sentence)``. Rows are emitted in dataframe order, sentence by sentence.

    Args:
        name: Cache key; results are stored in ``data/{name}_embeddings.npy``
            and ``data/{name}_scores.npy``.
        df: DataFrame whose rows unpack to ``(text, target)``.

    Returns:
        ``(embeddings, scores)``: one embeddings row and one score per sentence.
        When both cache files exist they are loaded and *df* is not read.
    """
    embeddings_filename = f"data/{name}_embeddings.npy"
    scores_filename = f"data/{name}_scores.npy"

    if os.path.exists(embeddings_filename) and os.path.exists(scores_filename):
        return np.load(embeddings_filename), np.load(scores_filename)

    # Cache miss: compute everything, then save it.
    scores: list[float] = []
    final_embeddings: list[np.ndarray] = []

    # Iterate over the dataframe that was passed in. Reading the global
    # `train_df` here would make every split (e.g. "valid") silently contain
    # the training data and misalign with the dataframe used downstream.
    # NOTE: cache files produced that way must be deleted to be recomputed.
    for _, (text, target) in tqdm(df.iterrows(), total=df.shape[0]):

        sentences = text_to_sentences(text)

        # Rouge-L score of each sentence relative to the target
        scores.extend(
            single_rouge_score(target, sentence) for sentence in sentences
        )

        # One embedding per sentence, plus one for the whole text
        sentence_embeddings = np.array(model.encode(sentences))
        paragraph_embedding = np.array(model.encode(text))

        # Pair every sentence embedding with the embedding of its paragraph
        repeated_paragraph = np.tile(paragraph_embedding, (len(sentences), 1))
        embeddings = np.concatenate(
            (sentence_embeddings, repeated_paragraph), axis=1)

        final_embeddings.extend(embeddings)

    # Convert once so that what is saved is exactly what is returned.
    embeddings_array = np.array(final_embeddings)
    scores_array = np.array(scores)

    np.save(embeddings_filename, embeddings_array)
    np.save(scores_filename, scores_array)
    return embeddings_array, scores_array

In [None]:
# Per-sentence features and scores for the "train" cache key, loaded from
# data/train_embeddings.npy / data/train_scores.npy when both files exist.
train_embed, train_scores = load_or_compute("train", train_df)

In [None]:
# Same call with the validation DataFrame and the "valid" cache key
# (data/valid_embeddings.npy / data/valid_scores.npy).
valid_embed, valid_scores = load_or_compute("valid", validation_df)

In [None]:
# Visualize shapes: each embeddings array is expected to have one row per
# entry of the matching scores array (one per sentence).
print(train_embed.shape, train_scores.shape)
print(valid_embed.shape, valid_scores.shape)

In [None]:
# Try different regression models. All three are linear and differ in their
# penalty: Ridge is L2, Lasso is L1, and ElasticNet mixes both, with
# `l1_ratio` setting the L1/L2 balance.
ridge_reg = Ridge(alpha=1.0)
lasso_reg = Lasso(alpha=1.0)
elastic_reg = ElasticNet(alpha=1.0, l1_ratio=0.5)

In [None]:
# Fit on the training sentences, then print score() on the validation arrays
# (for scikit-learn regressors this is the R^2 coefficient of determination).
ridge_reg.fit(train_embed, train_scores)
print(ridge_reg.score(valid_embed, valid_scores))

In [None]:
# Fit on the training sentences, then print score() on the validation arrays
# (for scikit-learn regressors this is the R^2 coefficient of determination).
lasso_reg.fit(train_embed, train_scores)
print(lasso_reg.score(valid_embed, valid_scores))

In [None]:
# Fit on the training sentences, then print score() on the validation arrays
# (for scikit-learn regressors this is the R^2 coefficient of determination).
elastic_reg.fit(train_embed, train_scores)
print(elastic_reg.score(valid_embed, valid_scores))

We can then test one of the regressors on the validation data in order to pick a sentence for each paragraph.

In [None]:
def pick_sentences(embeddings: np.ndarray, df: pd.DataFrame, regressor=None) -> list[str]:
    """Pick, for each text of *df*, the sentence with the highest predicted score.

    *embeddings* must hold one row per sentence, in the order obtained by
    walking *df* row by row and splitting each text with
    ``text_to_sentences``. Rows are consumed span by span.

    Args:
        embeddings: Precomputed per-sentence feature rows for *df*.
        df: DataFrame whose first column is the text to summarize.
        regressor: Fitted model exposing ``predict``; defaults to the
            module-level ``ridge_reg`` so existing two-argument calls behave
            as before.

    Returns:
        One picked sentence per row of *df*, in dataframe order.

    Raises:
        ValueError: If *embeddings* has fewer rows than *df* has sentences,
            i.e. the two inputs are not aligned.
    """
    if regressor is None:
        regressor = ridge_reg

    best_sentences: list[str] = []

    # [start, stop) is the span of embedding rows of the current text
    start = 0

    for _, (text, *_) in tqdm(df.iterrows(), total=df.shape[0]):

        sentences = text_to_sentences(text)
        stop = start + len(sentences)

        # A short slice would silently restrict the argmax to the leading
        # sentences, so fail loudly on misaligned inputs instead.
        if stop > len(embeddings):
            raise ValueError(
                f"embeddings has {len(embeddings)} rows but at least {stop} "
                "are needed: embeddings and df are not aligned"
            )

        predicted_scores = regressor.predict(embeddings[start:stop])
        best_sentence_index = int(np.argmax(predicted_scores))
        best_sentences.append(sentences[best_sentence_index])

        start = stop  # Move the pointer to the next span

    return best_sentences


def avg_score(sentences: list[str], targets: list[str]) -> float:
    """Return the mean Rouge-L score of *sentences* against their *targets*.

    The two lists are paired position by position; pairing stops at the end
    of the shorter list.
    """
    pair_scores = list(map(single_rouge_score, targets, sentences))
    return float(np.mean(pair_scores))

In [None]:
# Pick one sentence per validation text, then average the Rouge-L score of
# the picks against the reference titles (displayed as the cell output).
picked_sentences = pick_sentences(valid_embed, validation_df)
avg_score(picked_sentences, validation_df["titles"].tolist())

The best regressor gives an average Rouge-L score of 0.1157 on the validation set, which is not very good.

### Regression using a neural network

Instead, we will use a neural network with fully connected layers to predict the Rouge-L score of each sentence relative to its paragraph's target.

In [None]:
class ScoreNN(nn.Module):
    """Rouge-L predictor: a fully connected regressor with one hidden layer.

    Args:
        input_size: Number of features per input row.
        hidden_size: Width of the hidden layer.
        dropout: Probability of zeroing a hidden activation while training.
    """

    def __init__(self, input_size: int, hidden_size: int, dropout: float = 0.5) -> None:
        super().__init__()

        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)
        # Plain float, not a parameter: saved state dicts keep the same keys.
        self.dropout_p = dropout

    def forward(self, x: Tensor) -> Tensor:
        """Map rows of ``input_size`` features to one predicted score each."""
        x = F.relu(self.fc1(x))
        # Avoid overfitting. F.dropout defaults to training=True, so the
        # module's own flag must be forwarded; otherwise activations are still
        # dropped at random after model.eval(), making predictions noisy.
        x = F.dropout(x, p=self.dropout_p, training=self.training)
        return self.fc2(x)

In [None]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The input width is the number of columns of train_embed.
# NOTE: this re-binds `model`, which until now referred to the
# SentenceTransformer used inside load_or_compute(); that function can no
# longer compute new embeddings once this cell has run.
model = ScoreNN(input_size=train_embed.shape[1], hidden_size=256).to(device)

In [None]:
# Prepare training
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Convert numpy arrays to torch tensors
train_embed_t = torch.from_numpy(train_embed).float().to(device)
train_scores_t = torch.from_numpy(train_scores).float().to(device)

valid_embed_t = torch.from_numpy(valid_embed).float().to(device)
valid_scores_t = torch.from_numpy(valid_scores).float().to(device)

# Training loop
epochs = 100
batch_size = 10_000

n_train = train_embed_t.size(0)
n_valid = valid_embed_t.size(0)

for epoch in range(epochs):

    # Validation below switches to eval mode, so re-enable training mode
    # at the start of every epoch.
    model.train()

    # Shuffle the data (indices live on the same device as the tensors)
    indices = torch.randperm(n_train, device=device)

    train_loss = 0.0
    for i in tqdm(range(0, n_train, batch_size)):
        batch_indices = indices[i: i + batch_size]

        # Forward pass; squeeze only the feature axis so that a batch holding
        # a single row keeps its batch dimension.
        outputs = model(train_embed_t[batch_indices]).squeeze(-1)

        # Compute loss
        loss = criterion(outputs, train_scores_t[batch_indices])

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight each batch mean by its size to obtain a per-sample average
        train_loss += loss.item() * batch_indices.numel()
    train_loss /= max(n_train, 1)

    # Validation loss, computed batch by batch and without autograd: building
    # the graph for the whole validation set at once uses too much memory.
    # Eval mode disables train-only behaviour that follows `model.training`.
    model.eval()
    valid_loss = 0.0
    with torch.no_grad():
        for i in range(0, n_valid, batch_size):
            valid_targets = valid_scores_t[i: i + batch_size]
            valid_outputs = model(valid_embed_t[i: i + batch_size]).squeeze(-1)
            valid_loss += (
                criterion(valid_outputs, valid_targets).item()
                * valid_targets.numel()
            )
    # Per-sample mean, so the value does not depend on the number of batches
    valid_loss /= max(n_valid, 1)

    print(f"Epoch {epoch}, Loss: {train_loss}, Validation loss: {valid_loss}")
    if epoch % 10 == 0:
        # Save the model
        torch.save(model.state_dict(), "data/score_nn.pth")

# The periodic checkpoint above last fires before the final epoch, so persist
# the fully trained weights as well.
torch.save(model.state_dict(), "data/score_nn.pth")

We can then use the same strategy as before to pick the best summarizing sentence.

In [None]:

nrows = validation_df.shape[0]
best_sentences: list[str] = []

# Embedding span pointers
start = 0

# Inference must run in eval mode, so that train-only behaviour which follows
# `model.training` (such as dropout) does not randomise the predictions.
model.eval()

with torch.no_grad():
    for _, (text, *_) in tqdm(validation_df.iterrows(), total=nrows):

        # Extract sentences
        sentences = text_to_sentences(text)

        # Get embeddings for the current sentences
        sent_embeddings = valid_embed_t[start: start + len(sentences)]

        # Predict the best sentence. squeeze(-1) drops only the output axis,
        # so a single-sentence text still yields a 1-D score vector.
        predicted_scores = model(sent_embeddings).squeeze(-1).cpu().numpy()
        best_sentence_index = int(np.argmax(predicted_scores))

        best_sentences.append(sentences[best_sentence_index])

        start += len(sentences)  # Move the pointer to the next span

print(len(best_sentences))

In [None]:
# Average Rouge-L score of the sentences picked by the network against the
# reference titles (displayed as the cell output).
avg_score(best_sentences, validation_df["titles"].tolist())

Yet again, the score is not that great. It could be improved by using different embeddings and a larger model.