<a href="https://colab.research.google.com/github/ThisDavidAdams/MMR-summarization/blob/main/WCEP_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/gdrive so that later cells can read from and
# write to paths under that directory.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Clone the wcep-mds-dataset repository and make it the working directory;
# later cells use repo-relative paths (experiments/requirements.txt, experiments.utils).
!git clone https://github.com/gandharvsuri/wcep-mds-dataset
%cd wcep-mds-dataset

In [None]:
# Install the cloned repo's experiment requirements and download NLTK's punkt data
# (presumably what word_tokenize in the preprocessing cell relies on — confirm).
!pip install -r experiments/requirements.txt
!python -m nltk.downloader punkt

In [None]:
!mkdir WCEP
!gdown https://drive.google.com/uc?id=1qsd5pOCpeSXsaqNobXCrcAzhcjtG1wA1 -O WCEP/test.jsonl.gz

In [None]:
import experiments.utils as utils

# Read every record of the gzipped JSONL test split into an in-memory list.
test_data = list(utils.read_jsonl_gz('WCEP/test.jsonl.gz'))
# partial_test_data = test_data[:10] # for experimenting
# Sanity check: how many clusters were loaded and which keys the first one has.
print("Number of clusters:",len(test_data))
print(test_data[0].keys())

In [None]:
# Pin gensim to 3.8.3.
# NOTE(review): process_doc2vec_similarity reads `doc2vec_model.wv.vocab`, which
# looks like a gensim 3.x-era attribute — confirm before changing this pin.
!pip install gensim==3.8.3

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.doc2vec import Doc2Vec
import string 

# Load the pretrained Doc2Vec model from the mounted Drive. It is read as a
# global by process_doc2vec_similarity, so this cell must run before that function is called.
doc2vec_model = Doc2Vec.load("/content/gdrive/MyDrive/ULETH/doc2vec/doc2vec.bin")

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Shared WordNet lemmatizer; preprocess() uses it as a global.
lemmatizer = WordNetLemmatizer()
# Download the NLTK corpora used by the stop-word list and the lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')
# Module-level English stop-word list. Note that preprocess() builds its own
# stop-word set on every call, so it does not read this variable.
stop_words = stopwords.words('english')

def preprocess(text):
    """Turn raw text into a list of normalised word tokens.

    Steps, in the order they are applied:
      1. Lowercase the text.
      2. Tokenize with NLTK's ``word_tokenize``.
      3. Drop tokens of length 1 or less.
      4. Drop English stop words.
      5. Drop tokens made up entirely of punctuation characters.
      6. Lemmatize the survivors with the module-level WordNet ``lemmatizer``.
         Lemmatizing (rather than stemming) is the original author's choice,
         made to preserve word structure and avoid clashes with potential acronyms.

    Args:
        text: The string to process.

    Returns:
        A list of lemmatized token strings, in their original order.
    """
    lowered = text.lower()

    english_stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)

    words = []
    for token in word_tokenize(lowered):
        if len(token) <= 1 or token in english_stop_words:
            continue
        # Test character by character. `token in string.punctuation` is a
        # substring test, so multi-character punctuation tokens such as "``",
        # "''", "..." or "--" would otherwise be kept as if they were words.
        if all(char in punctuation_chars for char in token):
            continue
        words.append(lemmatizer.lemmatize(token))

    return words


In [None]:
def process_doc2vec_similarity(document, base_document):
    """Return the cosine similarity of two texts in doc2vec space.

    Each text is run through ``preprocess``, reduced to the tokens present in
    the global ``doc2vec_model``'s vocabulary, and turned into a vector with
    ``infer_vector``. The base document is embedded first, then ``document``.

    Per the original author's notes, the pretrained models come from jhlau's
    public repository (https://github.com/jhlau/doc2vec), and the enwiki_dbow
    model has a vocabulary of 669549 entries.

    Args:
        document: Text to compare.
        base_document: Text to compare against.

    Returns:
        The single cosine-similarity value between the two inferred vectors.
    """
    known_words = doc2vec_model.wv.vocab.keys()

    def embed(raw_text):
        # Only words the pretrained model knows about are passed to inference.
        in_vocabulary = [tok for tok in preprocess(raw_text) if tok in known_words]
        return doc2vec_model.infer_vector(in_vocabulary)

    base_vector = embed(base_document)
    vector = embed(document)

    return cosine_similarity([base_vector], [vector]).flatten()[0]

In [None]:
def compute_maximal_marginal_relevance(candidate_list, query, number_of_sentences=12, lambda_constant=0.75,
                                       sim=None):
    """Select candidates by Maximal Marginal Relevance (MMR).

    Written for WCEP data: every candidate must be a dict with a ``"text"`` key.

    The first pick is the candidate most similar to ``query``. Each later pick
    maximises::

        lambda_constant * sim(candidate_text, query)
            - max over already-picked s of (1 - lambda_constant) * sim(s_text, candidate_text)

    Ties go to the candidate that appears earliest in ``candidate_list``.

    Args:
        candidate_list: List of candidate dicts. It is NOT modified; selection
            works on a private copy.
        query: Text the candidates are scored against.
        number_of_sentences: Maximum number of candidates to return.
        lambda_constant: Weight of relevance versus redundancy.
        sim: Callable ``sim(text_a, text_b) -> number``. When ``None`` (the
            default) ``process_doc2vec_similarity`` is looked up at call time,
            so re-running the cell that defines it takes effect here without
            re-running this cell.

    Returns:
        A list of at most ``number_of_sentences`` candidate dicts (the same
        objects as in ``candidate_list``), in selection order. Empty when
        ``candidate_list`` is empty or ``number_of_sentences`` is not positive.
    """
    if sim is None:
        sim = process_doc2vec_similarity

    if not candidate_list or number_of_sentences <= 0:
        return []

    # Private copy so the caller's list is left untouched.
    remaining = list(candidate_list)

    # Relevance to the query does not change between rounds, so each
    # candidate's score is computed once and reused.
    relevance = [sim(article["text"], query) for article in remaining]

    # Seed with the most relevant candidate. Strict ">" keeps the earliest
    # candidate on ties, and index 0 is the fallback if nothing compares greater.
    best_index = 0
    best_score = float("-inf")
    for index, score in enumerate(relevance):
        if score > best_score:
            best_index, best_score = index, score

    selected = [remaining.pop(best_index)]
    relevance.pop(best_index)

    # redundancy[i] is the largest weighted similarity between remaining[i]
    # and anything selected so far; it is extended one selection at a time.
    redundancy = [float("-inf")] * len(remaining)

    while remaining and len(selected) < number_of_sentences:
        newest_text = selected[-1]["text"]
        best_index = None
        best_score = float("-inf")

        for index, article in enumerate(remaining):
            penalty = (1 - lambda_constant) * sim(newest_text, article["text"])
            if penalty > redundancy[index]:
                redundancy[index] = penalty

            marginal_relevance = lambda_constant * relevance[index] - redundancy[index]
            if marginal_relevance > best_score:
                best_index, best_score = index, marginal_relevance

        if best_index is None:
            # No candidate produced a comparable score; nothing more to pick.
            break

        selected.append(remaining.pop(best_index))
        relevance.pop(best_index)
        redundancy.pop(best_index)

    return selected

In [None]:
from tqdm import tqdm
# Only the first 500 clusters are processed and written out.
test_data = test_data[:500]

for cluster in tqdm(test_data):
    reference_summary = cluster["summary"]

    # Score each article against the cluster summary; the score is stored on
    # the article as a string.
    for article in cluster["articles"]:
        score = process_doc2vec_similarity(article["text"], reference_summary)
        article["doc2vec_sim_score"] = str(score)

    # Keep the 30 highest-scoring articles, best first.
    ranked_articles = sorted(
        cluster["articles"],
        key=lambda item: float(item["doc2vec_sim_score"]),
        reverse=True,
    )
    cluster["articles"] = ranked_articles[:30]

    # Replace the shortlist with the MMR selection made from it.
    cluster["articles"] = compute_maximal_marginal_relevance(cluster["articles"], reference_summary)

# Persist the processed clusters to the mounted Drive.
utils.write_jsonl(test_data, "/content/gdrive/MyDrive/ULETH/test_data.jsonl")