
# Libraries

In [None]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
nltk.download('punkt') # one time execution

from sklearn.metrics.pairwise import cosine_similarity


import networkx as nx

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Extracting the word embeddings

- glove 100d

In [None]:
# Download the GloVe 6B archive and unpack it into the working directory.
# The embedding file opened later in this notebook (glove.6B.100d.txt) is
# presumably one of the files extracted here - confirm against the archive.
! wget http://nlp.stanford.edu/data/glove.6B.zip

! unzip glove*.zip

--2023-12-20 19:27:35--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-12-20 19:27:35--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-12-20 19:27:36--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
# Extract word vectors: map every token of the GloVe file to its embedding,
# stored as a float32 NumPy array.
word_embeddings = {}
# The context manager guarantees the file is closed even if a line fails to parse.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        # First field is the token, the remaining fields are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

# Data

In [None]:
# Mount Google Drive at /content/drive; this cell relies on the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Folder that contains the dataset (placeholder value, written without a trailing slash).
filepath = '...'

# Build the full CSV path first, then read it into the working DataFrame.
csv_path = filepath + '/dati_TM.csv'
df = pd.read_csv(csv_path)

# Preview the first rows.
df.head()

Unnamed: 0,id,article,highlights
0,c93a25175d42553bd05e8a61bae35abb23368d05,"Bucharest, Romania (CNN) -- Romania's leftist ...",Leftist government files a motion with parliam...
1,8090057ca58ec0c7a752f0ac4fdd74954e5a6aa5,"LONDON, England (CNN) -- London's newest hotel...",Entrepreneur braves recession to open rock'n'r...
2,7ba3aa10fa2d1623ac68cdd1bf2f3fde94a6e860,Los Angeles (CNN) -- Actress Lindsay Lohan sho...,Lohan could have waited until June 17 to start...
3,b1bf2c63663503ca0951a08dc84cd97a593d857d,By . Associated Press . PUBLISHED: . 11:30 EST...,Gunman killed 20 first-graders and six teachin...
4,c5cd24a690fb6e97f836a72cd36277bbdc172aca,"(CNN) -- Finally, fans of the world's most fam...","'Harry Potter' books finally come to Kindle, N..."


# Extractive summarization with **text rank**

To perform the TextRank algorithm, a set of functions was designed to:

1. split the articles into sentences
2. remove the stop words and create an intermediate representation
3. compute the similarity matrix between sentences
4. run the **PageRank algorithm** to rank each sentence
5. select the sentences that will make up the final summary

0. Load the stopwords and define a function to remove them

In [None]:
# Stopwords: download the NLTK stopword corpus, then load the English entries.
nltk.download('stopwords')# one time execution

# Note: `stopwords` is already imported in the first code cell of this notebook.
from nltk.corpus import stopwords
# Module-level stopword collection consulted by remove_stopwords().
stop_words = stopwords.words('english')


# Stopword filter used during sentence preprocessing.
def remove_stopwords(sentence):
  """Drop stopwords from a tokenized sentence and rebuild it as a string.

  Parameters
  ----------
  sentence : iterable of str
      The tokens of one sentence.

  Returns
  -------
  str
      The tokens that are not in the module-level `stop_words`, in their
      original order, joined by single spaces.
  """
  kept_tokens = []
  for token in sentence:
    if token not in stop_words:
      kept_tokens.append(token)
  return " ".join(kept_tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


1. Design the function `prepare_article(text)` which, starting from a single article:

  - it performs the sentence tokenization;
  - replaces every character of each sentence that is not an uppercase letter (A-Z) or a lowercase letter (a-z) with a space (" "), so punctuation, digits and special characters are removed;
  - Lowercase each word;
  - Removes the stopwords.

  And outputs two lists of sentences:

  1. The unprocessed sentences (just tokenized); this list is used, once the sentence ranking has been computed, to compose the final summary of the article.

  2. The processed sentences, which will be used in the next step for the intermediate representation.

In [None]:
def prepare_article(text):
  """Split an article into sentences and build a cleaned copy of each one.

  Parameters
  ----------
  text : str
      The raw article.

  Returns
  -------
  tuple (list of str, list of str)
      - the sentences exactly as produced by `sent_tokenize`;
      - the same sentences, position by position, with every non-letter
        character replaced by a space, lowercased, and with the stopwords
        removed (a sentence may end up as an empty string).
  """
  # Sentence tokenization
  sentences = sent_tokenize(text)

  # 1_Remove punctuation, numbers and special characters.
  # dtype=object keeps the .str accessor available even when the article has
  # no sentences (an empty Series may otherwise default to a float dtype).
  letters_only = pd.Series(sentences, dtype=object).str.replace("[^a-zA-Z]", " ", regex=True)

  # 2_Casefolding
  lowered = [sentence.lower() for sentence in letters_only]

  # 3_Stopwords removal (remove_stopwords expects a list of tokens)
  cleaned = [remove_stopwords(sentence.split()) for sentence in lowered]

  # Output: the unprocessed sentences list + the processed sentences list
  return sentences, cleaned

2. Compute the intermediate representation, which consists of computing the similarity matrix between sentences.

- This is done by computing the cosine-similarity between vector sentences, obtained through the word embeddings (GLOVE) representation.

- Each word in the sentence is associated with its GLOVE 100d word embedding.

- The final vector of a sentence is computed as the mean of the word embeddings of all the words in the sentence.

Once obtained for each sentence its embedding, the cosine similarity between sentences is computed filling the final similarity matrix.

In [None]:
def similarity_matrix(cleaned_sentences, embedding_dim=100):
  """Build the pairwise cosine-similarity matrix of a list of sentences.

  Each sentence is represented by the (approximate) mean of the embeddings
  of its words, looked up in the module-level `word_embeddings` dictionary.
  Words without an embedding contribute a zero vector; sentences without
  any word are represented by a zero vector.

  Parameters
  ----------
  cleaned_sentences : list of str
      Preprocessed sentences, words separated by whitespace.
  embedding_dim : int, optional
      Length of the vectors stored in `word_embeddings` (default 100).
      It must match the actual embedding length.

  Returns
  -------
  numpy.ndarray
      Square matrix of shape (n, n), n = len(cleaned_sentences). Entry
      [i, j] is the cosine similarity between sentences i and j; the
      diagonal is zero.
  """
  n_sentences = len(cleaned_sentences)
  if n_sentences == 0:
    # Nothing to compare: return an empty square matrix.
    return np.zeros((0, 0))

  # Vector representation: one row per sentence, zero by default.
  sentence_vectors = np.zeros((n_sentences, embedding_dim))
  unknown_word = np.zeros((embedding_dim,))
  for row, sentence in enumerate(cleaned_sentences):
    words = sentence.split()
    if words:
      # Mean of the word embeddings; the 0.001 added to the word count is a
      # small smoothing term and does not change the direction of the vector.
      sentence_vectors[row] = sum(word_embeddings.get(w, unknown_word) for w in words) / (len(words) + 0.001)

  # Similarity matrix: all pairs in a single vectorised call.
  M = cosine_similarity(sentence_vectors)

  # A sentence is not compared with itself.
  np.fill_diagonal(M, 0.0)

  return M


These two functions can be combined in the final function `generate_summary(text, n_sentences)`, which takes an article as input and outputs its summary. The function can be summed up in the following steps:

1. Sentence tokenization + text processing.
2. Vector representation + similarity across sentences.
3. Graph representation + ranking of the sentences through the PageRank algorithm.
4. Sort the sentences according to the computed score.
5. Generate the summary by selecting the top `n_sentences`.



In [None]:
def generate_summary(text, n_sentences=3):
    """Build an extractive summary of an article with TextRank.

    Parameters
    ----------
    text : str
        The article to summarise.
    n_sentences : int, optional
        Maximum number of sentences in the summary (default 3). Articles
        with fewer sentences are returned in full, in ranking order.

    Returns
    -------
    str
        The top-ranked sentences, highest score first, separated by a single
        space. An empty string when the article has no sentences or when
        `n_sentences` is not positive.
    """
    # 1 Sentence_tokenization + text_processing
    sentences, processed_sentences = prepare_article(text)
    if not sentences or n_sentences <= 0:
        return ""

    # 2 Vector representation + similarity across sentences
    M = similarity_matrix(processed_sentences)

    # 3 Rank the sentences: nodes are sentence positions, edge weights are similarities
    nx_graph = nx.DiGraph(M)
    scores = nx.pagerank(nx_graph)

    # 4 Sort the sentences by decreasing score
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    # 5 Generate the summary; slicing copes with articles shorter than n_sentences
    top_sentences = [sentence for _, sentence in ranked_sentences[:n_sentences]]

    # Join with a space so consecutive sentences do not run into each other.
    return " ".join(top_sentences)

# An example:

In [None]:
# Take the first row of the dataset and summarise its article with three sentences.
text = df.iloc[0]
generate_summary(text["article"], n_sentences = 3)

'The coalition government has come under criticism recently for ordering that Romania\'s Cultural Institute report not to the president, as it had been doing, but to the Senate.The country\'s civil society and anti-corruption agencies addressed on Wednesday a letter to the president of the European Commission, Jose Manuel Barroso, expressing concern over this "unprecedented attack" on the rule of law.Ponta received Parliament\'s vote to go to Brussels on behalf of the country, but the Constitutional Court ruled that president Basescu had the right to attend the event.'

In [None]:
# Ground truth: the reference highlights stored in the same row as the example article.
text.highlights

'Leftist government files a motion with parliament to suspend the president .\nThe motion marks the latest in a series of attacks between Basescu and Prime Minister Ponta .\nThe vote, scheduled for Friday, is expected to pass .'

# Applying the algorithm to the whole dataset

In [None]:
# "articoli" is Italian for "articles": the article column of the dataset.
articoli = df.article

In [None]:
# "sommari" is Italian for "summaries": one summary per article, using the default n_sentences.
sommari = [generate_summary(articolo) for articolo in articoli]

Save the results in a DataFrame and export them to a CSV file

In [None]:
import os

# Destination folder for the exported results (placeholder value).
filepath='...'

# One row per article: original text, reference highlights and generated summary.
df_final = pd.DataFrame({ "articles":df["article"], "ground_truth":df["highlights"], "summary_results": sommari })

# os.path.join inserts the path separator when the folder has no trailing slash,
# so the file lands inside `filepath` rather than next to it under a fused name.
df_final.to_csv(os.path.join(filepath, "extractive_summarization_complete.csv"), index=False)