# **Text Summarization for Extractive model- TextRank.**

**Import Necessary Packages**

In [12]:
!pip install datasets
!pip install rouge
!pip install rouge-score



**Import all Libraries**

In [13]:
import torch
import nltk
import numpy as np
import networkx as nx
import re
import string
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from datasets import load_dataset
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from rouge import Rouge
from nltk.stem import WordNetLemmatizer
from logging import logThreads
from sklearn.metrics.pairwise import cosine_similarity

Data Set: Reddit_tifu

In [14]:
df = load_dataset("reddit_tifu", "long")

features = df["train"].features

for x, y in features.items():
    print(f"{x}: {y.dtype}")

ups: float32
num_comments: float32
upvote_ratio: float32
score: float32
documents: string
tldr: string
title: string


In [15]:
df = df.map(lambda element: {'summary': element['tldr'], 'text': element['documents']})
df = df.remove_columns(["ups", "upvote_ratio", "num_comments", "score", "title", "tldr", "documents"])

In [16]:
print(df["train"][:3])

{'summary': ['confuse a 5th grade girl for a boy in front of half of her class. kids are mean. sorry sandra.**', 'i found my estranged dad, thought i loved him after getting to know him, got to know him better and changed my mind.', 'had my balls burned by sauron and was left deveeted.'], 'text': ['this actually happened a couple of years ago. i grew up in germany where i went to a german secondary school that went from 5th to 13th grade (we still had 13 grades then, they have since changed that). my school was named after anne frank and we had a club that i was very active in from 9th grade on, which was dedicated to teaching incoming 5th graders about anne franks life, discrimination, anti-semitism, hitler, the third reich and that whole spiel. basically a day where the students\' classes are cancelled and instead we give them an interactive history and social studies class with lots of activities and games. \n\nthis was my last year at school and i already had a lot of experience do

**Downloading Stop Words**

In [17]:
nltk.download("punkt")
nltk.download("stopwords")
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
# Below code is to initialize lemmatizer and also the stop words
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')

# Below code is to tokenize input text into sentences by NLTK's sent_tokenize.
# And returns list of sentences.
def gen_sentences(input):
    return nltk.sent_tokenize(input)

# Below code is to clean and lemmatize a list of sentences.
# And returns list of sentences which are cleaned.
def clean_sentences(input):
  cs = []
  for sentence in input:
    sen = sentence.lower()
    sen = re.sub(r'http\S+|www\S+|https\S+', '', sen)
    sen = re.sub(r'\([^)]*\)', '', sen)
    sen = re.sub('"','', sen)
    sen = re.sub("[^a-zA-Z]", " ", sen)
    cs.append(lemmatizer.lemmatize(sen))
  return cs

# Below code is to stop words from a sentence
def delete_stopwords(input):
    sen_new = " ".join([i for i in input if i not in stop_words])
    return sen_new

**Downloading GloVe and unzipping the word embeddings dataset from Stanford.**

In [19]:
! wget http://nlp.stanford.edu/data/glove.6B.zip
! unzip glove*.zip

--2023-08-05 20:43:14--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-08-05 20:43:14--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-08-05 20:43:15--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [20]:
# Below code is to store word embeddings
word_embeddings = {}

# Below code is to open the GloVe word embeddings file for reading
with open('glove.6B.100d.txt', encoding='utf-8') as file:
    for line in file:
        parts = line.split()
        word, vector = parts[0], np.asarray(parts[1:], dtype='float32')
        word_embeddings[word] = vector

In [21]:
# Below code is for text summary.
def text_summary(text, summary_percentage):
    sen = gen_sentences(text)
    c_sen = clean_sentences(sen)
    c_sen = [delete_stopwords(r.split()) for r in c_sen]

    sentence_vectors = []
    for sentence in c_sen:
        words = sentence.split()
        vector_sum = np.zeros((100,))

        if len(words) > 0:
            for word in words:
                vector_sum += word_embeddings.get(word, np.zeros((100,)))

            avg_vector = vector_sum / len(words)
        else:
            avg_vector = vector_sum

        sentence_vectors.append(avg_vector)

    len_sen = len(sen)
    matrix = np.zeros([len_sen, len_sen])

# Below code is to calculate cosine similarity scores between the sentence vectors
    for i in range(len_sen):
      for j in range(len_sen):
        if i != j:
          sc = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]
          matrix[i][j] = sc

    graph_matrix = nx.from_numpy_array(matrix)
    sentence_scores = nx.pagerank(graph_matrix)

    ranked_sentences = sorted(((sentence_scores[i],s) for i,s in enumerate(sen)), reverse=True)

    num_sentences = max(1, int(summary_percentage * len(ranked_sentences)))

    top_sentences = [sentence for _, sentence in ranked_sentences[:num_sentences]]

    return " ".join(top_sentences)

In [22]:
# In below code we are initializing empty lists to store predicted and reference summaries
predicted_summaries = []
ref_summaries = []

# Below code is to calculate predicted and reference summaries
def calculation(df, num_rows, summary_percentage):
    predicted_summaries = []
    ref_summaries = []
# Below code is to iterate over each row in the DataFrame
    for i in range(num_rows):
        ref_summaries.append(df['summary'][i])

        p_summary = text_summary(df['text'][i], summary_percentage=summary_percentage)
        predicted_summaries.append(p_summary)

    return predicted_summaries, ref_summaries


**Evaluation: Rouge Score**

In [23]:
from rouge_score import rouge_scorer

def calculate_rouge_scores(pred, ori):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = [scorer.score(pred, ori) for pred, ori in zip(pred, ori)]
    return rouge_scores

pred_s, ref_s = calculation(df["train"], 100, 0.1)
rouge_scores = calculate_rouge_scores(pred_s, ref_s)

average_rouge1 = sum(scores['rouge1'].fmeasure for scores in rouge_scores) / len(rouge_scores)
average_rouge2 = sum(scores['rouge2'].fmeasure for scores in rouge_scores) / len(rouge_scores)
average_rougeL = sum(scores['rougeL'].fmeasure for scores in rouge_scores) / len(rouge_scores)

# Print the average ROUGE scores
print(f"Average ROUGE-1 score: {average_rouge1:.4f}")
print(f"Average ROUGE-2 score: {average_rouge2:.4f}")
print(f"Average ROUGE-L score: {average_rougeL:.4f}")

Average ROUGE-1 score: 0.1697
Average ROUGE-2 score: 0.0202
Average ROUGE-L score: 0.1130
