# Simple TextRank implementation

In this notebook I load, preprocess xsum dataset, and create very simple TextRank based on PageRank.

## Imports

In [1]:
from datasets import load_dataset
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import trange
import time
import re
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# stop_words = stopwords.words('english')1

2022-05-24 21:54:41.275255: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
[nltk_data] Downloading package punkt to /home/pasha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/pasha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# import importlib
# import scipy
# import networkx as nx
# scipy = importlib.reload(scipy)
# nx = importlib.reload(nx)

### Prepare embedings for later

In [4]:
# create embedings for each word
word_embeddings = {}
f = open('../..//data/glove/glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()


In [5]:
print(f"Size of embedings: {len(word_embeddings['hello'])}")
print(f"Embeding for hello: {word_embeddings['hello']}")

Size of embedings: 100
Embeding for hello: [ 0.26688    0.39632    0.6169    -0.77451   -0.1039     0.26697
  0.2788     0.30992    0.0054685 -0.085256   0.73602   -0.098432
  0.5479    -0.030305   0.33479    0.14094   -0.0070003  0.32569
  0.22902    0.46557   -0.19531    0.37491   -0.7139    -0.51775
  0.77039    1.0881    -0.66011   -0.16234    0.9119     0.21046
  0.047494   1.0019     1.1133     0.70094   -0.08696    0.47571
  0.1636    -0.44469    0.4469    -0.93817    0.013101   0.085964
 -0.67456    0.49662   -0.037827  -0.11038   -0.28612    0.074606
 -0.31527   -0.093774  -0.57069    0.66865    0.45307   -0.34154
 -0.7166    -0.75273    0.075212   0.57903   -0.1191    -0.11379
 -0.10026    0.71341   -1.1574    -0.74026    0.40452    0.18023
  0.21449    0.37638    0.11239   -0.53639   -0.025092   0.31886
 -0.25013   -0.63283   -0.011843   1.377      0.86013    0.20476
 -0.36815   -0.68874    0.53512   -0.46556    0.27389    0.4118
 -0.854     -0.046288   0.11304   -0.27326   

## Load dataset

In [6]:
# load dataset
ds = load_dataset("xsum", "default", keep_in_memory=True)

Using custom data configuration default
Reusing dataset xsum (/home/pasha/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934)


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# checkout dataset
# 1. note - size of summary is always 1 sentence
print(ds)
print(ds['train']['summary'][3])

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})
A former Lincolnshire Police officer carried out a series of sex attacks on boys, a jury at Lincoln Crown Court was told.


In [10]:
# Let's try to merge train and val and test
def get_docs_sums():
    documents = []
    summaries = []
    for part in ['train', 'validation', 'test']:
        print(f"Part: {part}")
        documents.extend(ds[part]['document'])
        summaries.extend(ds[part]['summary'])
    return documents, summaries

documents, summaries = get_docs_sums()

Part: train
Part: validation
Part: test


In [15]:
# for i in trange(100):
#     if "@" in documents[i]:
#         print(i)
#         break
#
# documents[0]

In [9]:
# now let's divide documents into sentences
# splitted_docs = []
for di in trange(len(documents)):
    documents[di] = nltk.tokenize.sent_tokenize(documents[di])


  4%|▎         | 8304/226711 [00:02<01:16, 2848.91it/s]


KeyboardInterrupt: 

In [None]:
for i in range(3):
    print(f"Number of sentences in document {i}: {len(documents[i])}")

## Very basic text preprocessing

In [10]:
def preprocess_ds(func):
   for di in trange(len(documents)):
       for si, sentence in enumerate(documents[di]):
           documents[di][si] = func(sentence)

In [11]:
# 1. remove non ascii symbols
preprocess_ds(lambda s: re.sub(r"$[^a-zA-Z]", " ", s))

100%|██████████| 226711/226711 [00:07<00:00, 30702.94it/s]


In [12]:
# 2. To lowercase
preprocess_ds(lambda s: s.lower())

100%|██████████| 226711/226711 [00:01<00:00, 226060.14it/s]


In [13]:
# 3. remove stopwords
def remove_stopwords(sen: str):
    sen_new = " ".join([i for i in sen.split() if i not in stop_words])
    return sen_new

preprocess_ds(remove_stopwords)

# has_not = False
# for sen in documents[0]:
#     has_not = has_not or "not" in sen
# print(f"Has not: {has_not}")

100%|██████████| 226711/226711 [03:06<00:00, 1212.80it/s]


In [15]:
orig_documents, _ = get_docs_sums()
for di in trange(len(orig_documents)):
    orig_documents[di] = nltk.tokenize.sent_tokenize(orig_documents[di])

Part: train
Part: validation
Part: test


100%|██████████| 226711/226711 [01:23<00:00, 2719.46it/s]


In [16]:
print(orig_documents[0][0])

The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.


In [17]:
print(documents[0][0])

full cost damage newton stewart, one areas worst affected, still assessed.


In [18]:
# 4. Vectorize with glove
def glove_vectorize(sentence):
    if len(sentence) != 0:
        return sum([word_embeddings.get(w, np.zeros((100,))) for w in sentence.split()]) / (len(sentence.split()) + 0.001)
    return np.zeros((100,))

preprocess_ds(glove_vectorize)

100%|██████████| 226711/226711 [01:10<00:00, 3215.20it/s]


## Algorithm core

In [25]:
def text_rank_summary(document, orig_document, n=1):
    sim_mat = np.zeros([len(document), len(document)])
    for i in range(len(document)):
        for j in range(len(document)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(document[i].reshape(1,100), document[j].reshape(1,100))[0,0]
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)
    # print(list(scores.items()))
    # scores = scores.values()
    sort_order = sorted(scores.keys(), reverse=True, key=lambda i: scores[i])
    # print(sort_order)
    return [orig_document[k] for k in sort_order[:n]]

    # ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(document)), reverse=True)
    # return ranked_sentences[:n]

In [26]:
print(text_rank_summary(documents[0], orig_documents[0], 1))
print(text_rank_summary(documents[1], orig_documents[1], 1))
print(text_rank_summary(documents[2], orig_documents[2], 1))
# print(orig_documents[:2])

['The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.']
['Insp David Gibson said: "It appears as though the fire started under one of the buses before spreading to the second.']
['Ferrari appeared in a position to challenge until the final laps, when the Mercedes stretched their legs to go half a second clear of the red cars.']


## Gensim - use TextRank from lib

In [29]:
import gensim
from gensim.summarization.pagerank_weighted import pagerank_weighted

ModuleNotFoundError: No module named 'gensim.summarization'

In [None]:
# let's count average size of summarization
# ratios = []
# word_counts =

In [None]:
# print(summarize(orig_documents[0]))
# print(summarize(orig_documents[1]))
# print(summarize(orig_documents[2]))