In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive;
# later cells read the model archive and the dataset CSV from paths under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import gzip
import shutil
import warnings
import psutil
from psutil import virtual_memory
import time
from collections.abc import Mapping
from gensim.models import Word2Vec, KeyedVectors

In [None]:
gn_vec_path = "GoogleNews-vectors-negative300.bin"
gn_vec_zip_path = "/content/drive/My Drive/Model/GoogleNews-vectors-negative300.bin.gz"

#Extracting the required model.
#The extraction is skipped when the .bin file is already present, so re-running the cell is cheap.
#Data is first written to a temporary ".part" file and only renamed once the copy has finished,
#so an interrupted run never leaves a truncated file under the final name.
if not os.path.exists(gn_vec_path):
    partial_path = gn_vec_path + ".part"
    with gzip.open(gn_vec_zip_path, 'rb') as f_in, open(partial_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.replace(partial_path, gn_vec_path)

print(f"Model at {gn_vec_path}")

Model at GoogleNews-vectors-negative300.bin


In [None]:
#Silence every warning category for the rest of the session
warnings.filterwarnings("ignore")

#Handle to the current Python process (a later cell reads its resident memory through it)
process = psutil.Process(os.getpid())
#Snapshot of system-wide memory statistics (a later cell reads mem.total)
mem = virtual_memory()

In [None]:
pretrainedpath = gn_vec_path

#Load W2V model. This will take some time, but it is a one time effort!
pre = process.memory_info().rss #Resident memory of this process (bytes) before loading
print("Memory used in GB before Loading the Model: %0.2f"%float(pre/(10**9))) #Check memory usage before loading the model
print('-'*10)

start_time = time.time() #Start the timer
ttl = mem.total #Total memory available

w2v_model = KeyedVectors.load_word2vec_format(pretrainedpath, binary=True) #load the model
print("%0.2f seconds taken to load"%float(time.time() - start_time)) #Calculate the total time elapsed since starting the timer
print('-'*10)

print('Finished loading Word2Vec')
print('-'*10)

post = process.memory_info().rss #Resident memory of this process (bytes) after loading
print("Memory used in GB after Loading the Model: {:.2f}".format(float(post/(10**9)))) #Calculate the memory used after loading the model
print('-'*10)

#Percentage *increase* is the growth relative to the starting value: (post - pre) / pre.
#post / pre alone is the ratio of the two readings and overstates the increase by 100 points.
print("Percentage increase in memory usage: {:.2f}% ".format(float(((post - pre)/pre)*100)))
print('-'*10)

print("Numver of words in vocablulary: ",len(w2v_model)) #Number of words in the vocabulary.

Memory used in GB before Loading the Model: 0.16
----------
62.00 seconds taken to load
----------
Finished loading Word2Vec
----------
Memory used in GB after Loading the Model: 4.28
----------
Percentage increase in memory usage: 2636.88% 
----------
Numver of words in vocablulary:  3000000


In [None]:
#Let us examine the model by looking up the words most similar to a given word!
#The cell output is a list of (word, similarity score) pairs.
w2v_model.most_similar('beautiful')

[('gorgeous', 0.8353005051612854),
 ('lovely', 0.8106936812400818),
 ('stunningly_beautiful', 0.7329413294792175),
 ('breathtakingly_beautiful', 0.7231340408325195),
 ('wonderful', 0.6854086518287659),
 ('fabulous', 0.6700063943862915),
 ('loveliest', 0.6612576246261597),
 ('prettiest', 0.6595001816749573),
 ('beatiful', 0.6593326330184937),
 ('magnificent', 0.6591402888298035)]

In [None]:
#Look up the integer index the model's vocabulary assigns to a word
#NOTE(review): the variable is named rock_idx, but the word looked up is "montreal" -- consider renaming
rock_idx = w2v_model.key_to_index["montreal"]
print(rock_idx)

540171


In [None]:
#Let us try with another word! The query is spelled in lowercase ('toronto').
w2v_model.most_similar('toronto')

[('montreal', 0.6984112858772278),
 ('vancouver', 0.6587257385253906),
 ('nyc', 0.6248832941055298),
 ('alberta', 0.6179691553115845),
 ('boston', 0.611499547958374),
 ('calgary', 0.61032634973526),
 ('edmonton', 0.6100260615348816),
 ('canadian', 0.5944076776504517),
 ('chicago', 0.5911980271339417),
 ('springfield', 0.5888351798057556)]

In [None]:
#What is the vector representation for a word?
#Indexing the model with a word displays that word's embedding as an array of floats.
w2v_model['computer']

array([ 1.07421875e-01, -2.01171875e-01,  1.23046875e-01,  2.11914062e-01,
       -9.13085938e-02,  2.16796875e-01, -1.31835938e-01,  8.30078125e-02,
        2.02148438e-01,  4.78515625e-02,  3.66210938e-02, -2.45361328e-02,
        2.39257812e-02, -1.60156250e-01, -2.61230469e-02,  9.71679688e-02,
       -6.34765625e-02,  1.84570312e-01,  1.70898438e-01, -1.63085938e-01,
       -1.09375000e-01,  1.49414062e-01, -4.65393066e-04,  9.61914062e-02,
        1.68945312e-01,  2.60925293e-03,  8.93554688e-02,  6.49414062e-02,
        3.56445312e-02, -6.93359375e-02, -1.46484375e-01, -1.21093750e-01,
       -2.27539062e-01,  2.45361328e-02, -1.24511719e-01, -3.18359375e-01,
       -2.20703125e-01,  1.30859375e-01,  3.66210938e-02, -3.63769531e-02,
       -1.13281250e-01,  1.95312500e-01,  9.76562500e-02,  1.26953125e-01,
        6.59179688e-02,  6.93359375e-02,  1.02539062e-02,  1.75781250e-01,
       -1.68945312e-01,  1.21307373e-03, -2.98828125e-01, -1.15234375e-01,
        5.66406250e-02, -

In [None]:
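#What if I look up a word that is not in this vocabulary?
#Uncomment the line below to try it: the lookup raises a KeyError because the model has no vector for the word.
#w2v_model['practicalnlp']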

## - Getting the embedding representation for full text

In [None]:
# Download spaCy's en_core_web_md pipeline; the next cell loads it with spacy.load
!python -m spacy download en_core_web_md

In [None]:
import spacy

%time
nlp = spacy.load('en_core_web_md')
# process a sentence using the model
mydoc = nlp("Canada is a large country which i like to test")
# Get a vector for individual words
#print(mydoc[0].vector) #vector for 'Canada', the first word in the text
#print(mydoc.vector) #Averaged vector for the entire sentence
print(len(mydoc.vector))

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.87 µs
300


In [None]:
#What happens when I give a sentence with strange words (and stop words), and try to get its word vector in Spacy?
#temp[0] is the first token of the text; its vector is shown as the cell output.
temp = nlp('practicalnlp is a newword')
temp[0].vector

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

Well, at least, this is better than throwing an exception! :)

# Starting our TextRank model

In [None]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from scipy import spatial
import networkx as nx

In [None]:
# Sample news text used to demonstrate the summariser on a single document; sentences are separated by "\n".
text = "The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.\nRepair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.\nTrains on the west coast mainline face disruption due to damage at the Lamington Viaduct.\nMany businesses and householders were affected by flooding in Newton Stewart after the River Cree overflowed into the town.\nFirst Minister Nicola Sturgeon visited the area to inspect the damage.\nThe waters breached a retaining wall, flooding many commercial properties on Victoria Street - the main shopping thoroughfare.\nJeanette Tate, who owns the Cinnamon Cafe which was badly affected, said she could not fault the multi-agency response once the flood hit.\nHowever, she said more preventative work could have been carried out to ensure the retaining wall did not fail.\n It is difficult but I do think there is so much publicity for Dumfries and the Nith - and I totally appreciate that."

In [None]:
import nltk

# Fetch the NLTK resources used below: 'punkt' for sentence splitting, 'stopwords' for the stop-word list
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

from google.colab import files

# Split the sample text into a list of sentences
sentences = sent_tokenize(text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# One embedding per sentence: run each sentence through the spaCy pipeline and keep its vector
embedded_sentences = [nlp(sentence).vector for sentence in sentences]

In [None]:
# Lower-case every sentence and drop punctuation (any character that is neither a word character nor whitespace)
sentences_clean=[re.sub(r'[^\w\s]','',sentence.lower()) for sentence in sentences]
stop_words = stopwords.words('english')
# split() without an argument splits on any run of whitespace. Splitting on a single space
# leaves empty-string tokens wherever punctuation was removed between two spaces (e.g. " - ").
sentence_tokens=[[word for word in sentence.split() if word not in stop_words] for sentence in sentences_clean]
print(sentence_tokens)

[['full', 'cost', 'damage', 'newton', 'stewart', 'one', 'areas', 'worst', 'affected', 'still', 'assessed'], ['repair', 'work', 'ongoing', 'hawick', 'many', 'roads', 'peeblesshire', 'remain', 'badly', 'affected', 'standing', 'water'], ['trains', 'west', 'coast', 'mainline', 'face', 'disruption', 'due', 'damage', 'lamington', 'viaduct'], ['many', 'businesses', 'householders', 'affected', 'flooding', 'newton', 'stewart', 'river', 'cree', 'overflowed', 'town'], ['first', 'minister', 'nicola', 'sturgeon', 'visited', 'area', 'inspect', 'damage'], ['waters', 'breached', 'retaining', 'wall', 'flooding', 'many', 'commercial', 'properties', 'victoria', 'street', '', 'main', 'shopping', 'thoroughfare'], ['jeanette', 'tate', 'owns', 'cinnamon', 'cafe', 'badly', 'affected', 'said', 'could', 'fault', 'multiagency', 'response', 'flood', 'hit'], ['however', 'said', 'preventative', 'work', 'could', 'carried', 'ensure', 'retaining', 'wall', 'fail'], ['difficult', 'think', 'much', 'publicity', 'dumfries'

In [None]:
# Square matrix of pairwise sentence similarities, filled one row at a time.
# Entry (i, j) is |1 - cosine distance| between the embeddings of sentences i and j.
n_sentences = len(sentence_tokens)
similarity_matrix = np.zeros([n_sentences, n_sentences])
for i, row_embedding in enumerate(embedded_sentences):
    for j in range(len(embedded_sentences)):
        distance = spatial.distance.cosine(row_embedding, embedded_sentences[j])
        similarity_matrix[i][j] = abs(1 - distance)

In [None]:
# Turn the similarity matrix into a graph (one node per row/column index, i.e. per sentence)
# and run PageRank on it; `scores` maps each node index to its rank.
nx_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(nx_graph)

In [None]:
# Map each sentence to the PageRank score of its position in the text
top_sentence = dict((sentence, scores[index]) for index, sentence in enumerate(sentences))
# Order by score, highest first, and keep the first four entries
ranked_sentences = sorted(top_sentence.items(), key=lambda item: item[1], reverse=True)
top = dict(ranked_sentences[:4])

In [None]:
# Selected sentences, kept in the order in which they appear in the text
summary_sentences = [sent for sent in sentences if sent in top]

for sent in summary_sentences:
    print(sent+'\n')

# Open the file once instead of once per sentence, and overwrite it ('w') so that
# re-running the cell does not append a second copy of the summary.
# Each sentence is written on its own line; without a separator the sentences run together.
with open('TextRanked.txt', 'w') as f:
    for sent in summary_sentences:
        f.write(sent + '\n')

The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.

Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.

The waters breached a retaining wall, flooding many commercial properties on Victoria Street - the main shopping thoroughfare.

Jeanette Tate, who owns the Cinnamon Cafe which was badly affected, said she could not fault the multi-agency response once the flood hit.



# Using our model at scale

Loading the XSum dataset

In [None]:
!pip install datasets
from datasets import load_dataset
dataset = load_dataset("xsum")

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/519.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m512.0/519.3 kB[0m [31m18.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/115.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m20.0 MB/s

Downloading builder script:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

In [None]:
# Select the test split and print its description (feature names and number of rows)
# NOTE(review): despite its name, text_column holds the whole split, not a single column
text_column = dataset['test']
print(text_column)

Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 11334
})


Starting to use the algorithm...

In [None]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from scipy import spatial
import networkx as nx
# Make sure the NLTK resources used by sent_tokenize ('punkt') and by the stop-word list are available
nltk.download('punkt')
nltk.download('stopwords')
from google.colab import files

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Create the output folder on the mounted Google Drive
# (-p: create missing parent folders and do not fail if the folder already exists)
!mkdir -p "/content/drive/My Drive/XSum Dataset/Pretrained Word2vec"

In [None]:
df = pd.read_csv(r'/content/drive/My Drive/XSum Dataset/xsum_test.csv')

# Split the test set into twelve chunks: eleven of CHUNK_SIZE rows each, and a last one
# that takes every remaining row. Each chunk is an independent copy, so a column added to
# a chunk later on is not written into a view of `df`.
CHUNK_SIZE = 1000
chunk_bounds = [(k * CHUNK_SIZE, (k + 1) * CHUNK_SIZE) for k in range(11)] + [(11 * CHUNK_SIZE, None)]
(df_1, df_2, df_3, df_4, df_5, df_6,
 df_7, df_8, df_9, df_10, df_11, df_12) = [df[start:stop].copy() for start, stop in chunk_bounds]

# Show one chunk as a sanity check
df_3

Unnamed: 0,document,summary,id
2000,Gary Price was suspended from all council duti...,A decision to suspend a Powys county councillo...,39224291
2001,"The cinema chain has ""provided the British pub...",Curzon will receive an outstanding British con...,38649398
2002,The Iron are currently sixth in the National L...,Braintree manager Danny Cowley said the task a...,35977985
2003,The Slovak showed great determination to beat ...,Debutante Dominika Cibulkova will meet world n...,37809638
2004,Downing Street backed a report by think tank P...,Councils should consider selling off their mos...,19311364
...,...,...,...
2995,Fellow scrum-half Greig Laidlaw was Scotland's...,Henry Pyrgos is ready to make the most of what...,34359802
2996,The BBC's Ian Pannell said the jihadists were ...,As Iraqi special forces enter the city of Mosu...,37833661
2997,On Tuesday Mr Obama cancelled planned talks af...,US President Barack Obama and Philippine leade...,37303922
2998,"In fact, it's actually 21-year-old student Mor...",This is the Taylor Swift lookalike that even t...,32055721


In [None]:
# Documents of the second chunk (df_2); this is the column the summarisation loop iterates over
test_column = df_2['document']
# Path of the CSV file the summarised chunk is written to (a file path, despite the variable name)
file_directory = '/content/drive/My Drive/XSum Dataset/Pretrained Word2vec/xsum_test_textranked_w2v_2.csv'

In [None]:
def _textrank_summary(text, top_n=4):
    """Return an extractive summary of `text` made of its `top_n` best-ranked sentences.

    Each sentence is embedded with the spaCy pipeline `nlp` loaded earlier in the notebook,
    sentences are compared pairwise by cosine similarity, and PageRank over the resulting
    graph scores them. The selected sentences are returned in their original order,
    joined by single spaces. A text without sentences gives an empty string.
    """
    #Tokenizing the text
    sentences = sent_tokenize(text)
    if not sentences:
        return ""

    #Embedding every sentence (the raw sentence text is what gets embedded)
    embedded_sentences = [nlp(sentence).vector for sentence in sentences]
    #An all-zero vector has no direction, so its cosine distance to anything is undefined
    has_direction = [bool(np.any(embedding)) for embedding in embedded_sentences]

    #Calculating similarity using cosine similarity
    n_sentences = len(embedded_sentences)
    similarity_matrix = np.zeros([n_sentences, n_sentences])
    for i, row_embedding in enumerate(embedded_sentences):
        for j, column_embedding in enumerate(embedded_sentences):
            if not (has_direction[i] and has_direction[j]):
                continue  #leave similarity at 0 instead of feeding NaN into PageRank
            similarity_matrix[i][j] = abs(1 - spatial.distance.cosine(row_embedding, column_embedding))

    #Creating the graph and scoring the sentences
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)

    #Rank sentence positions rather than sentence strings, so that a sentence occurring
    #twice is neither merged with its duplicate nor emitted twice.
    ranked_positions = sorted(range(n_sentences), key=lambda position: scores[position], reverse=True)
    chosen_positions = sorted(ranked_positions[:top_n])

    #Join the sentences directly. Going through str(list) and stripping "[", "]", "," and "'"
    #would also strip those characters from the sentences themselves.
    return " ".join(sentences[position] for position in chosen_positions)


#Summaries keyed by the row label of the document; rows with a missing document get no entry
summaries = {}
for n, text in test_column.items():
  if pd.isnull(text):
    continue
  summaries[n] = _textrank_summary(text)
  print("<<<text " + str(n+1) + " summarized>>>")

#Attach all summaries in one step; rows without a summary are left as NaN
df_2 = df_2.assign(textrank=pd.Series(summaries, dtype=object))

df_2.to_csv(file_directory)

df_2.head()
