# Importing Libraries

*numpy* - for managing arrays

*scipy.stats* - for the Pearson correlation score (which could also be implemented directly in code)

*bs4, requests* - for scraping the page whose summary is required

*nltk* - for tokenizing and finding stopwords

In [2]:
import numpy as np
from scipy.stats import pearsonr
import requests
from bs4 import BeautifulSoup

from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

from copy import deepcopy

# English stopword list from NLTK; used to filter words out of sentences.
stop_words = stopwords.words('english')
# Fraction of the article's sentence count used to cap the final summary size.
recquired_length = 0.25
# Number of sentence tokens in the article; overwritten once the text is tokenized.
total_length = 0

# Function for loading word embeddings

**Change the variable *vectors_location* to the location where you downloaded the file mentioned in README.md**

This function reads the vectors and stores them in a dictionary with the word as the key

In [4]:
# Maps each word to its embedding vector (a float32 numpy array).
word_embeddings = {}

# Default location of the GloVe vectors file; change it or pass another path.
DEFAULT_VECTORS_LOCATION = "/media/tatan/A0A04E3AA04E16E6/word_vectors/glove.6B.100d.txt"


def load_word_embeddings(vectors_location=DEFAULT_VECTORS_LOCATION):
    """Read word vectors from a text file into the global ``word_embeddings``.

    Every non-empty line is split on whitespace: the first field is the word
    and the remaining fields are parsed as float32 coefficients. Entries are
    added to (or overwrite entries of) ``word_embeddings``; nothing is
    returned.

    Args:
        vectors_location: Path of the UTF-8 encoded vectors file. Defaults to
            the path that used to be hard-coded in this function.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a coefficient cannot be parsed as a float.
    """
    # The context manager closes the file even when a line fails to parse.
    with open(vectors_location, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank lines (e.g. a trailing newline) carry no vector.
                continue
            word_embeddings[values[0]] = np.asarray(values[1:], dtype='float32')

**Calling the function to load the embeddings**

In [6]:
# Populate the global word_embeddings dictionary from the vectors file.
load_word_embeddings()

# Loading the Webpage text in variable *text*

In [7]:
# Download the article. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops us from summarizing an
# HTTP error page.
response = requests.get(
    "https://towardsdatascience.com/how-to-build-a-data-science-portfolio-5f566517c79c",
    timeout=30,
)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html,'html.parser')

# A page without a <title> tag yields an empty title instead of an AttributeError.
title_tag = soup.find('title')
title = title_tag.text if title_tag is not None else ""
paras = soup.find_all('p')
# Keep only paragraphs longer than 70 characters to drop captions and other short snippets.
text = [i.text for i in paras if len(i.text)>70]
# Join with a space: without whitespace between paragraphs, the last sentence
# of one paragraph and the first of the next run together ("end.Start") and
# the sentence tokenizer cannot separate them.
text = " ".join(text)

# Creating sentence tokens

Each sentence will later be evaluated to decide whether it is needed in the summary

In [8]:
# Split the article text into individual sentences.
sentence_tokens = sent_tokenize(text)
# Record the sentence count; it is used later to size the final summary.
total_length = len(sentence_tokens)

# Function for extracting Word vectors from the *sentence_tokens*

In [9]:
def get_word_vectors(sentence):
    """Return a list with one embedding vector per known word of *sentence*.

    The sentence is first passed through remove_stopwords_and_punctuations;
    each remaining word is lower-cased and looked up in ``word_embeddings``.
    Words without an embedding are printed and skipped. The returned vectors
    are deep copies, so callers may modify them without touching the
    embedding table.
    """
    cleaned = remove_stopwords_and_punctuations(sentence)
    vectors = []
    for token in cleaned.split():
        key = token.lower()
        try:
            embedding = word_embeddings[key]
        except KeyError as missing:
            # Report the out-of-vocabulary word and move on.
            print(missing)
        else:
            vectors.append(deepcopy(embedding))
    return vectors

# Function for averaging all the values in the vectors axis-wise

In [10]:
def sentencevector_avg(word_vectors, dim=100):
    """Return the element-wise mean of a list of equally sized word vectors.

    Args:
        word_vectors: Sequence of 1-D vectors, all of the same length.
        dim: Length of the zero vector returned when *word_vectors* is empty.
            Defaults to 100, the size that used to be hard-coded here.

    Returns:
        A 1-D numpy array: the mean over all vectors, or ``np.zeros((dim,))``
        when there are no vectors to average.
    """
    if len(word_vectors) == 0:
        return np.zeros((dim,))
    # One vectorized mean over the word axis replaces the per-component loop.
    return np.mean(word_vectors, axis=0)

# Function for averaging all the values in the vectors axis-wise (weighted)

In [11]:
def sentencevector_avg_weighted(word_vectors, tmp_vectors, weight, dim=100):
    """Average *word_vectors* together with *tmp_vectors* scaled by *weight*.

    Every vector in *tmp_vectors* is multiplied by *weight*, the scaled
    vectors are pooled with *word_vectors*, and the element-wise mean over
    the pooled vectors is returned. The divisor is the total number of
    vectors, not the sum of the weights. Neither input list nor any of its
    vectors is modified.

    Args:
        word_vectors: Sequence of 1-D vectors used at full weight.
        tmp_vectors: Sequence of 1-D vectors scaled by *weight* first.
        weight: Scalar factor applied to every vector in *tmp_vectors*.
        dim: Length of the zero vector returned when both inputs are empty.
            Defaults to 100, the size that used to be hard-coded here.

    Returns:
        A 1-D numpy array holding the mean, or ``np.zeros((dim,))`` when there
        are no vectors at all.
    """
    # Build new scaled arrays instead of scaling the caller's vectors in place.
    scaled = [weight * np.asarray(vector) for vector in tmp_vectors]
    # Pool into a fresh list so the caller's word_vectors is not extended.
    combined = list(word_vectors) + scaled
    if len(combined) == 0:
        return np.zeros((dim,))
    return np.mean(combined, axis=0)

# Function for removing stopwords and punctuation from a sentence

In [12]:
def remove_stopwords_and_punctuations(sen):
    """Return *sen* with punctuation and stopwords removed.

    The sentence is split into runs of word characters, which drops all
    punctuation. Words found in ``stop_words`` are discarded, and the
    remaining words are joined with single spaces in their original casing.

    Args:
        sen: The sentence to clean.

    Returns:
        The cleaned sentence as a single space-separated string.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(sen)
    # Compare in lower case: the stopword list is lowercase, so capitalized
    # stopwords such as a sentence-initial "The" would otherwise slip through.
    return " ".join(word for word in words if word.lower() not in stop_words)

# Main part of the code

For each sentence token, it averages the word vectors and then compares the resulting sentence vector to the title vector.

If the Pearson score of the sentence vector is greater than 0.1 (to rule out any unnecessary introductory lines) and at least the running average of the scores calculated so far, the sentence is added to the summary

In [29]:
def getSummary(title,sentence_tokens):
    """Select the sentences that correlate well with the title.

    Each sentence is turned into an averaged word vector and scored with the
    Pearson correlation against the current title vector. A sentence is kept
    when its score is above 0.1 and at least the running average of the
    scores seen so far. After each accepted sentence the title vector is
    rebuilt from the title's word vectors plus that sentence's word vectors
    at half weight.

    Args:
        title: Title text of the article.
        sentence_tokens: Iterable of sentence strings, in document order.

    Returns:
        A list of ``(score, sentence)`` tuples in document order.
    """
    summary = []
    # NOTE(review): the running sum starts at 9 rather than 0, which keeps the
    # average threshold high for the first sentences. Left unchanged; confirm
    # whether this is deliberate tuning or a typo for 0.
    sumoftmps = 9
    ctr = 1
    # Look the title words up once instead of once per accepted sentence.
    title_word_vectors = get_word_vectors(title)
    title_vector = sentencevector_avg(title_word_vectors)
    for sentence_token in sentence_tokens:
        # Computed once and reused for both the score and the title update.
        sentence_word_vectors = get_word_vectors(sentence_token)
        sentence_vector = sentencevector_avg(sentence_word_vectors)

        tmp = pearsonr(sentence_vector,title_vector)[0]
        if np.isnan(tmp):
            # A constant vector (e.g. a sentence with no known words) has no
            # defined correlation. Skip it: adding NaN to the running sum
            # would make every later comparison fail.
            continue
        sumoftmps += tmp
        avg = sumoftmps/ctr
        ctr += 1
        if tmp >= avg and tmp > 0.1:
            summary.append((tmp,sentence_token))
            # Pass a copy of the title vectors so the cached list stays
            # intact even if the averaging helper extends its argument.
            title_vector = sentencevector_avg_weighted(list(title_word_vectors),sentence_word_vectors,0.5)

    return summary

# Running the code

In [None]:
# Tag every accepted sentence with its position among the accepted ones,
# then rank the tagged sentences from highest to lowest score.
ranked = sorted(
    enumerate(getSummary(title,sentence_tokens)),
    key=lambda entry: entry[1][0],
    reverse=True,
)

# Keep the best-scoring share of sentences and restore their original order.
keep_count = int(recquired_length*total_length)
final_summary = sorted(ranked[:keep_count], key=lambda entry: entry[0])

# One sentence per line, each followed by a newline.
summary = "".join(entry[1][1] + "\n" for entry in final_summary)
print(summary,len(summary))