In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
#  Import essential packages 


import numpy as np
import pandas as pd
import os
import pickle
import re
from sklearn.datasets import load_files
import glob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from tqdm.auto import tqdm

tqdm.pandas()

In [3]:
# Implementing pretrained word embeddings



from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec


glove_file = datapath(os.path.abspath('glove.6B.50d.txt'))
tmp_file = get_tmpfile(os.path.abspath("test_word2vec.txt"))
converted_file = glove2word2vec(glove_file, tmp_file) 




In [4]:
# Loading the Glove embeddings in word2vec format 


glove_model = KeyedVectors.load_word2vec_format(os.path.abspath("test_word2vec.txt"))

In [5]:
glove_model["the"]  

array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
      dtype=float32)

In [6]:
# Stop word removal function


from spacy.lang.en.stop_words import STOP_WORDS

def stop_word_remove(sentence):
    temp = [token for token in sentence.split() if token not in STOP_WORDS]
    return ' '.join(word for word in temp)

In [7]:
# Function to read the papers from their paths 


def read_paper(path):
  f = open(path, 'r', encoding="utf-8")
  text = str(f.read())
  f.close()
  return text

In [8]:
def process_paper(text):

  # Removes unwanted characters, accounting for unicode characters
    text = re.sub("@&#", " ", text)
    text = re.sub("\n", " ", text)
    text = (text.encode('ascii', 'ignore')).decode("utf-8")

    # Extracting the highlights, body from the paper
    highlights = re.findall(r'HIGHLIGHTS(.*?)KEYPHRASES', text,  flags = re.I)[0]
    body_main = re.findall(r'(?<=\bABSTRACT\b).*', text, flags=re.I)[0]

    body = ' '.join(body_main.split())
    body = body.split('.')
  
    # Removing sentences that are too short or too long, as they wouldn't make apt summary text
    for i,x in enumerate(body):
        if (len(x.split())) < 3 or (len(x.split())) > 15: 
            body.pop(i)
    
    # Making a copy of the body, lowercasing body text, removing punctuations & extra spaces
    dummy_body = []
    for i in body:
        i= i.lower()
        i = re.sub('[^\w\s\d\.]','',i)   ###[^\w\s\d\.] '.'keep here
        dummy_body.append(i)

    # Making a copy of the highlights, lowercasing body text, removing punctuations & extra spaces
    dummy_highlights = highlights.lower()
    dummy_highlights = re.sub('[^\w\s\d]','',dummy_highlights) ###[^\w\s\d] '.' remove here
    dummy_highlights = ' '.join(dummy_highlights.split())
    
    # Removing stop words from body & highlights
    body_copy = []
    for x in dummy_body:
        body_copy.append(stop_word_remove(x))  # call to stop word function
        
    highlight_copy = []
    for x in dummy_highlights.split():
        highlight_copy.append(stop_word_remove(x)) # call to stop word function

    # Combing all of the highlights into one string    
    highlight_copy = " ".join(sentence for sentence in highlight_copy)
    highlight_copy = " ".join(highlight_copy.split())

    return body_main, body_copy, highlights, highlight_copy,body

In [9]:

def create_body_copy_main(path):
  text = read_paper(path)
  body_main, body_copy, highlights, highlight_copy, body = process_paper(text)
  return body_copy

In [10]:
Body_Copy_Main = []

paths = glob.glob("Parsed_Papers/*.txt")
for i,path in enumerate(tqdm(paths[0:20])):
    body_copy = create_body_copy_main(path)
    Body_Copy_Main.extend(body_copy)    

  0%|          | 0/20 [00:00<?, ?it/s]

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

vectorizer = TfidfVectorizer()
vectorizer.fit(Body_Copy_Main)

In [13]:
from scipy.special import expit


# Function to calculate sentence Score
def document_score(body_copy, highlight_copy):
  # Getting word vectors for the body
  body_vectors = [] 
  for sent in body_copy:
      sent_vec = [] 
      for word in sent.split():
          if word in glove_model:
              sent_vec.append(glove_model[word])
          # If the word vector isn't there in the model
          # then use the vector of the word "Visual"
          else:
              sent_vec.append(glove_model["visual"])
      body_vectors.append(sent_vec)

  # Getting word vectors for the highlights
  highlight_vectors = []
  for word in highlight_copy.split():
      if word in glove_model:
          highlight_vectors.append(glove_model[word])
      else:
          highlight_vectors.append(glove_model["visual"])

  # Finding the rouge score for each sentence by counting the # of common words
  # & dividing by length of sentence
  doc_score = []
  for sent in body_vectors:
      sent_score = 0
      for word in sent:
          for w in highlight_vectors:
              if (word == w).all():
                  sent_score+=1
      if sent_score>0: 
        doc_score.append(expit(sent_score/len(sent)))
      else:
        doc_score.append(0)
  return doc_score

In [14]:

def Vectorizer_Count(body_copy):
    bow_table = vectorizer.transform(body_copy)
    bow_table = bow_table.toarray()
    return bow_table

In [15]:
# Function to create datasets
def create_data(path):
  text = read_paper(path)
  body_main, body_copy, highlights, highlight_copy, body = process_paper(text)
  doc_score = document_score(body_copy, highlight_copy)
  bow_table = Vectorizer_Count(body_copy)
  x = bow_table       
  y = pd.Series(doc_score)  
  return x, y, body

In [16]:
# Import Stocastic Gradient Descent Regressor model


from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor

In [17]:
# Create a Gaussian Classifier 


Model = GradientBoostingRegressor()

In [18]:
#  Creating a list of all file paths & partially fitting the model 

paths = glob.glob("Parsed_Papers/*.txt")
for i,path in enumerate(tqdm(paths[0:20])):
    x, y, body = create_data(path)         
    Model.fit(x,y)

  0%|          | 0/20 [00:00<?, ?it/s]

### Testing

In [25]:
# Test Data

x1, y1, body = create_data("Parsed_Papers/S0003687016300539.txt")


In [26]:
# --- Predicting the top 4 summary sentences --- 

c = Model.predict(x1)
lst = pd.Series(c)
i = lst.nlargest(4)
i = i.index.values.tolist()
i # Indices

[50, 9, 106, 80]

In [27]:
#  --- Predicted summary --- 

summary = []

for x in i:
    summary.append(body[x])

summary

[' on stairways, the observer was also instructed to keep an eye on the wayfinder by walking behind him/her',
 ' Route complexity is a growing problem in hospitals because hospitals are expanding in size due to the increasing demand for health care, more specialized care, and more diagnostic techniques',
 ' This might imply that participants have an incomplete representation of the spatial setting, and therefore rely on the central point wayfinding strategy, meaning that they first walk towards a central point like the main entry hall or main corridors (Hlsher etal',
 ' A route efficiency ratio larger than 1 indicates that participants walked more meters than strictly necessary']

In [28]:
Paper_Summary = ' '.join(summary)
Paper_Summary = ' '.join(Paper_Summary.split())
Paper_Summary = re.sub('[^\w\s\d\.]','',Paper_Summary)
Paper_Summary

'on stairways the observer was also instructed to keep an eye on the wayfinder by walking behind himher Route complexity is a growing problem in hospitals because hospitals are expanding in size due to the increasing demand for health care more specialized care and more diagnostic techniques This might imply that participants have an incomplete representation of the spatial setting and therefore rely on the central point wayfinding strategy meaning that they first walk towards a central point like the main entry hall or main corridors Hlsher etal A route efficiency ratio larger than 1 indicates that participants walked more meters than strictly necessary'

In [29]:
# --- Highlights - which are gold standard summary of the paper --- 


text = read_paper("Parsed_Papers/S0003687016300539.txt")
body_main,_, highlights,_,_ = process_paper(text)
highlights = " ".join(highlights.split()).split(".")
highlights

['Route complexity negatively influences wayfinding performance in terms of route efficiency and walking speed',
 ' Simulated elderly participants have higher heart rates and respiratory rates during a wayfinding task',
 ' Simulated physical ageing and route complexity do not interact on wayfinding performance and physiological outcomes',
 ' Physical ageing was simulated in an age-simulation field experiment by using gerontologic suits',
 ' A portable heart rate monitor was used to assess physiological outcomes like heart rate and respiratory rate',
 '']