In [1]:
import numpy as np
import rouge
import re
import difflib
from summa import summarizer
from pdfminer.high_level import *

In [2]:
import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt')

In [3]:
import warnings
# Install a blanket "ignore" filter so no warning is shown by later cells.
# NOTE(review): this also hides deprecation/runtime warnings raised by the
# libraries used below — confirm that silencing everything is intended.
warnings.filterwarnings("ignore")

In [4]:
def _split_around_matches(text, spans):
    """Split ``text`` into pieces around the given matched spans.

    :param text: the string to split.
    :param spans: list of ``(start, size)`` tuples, in increasing order of
        ``start``, marking the substrings that must be kept whole.
    :return: list of strings: each matched span verbatim, with the text before,
        between and after the spans sentence-tokenized by ``nltk.sent_tokenize``.
    """
    pieces = []
    cursor = 0
    for start, size in spans:
        # Unmatched text preceding this span is split into sentences.
        pieces.extend(nltk.sent_tokenize(text[cursor:start]))
        # The matched span itself is kept as one piece.
        pieces.append(text[start:start + size])
        cursor = start + size
    # Whatever follows the last span (or the whole text when there are no spans).
    pieces.extend(nltk.sent_tokenize(text[cursor:]))
    return pieces


def utils_split_sentences(a, b, min_size=20):
    """Split two strings into aligned pieces around their long common substrings.

    ``difflib.SequenceMatcher`` is used to find the blocks shared by ``a`` and
    ``b``. Blocks longer than ``min_size`` characters are kept as single pieces
    in both outputs; all remaining text is split with ``nltk.sent_tokenize``.
    When no block is long enough, both strings are simply sentence-tokenized.

    :param a: first string.
    :param b: second string.
    :param min_size: a matching block is kept only if its size is strictly
        greater than this value (default 20, the previously hard-coded value).
    :return: tuple ``(lst_a, lst_b)`` of lists of strings.
    """
    match = difflib.SequenceMatcher(isjunk=None, a=a, b=b, autojunk=True)
    lst_match = [block for block in match.get_matching_blocks() if block.size > min_size]

    # The same splitting routine is applied to each side; only the start
    # offsets differ (block.a for the first string, block.b for the second).
    lst_a = _split_around_matches(a, [(m.a, m.size) for m in lst_match])
    lst_b = _split_around_matches(b, [(m.b, m.size) for m in lst_match])
    return lst_a, lst_b

In [5]:
def display_string_matching(a, b, both=True, sentences=True, titles=()):
    """Build an HTML string that highlights the fragments shared by two texts.

    Fragments are compared case-insensitively and with punctuation removed
    (every character that is neither a word character nor whitespace is dropped).

    :param a: first text.
    :param b: second text.
    :param both: when ``True`` the shared fragments are highlighted in ``b`` as
        well; otherwise ``b`` is emitted unchanged.
    :param sentences: when ``True`` the texts are split with
        ``utils_split_sentences``; otherwise they are split on whitespace.
    :param titles: optional sequence of titles. ``titles[0]`` is rendered above
        the first text; ``titles[1]`` above the second one. If fewer than two
        titles are given, a dashed separator line precedes the second text.
    :return: HTML markup as a ``str``. The input text is inserted as-is (it is
        not HTML-escaped).
    """
    titles = titles or ()

    def _normalise(fragment):
        # Lower-case and strip punctuation so the comparison ignores both.
        return re.sub(r'[^\w\s]', '', fragment.lower())

    def _highlight(fragments, other_keys):
        # Wrap every fragment whose normalised form occurs on the other side.
        return ' '.join(
            '<span style="background-color:rgba(255,215,0,0.3);">' + frag + '</span>'
            if _normalise(frag) in other_keys else frag
            for frag in fragments
        )

    if sentences is True:
        lst_a, lst_b = utils_split_sentences(a, b)
    else:
        lst_a, lst_b = a.split(), b.split()

    ## highlight a
    # The lookup set is built once instead of re-normalising lst_b per fragment.
    first_text = _highlight(lst_a, {_normalise(z) for z in lst_b})

    ## highlight b
    if both is True:
        second_text = _highlight(lst_b, {_normalise(z) for z in lst_a})
    else:
        second_text = b

    ## concatenate
    if len(titles) > 0:
        first_text = "<strong>" + titles[0] + "</strong><br>" + first_text
    if len(titles) > 1:
        second_text = "<strong>" + titles[1] + "</strong><br>" + second_text
    else:
        second_text = "---"*65 + "<br><br>" + second_text
    return first_text + '<br><br>' + second_text

In [6]:
def evaluate_summary(y_test, predicted):
    """Print the ROUGE-1, ROUGE-2 and ROUGE-L F-scores of a summary.

    The scores come from ``rouge.Rouge().get_scores(y_test, predicted, avg=True)``;
    each F-score is rounded to two decimals and printed together with the mean
    of the three rounded values.

    :param y_test: passed as the first argument of ``get_scores``.
    :param predicted: passed as the second argument of ``get_scores``.
    :return: ``None``; the result is only printed.
    """
    rouge_score = rouge.Rouge()
    scores = rouge_score.get_scores(y_test, predicted, avg=True)
    score_1 = round(scores['rouge-1']['f'], 2)
    score_2 = round(scores['rouge-2']['f'], 2)
    score_L = round(scores['rouge-l']['f'], 2)
    # The rougeL label must report score_L (it previously repeated score_2).
    print("rouge1:", score_1, "| rouge2:", score_2, "| rougeL:", score_L,
          "--> avg rouge:", round(np.mean([score_1, score_2, score_L]), 2))

BASELINE (Extractive Summarization: TextRank)

In [7]:
def textrank(corpus, ratio=0.2, words=20):
    """Summarize one or several documents with ``summarizer.summarize``.

    :param corpus: a single string, or an iterable of strings.
    :param ratio: forwarded to ``summarizer.summarize`` as ``ratio``.
    :param words: forwarded to ``summarizer.summarize`` as ``words``.
    :return: list with one summary per input document (a single string input
        yields a one-element list).

    NOTE(review): summa appears to ignore ``ratio`` whenever ``words`` is
    given, so with the default ``words=20`` the ratio likely has no effect —
    confirm against the summa documentation.
    """
    documents = [corpus] if isinstance(corpus, str) else corpus

    lst_summaries = []
    for document in documents:
        lst_summaries.append(summarizer.summarize(document, ratio=ratio, words=words))
    return lst_summaries

In [8]:
def extract_text_from_pdf(file_path):
    """Return whatever ``extract_text`` yields for ``file_path``.

    ``extract_text`` is not defined in this notebook; it is expected to be
    provided by the ``pdfminer.high_level`` star import.
    """
    return extract_text(file_path)

# the headings should be capital
def isTitle(st):
    """Return ``True`` when ``st`` looks like an all-capitals heading.

    ``str.isupper`` requires at least one cased character and no lowercase
    ones, so strings without any letters (empty strings, page numbers such as
    ``"12"``, ``"2.1"``, punctuation-only lines) are rejected. The former
    ``st.upper() == st`` test accepted all of those as headings.

    :param st: candidate string.
    :return: ``bool``.
    """
    return st.isupper()

# returns the list of (title, para list)
def parse(tex):
    """Group the paragraphs of a document under their headings.

    The text is split on blank lines (``'\\n\\n'``). A chunk accepted by
    ``isTitle`` closes the current section and opens a new one; every other
    chunk is stored as a paragraph of the current section after ``[n]``-style
    citation markers are removed. Paragraphs that are empty or purely numeric
    are dropped.

    Text before the first heading is collected under the title ``'START'``.
    An ``'END'`` sentinel heading is appended so the last section gets flushed.
    Once the current title ends with ``REFERENCES`` or ``BIBLIOGRAPHY`` no
    further heading is recognised, so that section (and the sentinel) is never
    emitted.

    :param tex: full document text.
    :return: list of ``(title, paragraphs)`` tuples, ``paragraphs`` being a
        list of strings.
    """
    ret = []
    tex += '\n\nEND'
    curr_title = 'START'
    curr_lis = []

    # Iterate over the *argument* (the previous code read the global `text`,
    # which ignored both the parameter and the END sentinel added above).
    for chunk in re.split('\n\n', tex):
        chunk = chunk.strip()
        if not chunk:
            # Blank chunks (from runs of empty lines) carry no content.
            continue

        in_references = curr_title.endswith(('REFERENCES', 'BIBLIOGRAPHY'))
        if isTitle(chunk) and not in_references:
            ret.append((curr_title, curr_lis))
            curr_title = chunk
            curr_lis = []
        else:
            # Drop citation markers such as "[12]".
            para = re.sub(r'\[\d+\]', '', chunk).strip()
            if para and not para.isnumeric():
                curr_lis.append(para)

    return ret

In [9]:
# Read the paper's PDF (path relative to the notebook's working directory) into one string.
text = extract_text_from_pdf(
    file_path='Eye_Movement_Tracking_for_Computer_Vision_Syndrome_using_Deep_Learning_Techniques.pdf'
)

In [11]:
# Show each parsed section: its heading surrounded by blank lines, then the paragraph list.
for heading, content in parse(text):
    print(f'\n\n{heading}\n\n')
    print(content)



START


['Eye Movement Tracking for Computer Vision Syndrome using\nDeep Learning Techniques', 'Manan Popat1, Divyan Goyal1, Vibhum Raj1, Nirmal Jayabalan2, and Chittaranjan Hota1', '1Department of Computer Science, BITS Pilani, Hyderabad Campus, India\n2Department of Pharmacy, BITS Pilani, Hyderabad Campus, India\n{f20200029, f20200042, f20200247, nirmalj, hota}@hyderabad.bits-pilani.ac.in', 'Abstract— Due to the increased usage of digital devices in\ndaily life, particularly among children, symptoms such as\ndrying of the eyes, eye strain, headaches, blurred vision, etc.,\nhave become recurrent nowadays. Extensive use of computers\nand smartphones may lead to a common eye-related condition\nknown as Computer Vision Syndrome (CVS). It is often char-\nacterized by a reduced blinking rate of the user. In this paper,\nwe propose a deep neural network and computer vision-based\nmachine learning model that entails training a Convolutional\nNeural Network (CNN) to detect eye blinks, and m

In [16]:
# Run TextRank on every paragraph of every section and print each result.
# NOTE(review): `summary` only ever receives the section headings; the
# per-paragraph `predicted` summaries are printed but never added to it —
# confirm whether they were meant to be appended.
summary = ""
for heading, content in parse(text):
    summary = summary + '\n\n' + heading + '\n\n'
    for para in content:
        predicted = textrank(para, ratio=0.2, words=50)
        print(predicted)

['Eye Movement Tracking for Computer Vision Syndrome using\nDeep Learning Techniques']
['']
['1Department of Computer Science, BITS Pilani, Hyderabad Campus, India\n2Department of Pharmacy, BITS Pilani, Hyderabad Campus, India\n{f20200029, f20200042, f20200247, nirmalj, hota}@hyderabad.bits-pilani.ac.in']
['drying of the eyes, eye strain, headaches, blurred vision, etc.,\nNeural Network (CNN) to detect eye blinks, and monitoring\nblink rates with a Long Short-Term Memory (LSTM) network.\nrate and eye movement patterns have also been identified.\nhas been trained on the Closed Eyes in the Wild (CEW) dataset.']
['Computer Vision Syndrome (CVS) is a term that\nand eye-related difficulties caused by the increased usage\nas “digital eye strain”, causes problems such as dry eyes,\nto monitor the blinking rate, which is a vital sign of CVS.\nExisting research in eye blink detection for CVS has']
['as it required setting distinct threshold values for each image,\npotentially leading to noise-i