### Extraction of title from pdf

In [None]:
# importing necessary libraries and packages
import fitz
import re
import nltk
import string
import PyPDF2

In [None]:
# Input pdf path
pdf_path = '1805.06130.pdf'

In [None]:
def extract_title(pdf_file):
    with open(pdf_file, 'rb') as pdf_file_obj:
        pdf_reader = PyPDF2.PdfReader(pdf_file_obj)

        # Attempt to extract the title from the first line of the text
        pdf_text = pdf_reader.pages[0].extract_text()
        line = pdf_text.split('\n')[:2]
        first_line = ' '.join(line).strip()

        # If the first line is not empty, use it as the title
        if first_line:
            pdf_title = first_line
        elif pdf_title == '':
            # If the first line is empty, use the PDF file name
            pdf_title = pdf_file.split('/')[-1]
        else:
            document_info = pdf_reader.metadata
            pdf_title = document_info.get('/Title')

        return pdf_title

In [None]:
title = extract_title(pdf_path)

In [None]:
title, type(title)

('Towards Robust Neural Machine Translation Yong Cheng?, Zhaopeng Tu?, Fandong Meng?, Junjie Zhai?and Yang Liuy',
 str)

In [None]:
# Getting the details of fontname and fontsize attribute for each line within pdfs

import fitz

def scrape(filePath):
    results = [] # list of tuples that store the information as (text, font size, font name)
    pdf = fitz.open(filePath) # filePath is a string that contains the path to the pdf
    for page in pdf:
        dict = page.get_text("dict")
        blocks = dict["blocks"]
        for block in blocks:
            if "lines" in block.keys():
                spans = block['lines']
                for span in spans:
                    data = span['spans']
                    for lines in data:
                            results.append((lines['text'], lines['size'], lines['font']))
                            # lines['text'] -> string, lines['size'] -> font size, lines['font'] -> font name
    pdf.close()
    return results

In [None]:
output = scrape(pdf_path)
output

[('Towards Robust Neural Machine Translation',
  14.346199989318848,
  'NimbusRomNo9L-Medi'),
 ('Yong Cheng', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('⋆', 7.970099925994873, 'CMMI8'),
 (', Zhaopeng Tu', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('⋆', 7.970099925994873, 'CMMI8'),
 (', Fandong Meng', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('⋆', 7.970099925994873, 'CMMI8'),
 (', Junjie Zhai', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('⋆', 7.970099925994873, 'CMMI8'),
 (' ', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('and Yang Liu', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('†', 7.970099925994873, 'CMSY8'),
 ('⋆', 7.970099925994873, 'CMMI8'),
 ('Tencent AI Lab, China', 11.9552001953125, 'NimbusRomNo9L-Regu'),
 ('†', 7.970099925994873, 'CMSY8'),
 ('State Key Laboratory of Intelligent Technology and Systems',
  11.9552001953125,
  'NimbusRomNo9L-Regu'),
 ('Beijing National Research Center for Information Science and Technology',
  11.9552001953125,
  'NimbusRomNo9L-Regu'),
 ('Departm

In [None]:
## Validation of correct title extraction
if title is None or title == '':
      new_title = ''
      max_font_size = max(output, key=lambda x: x[1])[1]
      elements_with_max_font = [element for element in output if element[1] == max_font_size]

      print("PDF title is:\n")
      for element in elements_with_max_font:
            new_title += ' ' + element[0]
      print(new_title)
else:
      new_title = title
      print(new_title)

Towards Robust Neural Machine Translation Yong Cheng?, Zhaopeng Tu?, Fandong Meng?, Junjie Zhai?and Yang Liuy


In [None]:
# packages required for text pre-processing
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords

from nltk.tokenize import sent_tokenize, word_tokenize

stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Tokenization, lower-case conversion and stop-word removal

def preprocess_text(text):

  sentences = sent_tokenize(text)  # Split into sentences
  tokens = [word_tokenize(sentence.lower()) for sentence in sentences]  # Tokenize each sentence
  preprocessed_tokens = []
  for sentence_tokens in tokens:
    filtered_tokens = [token for token in sentence_tokens if token not in stop_words]  # Remove stop words
    preprocessed_tokens.extend(filtered_tokens)
  return preprocessed_tokens

In [None]:
clean_title = preprocess_text(new_title)
print(clean_title)

['towards', 'robust', 'neural', 'machine', 'translation', 'yong', 'cheng', '?', ',', 'zhaopeng', 'tu', '?', ',', 'fandong', 'meng', '?', ',', 'junjie', 'zhai', '?', 'yang', 'liuy']


In [None]:
# Removing punctuations from title

import string

def remove_punctuations(text):

  punctuations = string.punctuation + "/_*"

  no_punct_text = " ".join([c for c in text if c not in punctuations])

  return no_punct_text

clean_title = remove_punctuations(clean_title)
print(f"Text without punctuations: {clean_title}")

Text without punctuations: towards robust neural machine translation yong cheng zhaopeng tu fandong meng junjie zhai yang liuy


### Extraction of text from other sections of pdf

##### Extracting section headers using fontsize and fontname attributes

In [None]:
# Determining the font size and font name of 'Abstract' using its index position

abstract_index = next(i for i, item in enumerate(output) if item[0] == "Abstract " or item[0] == 'ABSTRACT ' or item[0] == "Abstract" or item[0] == 'ABSTRACT'  or item[0] == "Abstract: ")

font_size = output[abstract_index][1]
font_name = output[abstract_index][2]
print(font_size, font_name)

11.9552001953125 NimbusRomNo9L-Medi


In [None]:
# Filtering out only those items which have similar properties as 'Abstract' and considering them as other section headers

filtered_items = [item for item in output if item[1] == font_size and item[2] == font_name]
filtered_items = filtered_items[:-1] # Excluding references from headers
filtered_items

[('Yong Cheng', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 (', Zhaopeng Tu', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 (', Fandong Meng', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 (', Junjie Zhai', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 (' ', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('and Yang Liu', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('Abstract', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('1', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('Introduction', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('2', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('Background', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('3', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('Approach', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('4', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('Experiments', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('5', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('Related Work', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('6', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('Conclusion', 11.95520

In [None]:
# Removing unwanted parts of section headers (For eg. Author names, Acknowledgements, References etc.)

new_abstract_index = next((i for i, item in enumerate(filtered_items) if item[0] == 'Abstract 'or item[0] == 'ABSTRACT'
                           or item[0] == 'Abstract' or item[0] == 'ABSTRACT '), None)

new_items = filtered_items[new_abstract_index:] if new_abstract_index is not None else filtered_items

new_con_index = next((i for i, element in enumerate(new_items) if element[0] == 'CONCLUSIONS' or element[0] == 'CONCLUSIONS '
                      or element[0] == 'Conclusions' or element[0] == 'Conclusions ' or element[0] == 'Conclusion'
                      or element[0] == 'Conclusion '), None)

new_items = new_items[:new_con_index + 1]

new_items


[('Abstract', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('1', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('Introduction', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('2', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('Background', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('3', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('Approach', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('4', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('Experiments', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('5', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('Related Work', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('6', 11.9552001953125, 'NimbusRomNo9L-Medi'),
 ('Conclusion', 11.9552001953125, 'NimbusRomNo9L-Medi')]

##### Formatting the headers to match the actual pdf headers

In [None]:
# Formatting the name of the headers to match the pattern given in original pdfs

output = []
current_element = ''

for i, item in enumerate(new_items):
    if i > 0 and item[0].isdigit() and not new_items[i - 1][0].isdigit():

        output.append(current_element.strip())
        current_element = item[0]
    else:
        current_element += ' ' + item[0]

# Add the last element to the output
if current_element:
    output.append(current_element.strip())

print(output)

['Abstract', '1 Introduction', '2 Background', '3 Approach', '4 Experiments', '5 Related Work', '6 Conclusion']


In [None]:
# Removing other subsections falling under 'Abstract' for less complication

new_output = []

if(len(output[0].split(' '))>1):
    words = output[0].split(' ')
    new_output.append(words[0])

else:
    new_output = output

new_output

['Abstract',
 '1 Introduction',
 '2 Background',
 '3 Approach',
 '4 Experiments',
 '5 Related Work',
 '6 Conclusion']

In [None]:
# Making a cleaner list of all section headers for individual pdfs

for i in range(1,len(output)):
  if(output != new_output):
    element = output[i].split('.')[0][:-1]
    element_new = ''
    if(output[i].split('.')[0][-1].isdigit()):
        element_new = element
    else:
        element_new = output[i].split('.')[0]

    new_output.append(element_new.rstrip())
  else:
     pass

new_output

['Abstract',
 '1 Introduction',
 '2 Background',
 '3 Approach',
 '4 Experiments',
 '5 Related Work',
 '6 Conclusion']

### Extract the entire text of pdf

In [None]:
# Parse the entire text from individual pdf

def extract_text_from_pdf(pdf_file_path):
    """Extracts text from a PDF file and returns it as a string."""

    with open(pdf_file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        num_pages = len(pdf_reader.pages)

        full_text = ""
        for page_num in range(num_pages):
            page = pdf_reader.pages[page_num]
            page_text = page.extract_text()
            full_text += page_text

    return full_text

In [None]:
extracted_text = extract_text_from_pdf(pdf_path)
print(extracted_text)

Towards Robust Neural Machine Translation
Yong Cheng?, Zhaopeng Tu?, Fandong Meng?, Junjie Zhai?and Yang Liuy
?Tencent AI Lab, China
yState Key Laboratory of Intelligent Technology and Systems
Beijing National Research Center for Information Science and Technology
Department of Computer Science and Technology, Tsinghua University, Beijing, China
Beijing Advanced Innovation Center for Language Resources
chengyong3001@gmail.com
fzptu, fandongmeng, jasonzhai g@tencent.com
liuyang2011@tsinghua.edu.cn
Abstract
Small perturbations in the input can
severely distort intermediate representa-
tions and thus impact translation quality of
neural machine translation (NMT) mod-
els. In this paper, we propose to improve
the robustness of NMT models with adver-
sarial stability training. The basic idea is
to make both the encoder and decoder in
NMT models robust against input pertur-
bations by enabling them to behave sim-
ilarly for the original input and its per-
turbed counterpart. Experimental res

In [None]:
# Omitting 'References' section from extracted text
def remove_references(pdf_text):
    references_pattern = re.compile(r'References\s*[\r\n]+.*', re.DOTALL | re.IGNORECASE)

    text_without_references = re.sub(references_pattern, '', pdf_text)

    return text_without_references

In [None]:
extracted_text = remove_references(extracted_text)
extracted_text

'Towards Robust Neural Machine Translation\nYong Cheng?, Zhaopeng Tu?, Fandong Meng?, Junjie Zhai?and Yang Liuy\n?Tencent AI Lab, China\nyState Key Laboratory of Intelligent Technology and Systems\nBeijing National Research Center for Information Science and Technology\nDepartment of Computer Science and Technology, Tsinghua University, Beijing, China\nBeijing Advanced Innovation Center for Language Resources\nchengyong3001@gmail.com\nfzptu, fandongmeng, jasonzhai g@tencent.com\nliuyang2011@tsinghua.edu.cn\nAbstract\nSmall perturbations in the input can\nseverely distort intermediate representa-\ntions and thus impact translation quality of\nneural machine translation (NMT) mod-\nels. In this paper, we propose to improve\nthe robustness of NMT models with adver-\nsarial stability training. The basic idea is\nto make both the encoder and decoder in\nNMT models robust against input pertur-\nbations by enabling them to behave sim-\nilarly for the original input and its per-\nturbed counte

In [None]:
# Removing non-ascii characters, urls from extracted text

text_without_non_ascii = re.sub(r"[^\x00-\x7F]", "", extracted_text)
text_without_non_ascii = re.sub(r",.-/:","",text_without_non_ascii)
cleaned_text = re.sub(r"h/t_tps?://[^\s]+","",text_without_non_ascii)

print(cleaned_text)

Towards Robust Neural Machine Translation
Yong Cheng?, Zhaopeng Tu?, Fandong Meng?, Junjie Zhai?and Yang Liuy
?Tencent AI Lab, China
yState Key Laboratory of Intelligent Technology and Systems
Beijing National Research Center for Information Science and Technology
Department of Computer Science and Technology, Tsinghua University, Beijing, China
Beijing Advanced Innovation Center for Language Resources
chengyong3001@gmail.com
fzptu, fandongmeng, jasonzhai g@tencent.com
liuyang2011@tsinghua.edu.cn
Abstract
Small perturbations in the input can
severely distort intermediate representa-
tions and thus impact translation quality of
neural machine translation (NMT) mod-
els. In this paper, we propose to improve
the robustness of NMT models with adver-
sarial stability training. The basic idea is
to make both the encoder and decoder in
NMT models robust against input pertur-
bations by enabling them to behave sim-
ilarly for the original input and its per-
turbed counterpart. Experimental res

In [None]:
# Copying the header names within a new variable

sections = new_output
len(sections)

7

In [None]:
# Extraction of text under individual sections of pdfs using the header names

section_extraction = []

for i in range(len(sections)-1):
    start_index = cleaned_text.find(sections[i])
    end_index = cleaned_text.find(sections[i+1])

    if start_index != -1 and end_index != -1:
        extraction = cleaned_text[start_index:end_index].strip()
        print("Markers found in the text.",sections[i])
        section_extraction.append(extraction)
    else:
        print("Markers not found in the text.",sections[i])

# Extract the last section separately
last_start_index = cleaned_text.find(sections[-1])
if last_start_index != -1:
    last_extraction = cleaned_text[last_start_index:].strip()
    print("Markers found in the text.", sections[-1])
    section_extraction.append(last_extraction)
else:
    print("Markers not found in the text.", sections[-1])

Markers found in the text. Abstract
Markers found in the text. 1 Introduction
Markers found in the text. 2 Background
Markers found in the text. 3 Approach
Markers found in the text. 4 Experiments
Markers found in the text. 5 Related Work
Markers found in the text. 6 Conclusion


In [None]:
section_extraction

['Abstract\nSmall perturbations in the input can\nseverely distort intermediate representa-\ntions and thus impact translation quality of\nneural machine translation (NMT) mod-\nels. In this paper, we propose to improve\nthe robustness of NMT models with adver-\nsarial stability training. The basic idea is\nto make both the encoder and decoder in\nNMT models robust against input pertur-\nbations by enabling them to behave sim-\nilarly for the original input and its per-\nturbed counterpart. Experimental results\non Chinese-English, English-German and\nEnglish-French translation tasks show that\nour approaches can not only achieve sig-\nnicant improvements over strong NMT\nsystems but also improve the robustness of\nNMT models.',
 '1 Introduction\nNeural machine translation (NMT) models have\nadvanced the state of the art by building a sin-\ngle neural network that can better learn represen-\ntations (Cho et al., 2014; Sutskever et al., 2014).\nThe neural network consists of two compone

In [None]:
# Tokenization, Stop word removal from extracted pdf text
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(filtered_text)

# Process each elements in section_extraction

processed_text_final = [remove_stopwords(section) for section in section_extraction]
processed_text_final

['Abstract Small perturbations input severely distort intermediate representa- tions thus impact translation quality neural machine translation ( NMT ) mod- els . paper , propose improve robustness NMT models adver- sarial stability training . basic idea make encoder decoder NMT models robust input pertur- bations enabling behave sim- ilarly original input per- turbed counterpart . Experimental results Chinese-English , English-German English-French translation tasks show approaches achieve sig- nicant improvements strong NMT systems also improve robustness NMT models .',
 '1 Introduction Neural machine translation ( NMT ) models advanced state art building sin- gle neural network better learn represen- tations ( Cho et al. , 2014 ; Sutskever et al. , 2014 ) . neural network consists two components : encoder network encodes input sen- tence sequence distributed representa- tions , based decoder network generates translation attention model ( Bahdanau et al. , 2015 ; Luong et al. , 2015

### Applying LSA for summarizing the section extraction

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [None]:
# Vectorizing the extracted sentences
vectorizer = TfidfVectorizer()

tfidf_matrix = vectorizer.fit_transform(section_extraction)

# 3. Create a TruncatedSVD object for LSA
lsa = TruncatedSVD(n_components = 7)

In [None]:
# 4. Perform LSA on the TF-IDF matrix

lsa_matrix = lsa.fit_transform(tfidf_matrix)
print(lsa_matrix)

[[ 0.70876345 -0.28841418  0.42276639 -0.1745527  -0.40432219 -0.05196019
  -0.19772526]
 [ 0.85133022 -0.0324269  -0.03201699 -0.09140927 -0.17124069  0.20806354
   0.43839572]
 [ 0.74725656 -0.16071023 -0.45669857 -0.24428816  0.05612187 -0.3799713
   0.00130866]
 [ 0.80619702 -0.168069   -0.24879    -0.0310489   0.22860714  0.38013932
  -0.24934223]
 [ 0.70937023  0.04854717 -0.09776624  0.67183992 -0.15092214 -0.09784497
  -0.03404113]
 [ 0.59119517  0.77748725  0.01882944 -0.16720318 -0.08072237 -0.00237555
  -0.10568183]
 [ 0.70980384 -0.00911899  0.4616501   0.04420938  0.50844142 -0.12964386
   0.07549738]]


In [None]:
# Using LSA for summarizing the text of each individual section

# Calculate LSA scores for each sentence
lsa_scores = np.sum(np.abs(lsa_matrix), axis=1)

# Group sentences by section and rank within each group
grouped_sentences = {}
for sentence, section, score in zip(section_extraction, sections, lsa_scores):
    grouped_sentences.setdefault(section, []).append((score, sentence))

# Sentence tokenization function

def tokenize_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

# Tokenize sentences for each section
section_sentences = {section: tokenize_sentences(' '.join(sentence[1] for sentence in sentences)) for section, sentences in grouped_sentences.items()}

# Choose the top-ranked sentences from each section
num_top_sentences = 5
top_ranked_sentences = []

for section, sentences in grouped_sentences.items():
    # Sort sentences by score in descending order
    sentences = sorted(sentences, key=lambda x: x[0], reverse=True)

    # Choose the top-ranked sentences
    top_sentences = [sentence[1] for sentence in sentences[:num_top_sentences]]

    # Print the content of each sentence
    print(f"Section {section} sentences:\n", top_sentences)

Section Abstract sentences:
 ['Abstract\nSmall perturbations in the input can\nseverely distort intermediate representa-\ntions and thus impact translation quality of\nneural machine translation (NMT) mod-\nels. In this paper, we propose to improve\nthe robustness of NMT models with adver-\nsarial stability training. The basic idea is\nto make both the encoder and decoder in\nNMT models robust against input pertur-\nbations by enabling them to behave sim-\nilarly for the original input and its per-\nturbed counterpart. Experimental results\non Chinese-English, English-German and\nEnglish-French translation tasks show that\nour approaches can not only achieve sig-\nnicant improvements over strong NMT\nsystems but also improve the robustness of\nNMT models.']
Section 1 Introduction sentences:
 ['1 Introduction\nNeural machine translation (NMT) models have\nadvanced the state of the art by building a sin-\ngle neural network that can better learn represen-\ntations (Cho et al., 2014; Suts

In [None]:
# Creating a list to store the top-5 sentences of each section

# Iterate through each section and extract the top-ranked sentences
for section, sentences in section_sentences.items():
    # Sort sentences by score in descending order
    sentences = sorted(sentences, key=lambda x: x[0], reverse=True)

    # Choose the top-ranked sentences
    top_sentences = [sentence for sentence in sentences[:num_top_sentences]]

    # Append the top sentences to the result list
    top_ranked_sentences.extend(top_sentences)

    print(f"Top-ranked sentences for Section {section}:\n" + ''.join(sentence + '\n' for sentence in top_sentences) + '\n')


Top-ranked sentences for Section Abstract:
The basic idea is
to make both the encoder and decoder in
NMT models robust against input pertur-
bations by enabling them to behave sim-
ilarly for the original input and its per-
turbed counterpart.
In this paper, we propose to improve
the robustness of NMT models with adver-
sarial stability training.
Experimental results
on Chinese-English, English-German and
English-French translation tasks show that
our approaches can not only achieve sig-
nicant improvements over strong NMT
systems but also improve the robustness of
NMT models.
Abstract
Small perturbations in the input can
severely distort intermediate representa-
tions and thus impact translation quality of
neural machine translation (NMT) mod-
els.


Top-ranked sentences for Section 1 Introduction:
We investigate severe variations of trans-
lations caused by small input perturbations by re-
placing one word in each sentence of a test set with
its synonym.
We observe that 69:74% of tra

In [None]:
## Creating a dictionary for stroing top-ranked sentences

top_ranked_sentences = {}

# Iterate through each section and extract the top-ranked sentences
for section, sentences in section_sentences.items():
    # Sort sentences by score in descending order
    sentences = sorted(sentences, key=lambda x: x[0], reverse=True)

    # Choose the top-ranked sentences
    top_sentences = [sentence for sentence in sentences[:num_top_sentences]]

    # Store the top sentences in the dictionary
    top_ranked_sentences[section] = top_sentences

    # Print the top-ranked sentences for each section
    print(f"Top-ranked sentences for Section {section}:\n" + ''.join(sentence + '\n' for sentence in top_sentences) + '\n')

Top-ranked sentences for Section Abstract:
The basic idea is
to make both the encoder and decoder in
NMT models robust against input pertur-
bations by enabling them to behave sim-
ilarly for the original input and its per-
turbed counterpart.
In this paper, we propose to improve
the robustness of NMT models with adver-
sarial stability training.
Experimental results
on Chinese-English, English-German and
English-French translation tasks show that
our approaches can not only achieve sig-
nicant improvements over strong NMT
systems but also improve the robustness of
NMT models.
Abstract
Small perturbations in the input can
severely distort intermediate representa-
tions and thus impact translation quality of
neural machine translation (NMT) mod-
els.


Top-ranked sentences for Section 1 Introduction:
We investigate severe variations of trans-
lations caused by small input perturbations by re-
placing one word in each sentence of a test set with
its synonym.
We observe that 69:74% of tra

### Measuring the similarity score of each section summaries with user query

#### Using TF-IDF embedding

In [None]:
# Similarity score of query with each section containing entire text in that section before applying LSA

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

def text_similarity(text1, text2):
    # Ensure that the input is a string
    text1 = str(text1)
    text2 = str(text2)

    # Tokenize and lemmatize the texts, applying lowercasing to individual words
    tokens1 = [word.lower() for word in sent_tokenize(text1)]
    tokens2 = [word.lower() for word in sent_tokenize(text2)]
    lemmatizer = WordNetLemmatizer()
    tokens1 = [lemmatizer.lemmatize(token) for token in tokens1]
    tokens2 = [lemmatizer.lemmatize(token) for token in tokens2]

    # Remove stopwords
    stop_words = stopwords.words('english')
    tokens1 = [token for token in tokens1 if token not in stop_words]
    tokens2 = [token for token in tokens2 if token not in stop_words]

    # Join tokens into strings
    text1_processed = ' '.join(tokens1)
    text2_processed = ' '.join(tokens2)

    # Create the TF-IDF vectors
    vectorizer = TfidfVectorizer()
    vector1 = vectorizer.fit_transform([text1_processed])
    vector2 = vectorizer.transform([text2_processed])

    # Calculate the cosine similarity (single value for whole texts)
    similarity = cosine_similarity(vector1, vector2)[0][0]

    return similarity

# Example usage:
text2 = 'Neural Machine Translation; Attention Mechanism, Language Translation.'

# Calculate similarity with each section header
similarity_scores = {}
for section in sections:
    similarity_score = text_similarity(section_extraction[sections.index(section)], text2)
    similarity_scores[section] = similarity_score

# Display similarity scores
for section, score in similarity_scores.items():
    print(f"Similarity score with Section '{section}': {score}")

Similarity score with Section 'Abstract': 0.21969401455243595
Similarity score with Section '1 Introduction': 0.16707667138625515
Similarity score with Section '2 Background': 0.09953452035868789
Similarity score with Section '3 Approach': 0.027434434688052876
Similarity score with Section '4 Experiments': 0.07698533809506916
Similarity score with Section '5 Related Work': 0.04598916507931692
Similarity score with Section '6 Conclusion': 0.11917591430622483


In [None]:
# Similarity score of query with each section containing only top 5 ranked sentences in that section after applying LSA

def text_similarity(text1, text2):
    # Ensure that the input is a string
    text1 = str(text1)
    text2 = str(text2)

    # Tokenize and lemmatize the texts, applying lowercasing to individual words
    tokens1 = [word.lower() for word in sent_tokenize(text1)]
    tokens2 = [word.lower() for word in sent_tokenize(text2)]
    lemmatizer = WordNetLemmatizer()
    tokens1 = [lemmatizer.lemmatize(token) for token in tokens1]
    tokens2 = [lemmatizer.lemmatize(token) for token in tokens2]

    # Remove stopwords
    stop_words = stopwords.words('english')
    tokens1 = [token for token in tokens1 if token not in stop_words]
    tokens2 = [token for token in tokens2 if token not in stop_words]

    # Join tokens into strings
    text1_processed = ' '.join(tokens1)
    text2_processed = ' '.join(tokens2)

    # Create the TF-IDF vectors
    vectorizer = TfidfVectorizer()
    vector1 = vectorizer.fit_transform([text1_processed])
    vector2 = vectorizer.transform([text2_processed])

    # Calculate the cosine similarity (single value for whole texts)
    similarity = cosine_similarity(vector1, vector2)[0][0]

    return similarity

# Example usage:
text2 = 'Neural Machine Translation; Attention Mechanism, Language Translation.'

# Calculate similarity with each section header
similarity_scores = {}
for section in sections:
    similarity_score = text_similarity(' '.join(top_ranked_sentences[section]), text2)
    similarity_scores[section] = similarity_score

# Display similarity scores
for section, score in similarity_scores.items():
    print(f"Similarity score with Section after LSA'{section}': {score}")

Similarity score with Section after LSA'Abstract': 0.21969401455243598
Similarity score with Section after LSA'1 Introduction': 0.15099716057227913
Similarity score with Section after LSA'2 Background': 0.0586658554900754
Similarity score with Section after LSA'3 Approach': 0.0
Similarity score with Section after LSA'4 Experiments': 0.09901475429766744
Similarity score with Section after LSA'5 Related Work': 0.0816496580927726
Similarity score with Section after LSA'6 Conclusion': 0.0


In [None]:
# Getting average similarity score of the sections with entire text without LSA applied

# Example text 2
text2 = 'Neural Machine Translation; Attention Mechanism, Language Translation.'

# Calculate similarity scores for each section header
similarity_scores = []
for section_text in section_extraction:
    similarity_score = text_similarity(section_text, text2)
    similarity_scores.append(similarity_score)

# Calculate the average similarity score
average_similarity = sum(similarity_scores) / len(similarity_scores)

print("Average Similarity Score:", average_similarity)

Average Similarity Score: 0.10798429406657754


In [None]:
# Getting average similarity score of section only with top 5 ranked sentences after LSA applied

def text_similarity(text1, text2):
    # Ensure that the input is a string
    text1 = str(text1)
    text2 = str(text2)

    # Tokenize and lemmatize the texts, applying lowercasing to individual words
    tokens1 = [word.lower() for word in sent_tokenize(text1)]
    tokens2 = [word.lower() for word in sent_tokenize(text2)]
    lemmatizer = WordNetLemmatizer()
    tokens1 = [lemmatizer.lemmatize(token) for token in tokens1]
    tokens2 = [lemmatizer.lemmatize(token) for token in tokens2]

    # Remove stopwords
    stop_words = stopwords.words('english')
    tokens1 = [token for token in tokens1 if token not in stop_words]
    tokens2 = [token for token in tokens2 if token not in stop_words]

    # Join tokens into strings
    text1_processed = ' '.join(tokens1)
    text2_processed = ' '.join(tokens2)

    # Create the TF-IDF vectors
    vectorizer = TfidfVectorizer()
    vector1 = vectorizer.fit_transform([text1_processed])
    vector2 = vectorizer.transform([text2_processed])

    # Calculate the cosine similarity (single value for whole texts)
    similarity = cosine_similarity(vector1, vector2)[0][0]

    return similarity

# Example usage:
text2 = 'Neural Machine Translation; Attention Mechanism, Language Translation.'

# Calculate similarity scores for each section header using top ranked sentences
similarity_scores = []
for section, top_sentences in top_ranked_sentences.items():
    section_text = ' '.join(top_sentences)
    similarity_score = text_similarity(section_text, text2)
    similarity_scores.append(similarity_score)

# Calculate the average similarity score
average_similarity = sum(similarity_scores) / len(similarity_scores)

print("Average Similarity Score with Top Ranked Sentences after LSA:", average_similarity)

Average Similarity Score with Top Ranked Sentences after LSA: 0.08714592042931865


In [None]:
!pip install prettytable



In [None]:
# Table for displaying the similarity score of each section with entire text

from prettytable import PrettyTable

# Example usage:
text2 = 'Neural Machine Translation; Attention Mechanism, Language Translation.'

# Calculate similarity with each section header
similarity_scores = {}
for section in sections:
    similarity_score = text_similarity(section_extraction[sections.index(section)], text2)
    similarity_scores[section] = similarity_score

# Display similarity scores in a table
table = PrettyTable()
table.field_names = ["Section with entire pdf text usinf TF-IDF", "Similarity Score"]

for section, score in similarity_scores.items():
    table.add_row([section, f"{score:.6f}"])

# Calculate and add the average similarity score to the table
average_similarity = sum(similarity_scores.values()) / len(similarity_scores)
table.add_row(["Average Similarity", f"{average_similarity:.6f}"])

# Print the table
print(table)

+-------------------------------------------+------------------+
| Section with entire pdf text usinf TF-IDF | Similarity Score |
+-------------------------------------------+------------------+
|                  Abstract                 |     0.219694     |
|               1 Introduction              |     0.167077     |
|                2 Background               |     0.099535     |
|                 3 Approach                |     0.027434     |
|               4 Experiments               |     0.076985     |
|               5 Related Work              |     0.045989     |
|                6 Conclusion               |     0.119176     |
|             Average Similarity            |     0.107984     |
+-------------------------------------------+------------------+


In [None]:
# Table for similarity score of sections with top ranked sentences of LSA

from prettytable import PrettyTable

# Example text 2
text2 = 'Neural Machine Translation; Attention Mechanism, Language Translation.'

# Calculate similarity scores for each section using top-ranked sentences
similarity_scores_top_ranked = {}
for section, top_sentences in top_ranked_sentences.items():
    section_text = ' '.join(top_sentences)
    similarity_score = text_similarity(section_text, text2)
    similarity_scores_top_ranked[section] = similarity_score

# Display similarity scores in a table for top-ranked sentences
table_top_ranked = PrettyTable()
table_top_ranked.field_names = ["Section with LSA for TF-IDF", "Similarity Score"]

for section, score in similarity_scores_top_ranked.items():
    table_top_ranked.add_row([section, f"{score:.6f}"])

# Calculate and add the average similarity score to the table
average_similarity_top_ranked = sum(similarity_scores_top_ranked.values()) / len(similarity_scores_top_ranked)
table_top_ranked.add_row(["Average Similarity (Top Ranked)", f"{average_similarity_top_ranked:.6f}"])

# Print the table for top-ranked sentences
print(table_top_ranked)

+---------------------------------+------------------+
|   Section with LSA for TF-IDF   | Similarity Score |
+---------------------------------+------------------+
|             Abstract            |     0.219694     |
|          1 Introduction         |     0.150997     |
|           2 Background          |     0.058666     |
|            3 Approach           |     0.000000     |
|          4 Experiments          |     0.099015     |
|          5 Related Work         |     0.081650     |
|           6 Conclusion          |     0.000000     |
| Average Similarity (Top Ranked) |     0.087146     |
+---------------------------------+------------------+


### Evaluation Metric for validating the performance of LSA summarization

In [None]:
!pip install nltk py-rouge



In [None]:
!pip install rouge-score



In [None]:
# Evaluation of section summaries without LSA applied and the reference summary has been taken from ChatGPT

from nltk.translate import meteor_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

# Define reference summaries for each section
reference_summaries = {
    'Abstract': "introduces the challenge of input perturbations negatively affecting the intermediate representations and subsequently impacting the translation quality of Neural Machine Translation (NMT) models. The proposed solution in the paper is adversarial stability training, aimed at enhancing the robustness of NMT models. The core concept involves making both the encoder and decoder of NMT models resilient to input perturbations, ensuring similar behavior for both the original input and its perturbed counterpart. The experimental findings, conducted on Chinese-English, English-German, and English-French translation tasks, demonstrate that the proposed approaches not only outperform strong NMT systems but also enhance the overall robustness of NMT models.",
    '1 Introduction': "the progress of Neural Machine Translation (NMT) models in learning representations through a unified neural network. Despite their success, the paper addresses the non-robustness of NMT models, showcasing a critical problem through an example where replacing a word with its synonym leads to significant translation errors. The analogy of the butterfly effect is used to illustrate how small input changes can cause drastic alterations in the output. An investigation involving synonym replacements in a test set demonstrates that 69.74% of translations are altered, and the BLEU score drops to 79.01, emphasizing the models' sensitivity to small input perturbations. This sensitivity limits the applicability of NMT models to tasks requiring robust performance on noisy inputs.",
    '2 Background': "The provided text describes the standard Neural Machine Translation (NMT) framework, emphasizing its end-to-end nature that optimizes the translation probability from a source sentence (x) to a target sentence (y). The model includes an encoder, which converts the source sentence into hidden representations (Hx), and a decoder, generating target words based on these representations. The training objective involves minimizing the negative log-likelihood of the training corpus. However, due to the vulnerability of deep neural networks, small input perturbations significantly impact translation results. To address this, the paper proposes an adversarial stability training approach, aiming to enhance the robustness of encoder representations against noisy perturbations. The architecture involves a discriminator and perturbations to achieve this stability.",
    '3 Approach': "The proposed approach focuses on maintaining consistent behaviors in the NMT model for both the source sentence (x) and its perturbed counterpart (x0). The encoder and decoder are trained to be perturbation-invariant, ensuring that small changes in input do not significantly affect the translation output. The architecture involves constructing perturbed sentences from the source sentence and introducing two additional objectives: Linv encourages the encoder to output similar representations for x and x0, while Lnoisy guides the decoder to generate output y given the noisy input x0. The training objective combines these objectives with the original translation task, promoting both stability and good translation performance.",
    '4 Experiments': "The evaluation of adversarial stability training was conducted on translation tasks for various language pairs, with 4-gram BLEU scores reported. For Chinese-English, the LDC corpus with 1.25M sentence pairs was used, and the NIST datasets served as test sets. English-German utilized the WMT 14 corpus, and English-French employed the IWSLT corpus, collected from TED talks, to assess non-normative text. The baseline system was an in-house NMT model with a two-layer RNN architecture, GRU gating mechanism, layer normalization, and dropout. Adversarial stability training involved lexical-level and feature-level perturbations, denoted as ASTlexical and ASTfeature. ",
    '5 Related Work': "In the realm of adversarial learning, the influence of Generative Adversarial Networks (GAN) and its derivatives has been extensive in computer vision and natural language processing. While prior work has used adversarial examples to attack and defend networks, the proposed adversarial stability training distinctively aims to stabilize both the encoder and decoder in NMT models. This approach utilizes adversarial learning to achieve a perturbation-invariant encoder. In the context of data augmentation, numerous methods have sought to enhance the robustness of NMT models by augmenting training data with monolingual corpora.",
    '6 Conclusion': "The proposed adversarial stability training aims to enhance the robustness of NMT models by training both the encoder and decoder to handle input perturbations consistently. Two approaches for constructing perturbed data are introduced to adversarially train the encoder and stabilize the decoder. Experimental results on Chinese-English, English-German, and English-French translation tasks demonstrate that the proposed approach effectively improves both robustness and translation performance. To broaden the applicability, further evaluations are suggested in the context of natural noise present in practical applications, such as homonyms in simultaneous translation systems."
}
def calculate_scores_for_section(reference, hypothesis):
    reference_tokens = word_tokenize(reference)
    hypothesis_tokens = word_tokenize(hypothesis)

    smoothing_function = SmoothingFunction().method1  # Define smoothing function
    bleu_score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)

    meteor_score_value = meteor_score.meteor_score([reference_tokens], hypothesis_tokens)

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = scorer.score(reference, hypothesis)
    rouge1 = rouge_scores['rouge1'].fmeasure
    rouge2 = rouge_scores['rouge2'].fmeasure
    rougel = rouge_scores['rougeL'].fmeasure

    return bleu_score, meteor_score_value, rouge1, rouge2, rougel

# Initialize generated summary with the content of each section in top-ranked sentences
generated_summary = ""

# Calculate scores for each section using top-ranked sentences
bleu_scores_top_ranked = {}
meteor_scores_top_ranked = {}
rouge1_scores_top_ranked = {}
rouge2_scores_top_ranked = {}
rougel_scores_top_ranked = {}

for section, top_sentences in section_sentences.items():
    section_text = ' '.join(top_sentences)
    generated_summary += section_text + " "  # Include the content of each section

    # Use the corresponding reference summary for each section
    reference_summary_for_section = reference_summaries[section]

    bleu_score, meteor_score_val, rouge1, rouge2, rougel = calculate_scores_for_section(reference_summary_for_section, section_text)

    bleu_scores_top_ranked[section] = bleu_score
    meteor_scores_top_ranked[section] = meteor_score_val
    rouge1_scores_top_ranked[section] = rouge1
    rouge2_scores_top_ranked[section] = rouge2
    rougel_scores_top_ranked[section] = rougel

# Display scores for each section using top-ranked sentences
for section in section_sentences.keys():
    print(f"Scores for Section '{section}' (Sections have the entire text in the generated summary):")
    print("BLEU Score:", bleu_scores_top_ranked[section])
    print("METEOR Score:", meteor_scores_top_ranked[section])
    print("ROUGE-1 F-measure:", rouge1_scores_top_ranked[section])
    print("ROUGE-2 F-measure:", rouge2_scores_top_ranked[section])
    print("ROUGE-L F-measure:", rougel_scores_top_ranked[section])
    print()

# Display the generated summary
print("\nGenerated Summary with Content of Top-Ranked Sentences:")
print(generated_summary)


Scores for Section 'Abstract' (Sections have the entire text in the generated summary):
BLEU Score: 0.24203006556309162
METEOR Score: 0.5629651422531682
ROUGE-1 F-measure: 0.6232558139534884
ROUGE-2 F-measure: 0.36619718309859156
ROUGE-L F-measure: 0.5674418604651164

Scores for Section '1 Introduction' (Sections have the entire text in the generated summary):
BLEU Score: 0.024943677360793858
METEOR Score: 0.28088956765191936
ROUGE-1 F-measure: 0.2035010940919037
ROUGE-2 F-measure: 0.08552631578947367
ROUGE-L F-measure: 0.13566739606126915

Scores for Section '2 Background' (Sections have the entire text in the generated summary):
BLEU Score: 0.07234728215674198
METEOR Score: 0.4060530443860149
ROUGE-1 F-measure: 0.38229376257545267
ROUGE-2 F-measure: 0.1898989898989899
ROUGE-L F-measure: 0.2776659959758551

Scores for Section '3 Approach' (Sections have the entire text in the generated summary):
BLEU Score: 0.021352657188482067
METEOR Score: 0.20770392749244715
ROUGE-1 F-measure: 0.13

In [None]:
# Evaluation of LSA applied summaries with reference summary of ChatGPT

# Define reference summaries for each section
reference_summaries = {
    'Abstract': "introduces the challenge of input perturbations negatively affecting the intermediate representations and subsequently impacting the translation quality of Neural Machine Translation (NMT) models. The proposed solution in the paper is adversarial stability training, aimed at enhancing the robustness of NMT models. The core concept involves making both the encoder and decoder of NMT models resilient to input perturbations, ensuring similar behavior for both the original input and its perturbed counterpart. The experimental findings, conducted on Chinese-English, English-German, and English-French translation tasks, demonstrate that the proposed approaches not only outperform strong NMT systems but also enhance the overall robustness of NMT models.",
    '1 Introduction': "the progress of Neural Machine Translation (NMT) models in learning representations through a unified neural network. Despite their success, the paper addresses the non-robustness of NMT models, showcasing a critical problem through an example where replacing a word with its synonym leads to significant translation errors. The analogy of the butterfly effect is used to illustrate how small input changes can cause drastic alterations in the output. An investigation involving synonym replacements in a test set demonstrates that 69.74% of translations are altered, and the BLEU score drops to 79.01, emphasizing the models' sensitivity to small input perturbations. This sensitivity limits the applicability of NMT models to tasks requiring robust performance on noisy inputs.",
    '2 Background': "The provided text describes the standard Neural Machine Translation (NMT) framework, emphasizing its end-to-end nature that optimizes the translation probability from a source sentence (x) to a target sentence (y). The model includes an encoder, which converts the source sentence into hidden representations (Hx), and a decoder, generating target words based on these representations. The training objective involves minimizing the negative log-likelihood of the training corpus. However, due to the vulnerability of deep neural networks, small input perturbations significantly impact translation results. To address this, the paper proposes an adversarial stability training approach, aiming to enhance the robustness of encoder representations against noisy perturbations. The architecture involves a discriminator and perturbations to achieve this stability.",
    '3 Approach': "The proposed approach focuses on maintaining consistent behaviors in the NMT model for both the source sentence (x) and its perturbed counterpart (x0). The encoder and decoder are trained to be perturbation-invariant, ensuring that small changes in input do not significantly affect the translation output. The architecture involves constructing perturbed sentences from the source sentence and introducing two additional objectives: Linv encourages the encoder to output similar representations for x and x0, while Lnoisy guides the decoder to generate output y given the noisy input x0. The training objective combines these objectives with the original translation task, promoting both stability and good translation performance.",
    '4 Experiments': "The evaluation of adversarial stability training was conducted on translation tasks for various language pairs, with 4-gram BLEU scores reported. For Chinese-English, the LDC corpus with 1.25M sentence pairs was used, and the NIST datasets served as test sets. English-German utilized the WMT 14 corpus, and English-French employed the IWSLT corpus, collected from TED talks, to assess non-normative text. The baseline system was an in-house NMT model with a two-layer RNN architecture, GRU gating mechanism, layer normalization, and dropout. Adversarial stability training involved lexical-level and feature-level perturbations, denoted as ASTlexical and ASTfeature. ",
    '5 Related Work': "In the realm of adversarial learning, the influence of Generative Adversarial Networks (GAN) and its derivatives has been extensive in computer vision and natural language processing. While prior work has used adversarial examples to attack and defend networks, the proposed adversarial stability training distinctively aims to stabilize both the encoder and decoder in NMT models. This approach utilizes adversarial learning to achieve a perturbation-invariant encoder. In the context of data augmentation, numerous methods have sought to enhance the robustness of NMT models by augmenting training data with monolingual corpora.",
    '6 Conclusion': "The proposed adversarial stability training aims to enhance the robustness of NMT models by training both the encoder and decoder to handle input perturbations consistently. Two approaches for constructing perturbed data are introduced to adversarially train the encoder and stabilize the decoder. Experimental results on Chinese-English, English-German, and English-French translation tasks demonstrate that the proposed approach effectively improves both robustness and translation performance. To broaden the applicability, further evaluations are suggested in the context of natural noise present in practical applications, such as homonyms in simultaneous translation systems."
}

# Function to calculate scores for a section
def calculate_scores_for_section(reference, hypothesis):
    reference_tokens = word_tokenize(reference)
    hypothesis_tokens = word_tokenize(hypothesis)

    smoothing_function = SmoothingFunction().method1  # Define smoothing function
    bleu_score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
    meteor_score_val = meteor_score.meteor_score([reference_tokens], hypothesis_tokens)

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = scorer.score(reference, hypothesis)
    rouge1 = rouge_scores['rouge1'].fmeasure
    rouge2 = rouge_scores['rouge2'].fmeasure
    rougel = rouge_scores['rougeL'].fmeasure

    return bleu_score, meteor_score_val, rouge1, rouge2, rougel

# Example text 2
generated_summary = top_ranked_sentences  # Use top-ranked sentences directly

# Calculate scores for each section using top-ranked sentences
scores_top_ranked = {}

for section, sentences in generated_summary.items():
    section_text = ' '.join(sentences)

    # Use the corresponding reference summary for each section
    reference_summary_for_section = reference_summaries[section]

    bleu_score, meteor_score_val, rouge1, rouge2, rougel = calculate_scores_for_section(reference_summary_for_section, section_text)

    scores_top_ranked[section] = {
        'BLEU Score': bleu_score,
        'METEOR Score': meteor_score_val,
        'ROUGE-1 F-measure': rouge1,
        'ROUGE-2 F-measure': rouge2,
        'ROUGE-L F-measure': rougel
    }

# Display scores for each section using top-ranked sentences
for section, scores in scores_top_ranked.items():
    print(f"Scores for Section '{section}' (LSA has been applied on the generated summary):")
    for metric, value in scores.items():
        print(f"{metric}: {value}")
    print()

Scores for Section 'Abstract' (LSA has been applied on the generated summary):
BLEU Score: 0.24033150402998318
METEOR Score: 0.43259591601015546
ROUGE-1 F-measure: 0.6232558139534884
ROUGE-2 F-measure: 0.36619718309859156
ROUGE-L F-measure: 0.40930232558139534

Scores for Section '1 Introduction' (LSA has been applied on the generated summary):
BLEU Score: 0.05328448880095941
METEOR Score: 0.3161929322413354
ROUGE-1 F-measure: 0.48484848484848486
ROUGE-2 F-measure: 0.17557251908396948
ROUGE-L F-measure: 0.19696969696969696

Scores for Section '2 Background' (LSA has been applied on the generated summary):
BLEU Score: 0.0911142939497208
METEOR Score: 0.33127475792253525
ROUGE-1 F-measure: 0.4285714285714286
ROUGE-2 F-measure: 0.14388489208633093
ROUGE-L F-measure: 0.2571428571428572

Scores for Section '3 Approach' (LSA has been applied on the generated summary):
BLEU Score: 0.013076745800729542
METEOR Score: 0.13035241279131865
ROUGE-1 F-measure: 0.23170731707317072
ROUGE-2 F-measure: 

##### GLOVE Embedding

In [None]:
!pip install -U spacy
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
     ---------------------------------------- 0.0/42.8 MB ? eta -:--:--
     ---------------------------------------- 0.1/42.8 MB 2.3 MB/s eta 0:00:19
     ---------------------------------------- 0.3/42.8 MB 3.7 MB/s eta 0:00:12
     ---------------------------------------- 0.4/42.8 MB 3.4 MB/s eta 0:00:13
      --------------------------------------- 0.6/42.8 MB 3.5 MB/s eta 0:00:13
      --------------------------------------- 0.7/42.8 MB 3.2 MB/s eta 0:00:14
     - -------------------------------------- 1.4/42.8 MB 5.0 MB/s eta 0:00:09
     - -------------------------------------- 1.5/42.8 MB 4.8 MB/s eta 0:00:09
     - -------------------------------------- 1.7/42.8 MB 4.6 MB/s eta 0:00:09
     - -------------------------------------- 2.1/42.8 MB 5.0 MB/s eta 0:00:09
     -- ---------------------------------

### Similarity scores using GLOVE embeddings

In [None]:
# Using Glove for sections with entire text( Query is being matched with the sections but the sections have all the text)

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

# Load the pre-trained model with GloVe vectors
nlp = spacy.load("en_core_web_md")

def text_similarity_with_glove(text1, text2):
    # Ensure that the input is a string
    text1 = str(text1)
    text2 = str(text2)

    # Tokenize and lemmatize the texts, applying lowercasing to individual words
    tokens1 = [word.lower() for word in sent_tokenize(text1)]
    tokens2 = [word.lower() for word in sent_tokenize(text2)]
    lemmatizer = WordNetLemmatizer()
    tokens1 = [lemmatizer.lemmatize(token) for token in tokens1]
    tokens2 = [lemmatizer.lemmatize(token) for token in tokens2]

    # Remove stopwords
    stop_words = stopwords.words('english')
    tokens1 = [token for token in tokens1 if token not in stop_words]
    tokens2 = [token for token in tokens2 if token not in stop_words]

    # Join tokens into strings
    text1_processed = ' '.join(tokens1)
    text2_processed = ' '.join(tokens2)

    # Use spacy to get GloVe vectors for the processed texts
    vector1 = nlp(text1_processed).vector
    vector2 = nlp(text2_processed).vector

    # Calculate the cosine similarity
    similarity = cosine_similarity([vector1], [vector2])[0][0]

    return similarity

# Example usage:
text2 = 'Neural Machine Translation; Attention Mechanism, Language Translation.'

# Calculate similarity with each section header
similarity_scores_glove = {}
for section in sections:
    similarity_score = text_similarity_with_glove(section_extraction[sections.index(section)], text2)
    similarity_scores_glove[section] = similarity_score

# Display similarity scores
for section, score in similarity_scores_glove.items():
    print(f"Similarity score with Section with entire text and Glove '{section}': {score}")

Similarity score with Section with entire text and Glove 'Abstract': 0.8609182238578796
Similarity score with Section with entire text and Glove '1 Introduction': 0.8844263553619385
Similarity score with Section with entire text and Glove '2 Background': 0.8665162920951843
Similarity score with Section with entire text and Glove '3 Approach': 0.8512067794799805
Similarity score with Section with entire text and Glove '4 Experiments': 0.870236873626709
Similarity score with Section with entire text and Glove '5 Related Work': 0.8806079030036926
Similarity score with Section with entire text and Glove '6 Conclusion': 0.8750537633895874


In [None]:
# Using Glove with only top ranked sentences(Matching the query to the sections and LSA has been applied)

# Load the pre-trained model with GloVe vectors
nlp = spacy.load("en_core_web_md")

def text_similarity_with_glove(text1, text2):
    # Ensure that the input is a string
    text1 = str(text1)
    text2 = str(text2)

    # Tokenize and lemmatize the texts, applying lowercasing to individual words
    tokens1 = [word.lower() for word in sent_tokenize(text1)]
    tokens2 = [word.lower() for word in sent_tokenize(text2)]
    lemmatizer = WordNetLemmatizer()
    tokens1 = [lemmatizer.lemmatize(token) for token in tokens1]
    tokens2 = [lemmatizer.lemmatize(token) for token in tokens2]

    # Remove stopwords
    stop_words = stopwords.words('english')
    tokens1 = [token for token in tokens1 if token not in stop_words]
    tokens2 = [token for token in tokens2 if token not in stop_words]

    # Join tokens into strings
    text1_processed = ' '.join(tokens1)
    text2_processed = ' '.join(tokens2)

    # Use spacy to get GloVe vectors for the processed texts
    vector1 = nlp(text1_processed).vector
    vector2 = nlp(text2_processed).vector

    # Calculate the cosine similarity
    similarity = cosine_similarity([vector1], [vector2])[0][0]

    return similarity

# Example usage:
text2 = 'Neural Machine Translation; Attention Mechanism, Language Translation.'

# Calculate similarity with each section header
similarity_scores_glove = {}
for section, top_sentence in zip(sections, top_ranked_sentences):
    similarity_score = text_similarity_with_glove(top_sentence, text2)
    similarity_scores_glove[section] = similarity_score

# Display similarity scores
for section, score in similarity_scores_glove.items():
    print(f"Similarity score with Section using LSA and Glove '{section}': {score}")

Similarity score with Section using LSA and Glove 'Abstract': 0.5400400757789612
Similarity score with Section using LSA and Glove '1 Introduction': 0.24933794140815735
Similarity score with Section using LSA and Glove '2 Background': 0.20433813333511353
Similarity score with Section using LSA and Glove '3 Approach': 0.2322176843881607
Similarity score with Section using LSA and Glove '4 Experiments': 0.2505641579627991
Similarity score with Section using LSA and Glove '5 Related Work': 0.42708203196525574
Similarity score with Section using LSA and Glove '6 Conclusion': 0.1734970659017563


### Measuring Similarity Scores of user query with Title of pdf

##### GLOVE embedding

In [None]:
import spacy

# Load the spaCy model with GloVe vectors
nlp = spacy.load("en_core_web_md")

# Your text2
text2 = 'Neural Machine Translation; Attention Mechanism, Language Translation.'

# Function to calculate similarity score between title and text2 using GloVe vectors
def glove_similarity(title, text2):
    # Ensure that the input is a string
    title = str(title)
    text2 = str(text2)

    # Process the texts using spaCy
    doc_title = nlp(title)
    doc_text2 = nlp(text2)

    # Calculate the similarity between doc_title and doc_text2
    similarity = doc_title.similarity(doc_text2)

    return similarity

# Calculate similarity between clean_title and text2
similarity_score = glove_similarity(clean_title, text2)

# Display similarity score
print(f"Similarity score with Title '{clean_title}' using GloVe: {similarity_score}")

Similarity score with Title 'towards robust neural machine translation yong cheng zhaopeng tu fandong meng junjie zhai yang liuy' using GloVe: 0.4928518715080658


##### TF-IDF embedding

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_md")

# Your text2
text2 = 'Neural Machine Translation; Attention Mechanism, Language Translation.'

# Function to calculate similarity score between title and text2 using TF-IDF
def tfidf_similarity(title, text2):
    # Ensure that the input is a string
    title = str(title)
    text2 = str(text2)

    # Process the texts using spaCy
    doc_title = nlp(title)
    doc_text2 = nlp(text2)

    # Create the TF-IDF vectors
    vectorizer = TfidfVectorizer()
    vector_title = vectorizer.fit_transform([title])
    vector_text2 = vectorizer.transform([text2])

    # Calculate the cosine similarity
    similarity = cosine_similarity(vector_title, vector_text2)[0][0]

    return similarity

# Calculate similarity between clean_title and text2 using TF-IDF
similarity_score_tfidf = tfidf_similarity(clean_title, text2)

# Display similarity score
print(f"Similarity score with Title '{clean_title}' using TF-IDF: {similarity_score_tfidf}")

Similarity score with Title 'towards robust neural machine translation yong cheng zhaopeng tu fandong meng junjie zhai yang liuy' using TF-IDF: 0.421637021355784


### Using BART for Summarization

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration
import spacy

# Load spaCy English model with word embeddings
nlp = spacy.load('en_core_web_md')

# Load the model and tokenizer
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

def extract_sentences_from_sections(sections):
    section_sentences = {}

    for idx, text in enumerate(sections):
        # Use spaCy to extract sentences
        doc = nlp(text)

        # Extract individual sentences
        sentences = [sent.text for sent in doc.sents]

        # Store sentences in the dictionary
        section_sentences[f"Section {idx + 1}"] = sentences

    return section_sentences

def extract_section_name(section_text):
    section_name = section_text.split('\n')[0].strip()
    return section_name

def summarize_sentences(sentences, max_summary_length=1000):
    # Concatenate selected sentences
    concatenated_text = ' '.join(sentences)

    # Encode and generate summary
    inputs = tokenizer.encode("summarize: " + concatenated_text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs, max_length=max_summary_length, min_length=int(max_summary_length/5),
                                 length_penalty=10.0, num_beams=4, early_stopping=True)

    # Decode and return the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def select_top_sentences(section_sentences, num_top_sentences=5):
    # Sort sentences by some criterion (e.g., semantic similarity, importance, etc.)
    # Here, we simply sort by length for demonstration purposes
    sorted_sentences = sorted(section_sentences, key=len, reverse=True)

    # Select the top sentences
    top_sentences = sorted_sentences[:num_top_sentences]

    return top_sentences

# Extract sentences from sections
section_sentences = extract_sentences_from_sections(section_extraction)

# Generate summaries for each section based on semantic similarity
section_summaries = {}
for section, sentences in section_sentences.items():
    # Extract the section name using the heuristic
    section_name = extract_section_name('\n'.join(sentences))

    # Select the top sentences based on some criterion
    top_sentences = select_top_sentences(sentences, num_top_sentences=5)

    # Generate summary for the selected sentences
    section_summary = summarize_sentences(top_sentences)

    # Store the section summary
    section_summaries[section_name] = section_summary

    # Print the section summary with the section name
    print(f"Summary for {section_name}:\n{section_summary}\n")


  from .autonotebook import tqdm as notebook_tqdm



Summary for Abstract:
Small perturbations in the input canseverely distort intermediate representa-tions and thus impact translation quality. We propose to improve the robustness of NMT models with adver-                sarial stability training. The basic idea is to make both the encoder and decoder in                NMT models robust against input pertur-                bations by enabling them to behave sim-                ilarly for the original input and its per-                turbed counterpart. We present our results on Chinese-English, English-German and English-French translation tasks in the next issue of the Journal of Machine Translation (JMT) (http://www.jtimes.com/2013/01/29/science-translating-tasks-in-chinese-English-German-and-french.html#storylink=cpy. Back to the page you came from. Click here to read the rest of the article. The first part of this article was published in the journal of JMT (January 2013).

Summary for 1 Introduction:
Neural machine translation (NM

In [None]:
# Define your query
query = "Neural Machine Translation; Attention Mechanism, Language Translation."

# Calculate similarity with each section header
similarity_scores = {}
for section, summary in section_summaries.items():
    # Calculate similarity score between the summary and the query
    similarity_score = text_similarity(summary, query)

    # Store the similarity score for each section
    similarity_scores[section] = similarity_score

# Display similarity scores
for section, score in similarity_scores.items():
    print(f"Similarity score with '{section}' using summaries from BART: {score}")


Similarity score with 'Abstract' using summaries from BART: 0.15517288448251548
Similarity score with '1 Introduction' using summaries from BART: 0.15341141561162125
Similarity score with '2 Background' using summaries from BART: 0.11785113019775796
Similarity score with '3 Approach' using summaries from BART: 0.060858061945018464
Similarity score with '4 Experiments' using summaries from BART: 0.06274558051381587
Similarity score with '5 Related Work' using summaries from BART: 0.0897122608032513
Similarity score with '6 Conclusion' using summaries from BART: 0.05063696835418335


In [None]:
# Calculate the average similarity score
total_similarity_score = sum(similarity_scores.values())
average_similarity_score = total_similarity_score / len(similarity_scores)

print(f"Average similarity score for the query '{query}': {average_similarity_score}")

Average similarity score for the query 'Neural Machine Translation; Attention Mechanism, Language Translation.': 0.09862690027259481


In [None]:
# Calculate the average similarity score for BART summaries
average_similarity_bart = sum(similarity_scores.values()) / len(similarity_scores)

# Table for similarity score of sections with top ranked sentences
table_combined = PrettyTable()
table_combined.field_names = ["Section", "Similarity Score (BART)"]

# Add similarity scores from BART summaries to the table
for section, score in similarity_scores.items():
    table_combined.add_row([section, f"{score:.6f}"])

# Add the average similarity score for BART summaries to the table
table_combined.add_row(["Average Similarity (BART)", f"{average_similarity_bart:.6f}"])

# Print the table
print(table_combined)

+---------------------------+-------------------------+
|          Section          | Similarity Score (BART) |
+---------------------------+-------------------------+
|          Abstract         |         0.155173        |
|       1 Introduction      |         0.153411        |
|        2 Background       |         0.117851        |
|         3 Approach        |         0.060858        |
|       4 Experiments       |         0.062746        |
|       5 Related Work      |         0.089712        |
|        6 Conclusion       |         0.050637        |
| Average Similarity (BART) |         0.098627        |
+---------------------------+-------------------------+


In [None]:
# Define reference summaries for each section
reference_summaries = {
    'Abstract': "introduces the challenge of input perturbations negatively affecting the intermediate representations and subsequently impacting the translation quality of Neural Machine Translation (NMT) models. The proposed solution in the paper is adversarial stability training, aimed at enhancing the robustness of NMT models. The core concept involves making both the encoder and decoder of NMT models resilient to input perturbations, ensuring similar behavior for both the original input and its perturbed counterpart. The experimental findings, conducted on Chinese-English, English-German, and English-French translation tasks, demonstrate that the proposed approaches not only outperform strong NMT systems but also enhance the overall robustness of NMT models.",
    '1 Introduction': "the progress of Neural Machine Translation (NMT) models in learning representations through a unified neural network. Despite their success, the paper addresses the non-robustness of NMT models, showcasing a critical problem through an example where replacing a word with its synonym leads to significant translation errors. The analogy of the butterfly effect is used to illustrate how small input changes can cause drastic alterations in the output. An investigation involving synonym replacements in a test set demonstrates that 69.74% of translations are altered, and the BLEU score drops to 79.01, emphasizing the models' sensitivity to small input perturbations. This sensitivity limits the applicability of NMT models to tasks requiring robust performance on noisy inputs.",
    '2 Background': "The provided text describes the standard Neural Machine Translation (NMT) framework, emphasizing its end-to-end nature that optimizes the translation probability from a source sentence (x) to a target sentence (y). The model includes an encoder, which converts the source sentence into hidden representations (Hx), and a decoder, generating target words based on these representations. The training objective involves minimizing the negative log-likelihood of the training corpus. However, due to the vulnerability of deep neural networks, small input perturbations significantly impact translation results. To address this, the paper proposes an adversarial stability training approach, aiming to enhance the robustness of encoder representations against noisy perturbations. The architecture involves a discriminator and perturbations to achieve this stability.",
    '3 Approach': "The proposed approach focuses on maintaining consistent behaviors in the NMT model for both the source sentence (x) and its perturbed counterpart (x0). The encoder and decoder are trained to be perturbation-invariant, ensuring that small changes in input do not significantly affect the translation output. The architecture involves constructing perturbed sentences from the source sentence and introducing two additional objectives: Linv encourages the encoder to output similar representations for x and x0, while Lnoisy guides the decoder to generate output y given the noisy input x0. The training objective combines these objectives with the original translation task, promoting both stability and good translation performance.",
    '4 Experiments': "The evaluation of adversarial stability training was conducted on translation tasks for various language pairs, with 4-gram BLEU scores reported. For Chinese-English, the LDC corpus with 1.25M sentence pairs was used, and the NIST datasets served as test sets. English-German utilized the WMT 14 corpus, and English-French employed the IWSLT corpus, collected from TED talks, to assess non-normative text. The baseline system was an in-house NMT model with a two-layer RNN architecture, GRU gating mechanism, layer normalization, and dropout. Adversarial stability training involved lexical-level and feature-level perturbations, denoted as ASTlexical and ASTfeature. ",
    '5 Related Work': "In the realm of adversarial learning, the influence of Generative Adversarial Networks (GAN) and its derivatives has been extensive in computer vision and natural language processing. While prior work has used adversarial examples to attack and defend networks, the proposed adversarial stability training distinctively aims to stabilize both the encoder and decoder in NMT models. This approach utilizes adversarial learning to achieve a perturbation-invariant encoder. In the context of data augmentation, numerous methods have sought to enhance the robustness of NMT models by augmenting training data with monolingual corpora.",
    '6 Conclusion': "The proposed adversarial stability training aims to enhance the robustness of NMT models by training both the encoder and decoder to handle input perturbations consistently. Two approaches for constructing perturbed data are introduced to adversarially train the encoder and stabilize the decoder. Experimental results on Chinese-English, English-German, and English-French translation tasks demonstrate that the proposed approach effectively improves both robustness and translation performance. To broaden the applicability, further evaluations are suggested in the context of natural noise present in practical applications, such as homonyms in simultaneous translation systems."
}

# Update reference_summaries keys to match the format in generated_summaries_bart
updated_reference_summaries = {f"Section {idx + 1}": summary for idx, summary in enumerate(reference_summaries.values())}

# Function to calculate scores for a section
def calculate_scores_for_section(reference, hypothesis):
    reference_tokens = word_tokenize(reference)
    hypothesis_tokens = word_tokenize(hypothesis)

    smoothing_function = SmoothingFunction().method1  # Define smoothing function
    bleu_score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
    meteor_score_val = meteor_score.meteor_score([reference_tokens], hypothesis_tokens)

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = scorer.score(reference, hypothesis)
    rouge1 = rouge_scores['rouge1'].fmeasure
    rouge2 = rouge_scores['rouge2'].fmeasure
    rougel = rouge_scores['rougeL'].fmeasure

    return bleu_score, meteor_score_val, rouge1, rouge2, rougel

# Example text 2
generated_summary = section_summaries  # Use section summaries generated by BART

# Calculate scores for each section using BART-generated summaries
scores_bart_generated = {}

for section, summary in generated_summary.items():
    section_text = summary  # Use the summary directly

    # Use the corresponding reference summary for each section
    reference_summary_for_section = reference_summaries[section]

    bleu_score, meteor_score_val, rouge1, rouge2, rougel = calculate_scores_for_section(reference_summary_for_section, section_text)

    scores_bart_generated[section] = {
        'BLEU Score': bleu_score,
        'METEOR Score': meteor_score_val,
        'ROUGE-1 F-measure': rouge1,
        'ROUGE-2 F-measure': rouge2,
        'ROUGE-L F-measure': rougel
    }

# Display scores for each section using BART-generated summaries
for section, scores in scores_bart_generated.items():
    print(f"Scores for Section '{section}' (BART-Generated Summaries):")
    for metric, value in scores.items():
        print(f"{metric}: {value}")
    print()


Scores for Section 'Abstract' (BART-Generated Summaries):
BLEU Score: 0.1551165129197201
METEOR Score: 0.39366334394069474
ROUGE-1 F-measure: 0.49382716049382713
ROUGE-2 F-measure: 0.22406639004149378
ROUGE-L F-measure: 0.35390946502057613

Scores for Section '1 Introduction' (BART-Generated Summaries):
BLEU Score: 0.061805111645554335
METEOR Score: 0.2881513286726028
ROUGE-1 F-measure: 0.4075471698113208
ROUGE-2 F-measure: 0.12927756653992395
ROUGE-L F-measure: 0.23396226415094337

Scores for Section '2 Background' (BART-Generated Summaries):
BLEU Score: 0.10118466532025051
METEOR Score: 0.3021960805378741
ROUGE-1 F-measure: 0.435483870967742
ROUGE-2 F-measure: 0.13821138211382114
ROUGE-L F-measure: 0.23387096774193553

Scores for Section '3 Approach' (BART-Generated Summaries):
BLEU Score: 0.03210590969309506
METEOR Score: 0.303743963127016
ROUGE-1 F-measure: 0.4152542372881356
ROUGE-2 F-measure: 0.13675213675213677
ROUGE-L F-measure: 0.1864406779661017

Scores for Section '4 Experim

In [None]:
# Calculate average similarity score for top-ranked sentences of LSA
average_similarity_top_ranked = sum(similarity_scores_top_ranked.values()) / len(similarity_scores_top_ranked)

# Table for similarity score of sections with top ranked sentences of LSA
table_top_ranked = PrettyTable()
table_top_ranked.field_names = ["Section", "Similarity Score"]

for section, score in similarity_scores_top_ranked.items():
    table_top_ranked.add_row([section, f"{score:.6f}"])

# Add average similarity score for top-ranked sentences of LSA to the table
table_top_ranked.add_row(["Average Similarity (LSA)", f"{average_similarity_top_ranked:.6f}"])

# Calculate average similarity score for BART summaries
average_similarity_bart = sum(similarity_scores.values()) / len(similarity_scores)

# Table for similarity score of sections with BART summaries
table_combined = PrettyTable()
table_combined.field_names = ["Section", "Similarity Score (BART)"]

# Add similarity scores from BART summaries to the table
for section, score in similarity_scores.items():
    table_combined.add_row([section, f"{score:.6f}"])

# Add the average similarity score for BART summaries to the table
table_combined.add_row(["Average Similarity (BART)", f"{average_similarity_bart:.6f}"])

# Print the tables
print(table_top_ranked)
print("\n")  # Separate the tables with a newline
print(table_combined)

# Write tables to CSV file
with open('lsa_scores.csv', 'w') as file:
    file.write(str(table_top_ranked))
    file.write("\n\n")  # Add a newline between the tables
    file.write(str(table_combined))


+--------------------------+------------------+
|         Section          | Similarity Score |
+--------------------------+------------------+
|         Abstract         |     0.219694     |
|      1 Introduction      |     0.150997     |
|       2 Background       |     0.058666     |
|        3 Approach        |     0.000000     |
|      4 Experiments       |     0.099015     |
|      5 Related Work      |     0.081650     |
|       6 Conclusion       |     0.000000     |
| Average Similarity (LSA) |     0.087146     |
+--------------------------+------------------+


+---------------------------+-------------------------+
|          Section          | Similarity Score (BART) |
+---------------------------+-------------------------+
|          Abstract         |         0.155173        |
|       1 Introduction      |         0.153411        |
|        2 Background       |         0.117851        |
|         3 Approach        |         0.060858        |
|       4 Experiments       | 

### Converting the similarity output into a Python package, Convert and show into CSV

In [None]:
import csv

# Save tables to CSV file
csv_file_path = 'output.csv'
with open(csv_file_path, 'w', newline='') as file:
    writer = csv.writer(file)

    # Write the top-ranked sentences table
    writer.writerow(['Table for LSA'])
    writer.writerow(['Section', 'Similarity Score(LSA)'])
    writer.writerow(['Title',f"{similarity_score_tfidf:.6f}"])
    for section, score in similarity_scores_top_ranked.items():
        writer.writerow([section, f"{score:.6f}"])
    writer.writerow(["Average Similarity (LSA)", f"{average_similarity_top_ranked:.6f}"])
    writer.writerow([])

    # Write the BART summaries table
    writer.writerow(['Table for BART summaries'])
    writer.writerow(['Section', 'Similarity Score (BART)'])
    for section, score in similarity_scores.items():
        writer.writerow([section, f"{score:.6f}"])
    writer.writerow(["Average Similarity (BART)", f"{average_similarity_bart:.6f}"])