### Extraction of title from pdf

In [35]:
# importing necessary libraries and packages
import fitz
import re
import nltk
import string
import PyPDF2

In [36]:
# Input pdf path
pdf_path = 'satellite_imagery.pdf'

In [37]:
def extract_title(pdf_file):
    with open(pdf_file, 'rb') as pdf_file_obj:
        pdf_reader = PyPDF2.PdfReader(pdf_file_obj)

        # Attempt to extract the title from the first line of the text
        pdf_text = pdf_reader.pages[0].extract_text()
        line = pdf_text.split('\n')[:2]
        first_line = ' '.join(line).strip()

        # If the first line is not empty, use it as the title
        if first_line:
            pdf_title = first_line
        elif pdf_title == '':
            # If the first line is empty, use the PDF file name
            pdf_title = pdf_file.split('/')[-1]
        else:
            document_info = pdf_reader.metadata
            pdf_title = document_info.get('/Title')

        return pdf_title

In [38]:
title = extract_title(pdf_path)

In [39]:
title, type(title)

('Using Convolutional Networks and Satellite Imagery to Identify Pa/t_terns in Urban Environments at a Large Scale',
 str)

In [40]:
# Getting the details of fontname and fontsize attribute for each line within pdfs

import fitz

def scrape(filePath):
    results = [] # list of tuples that store the information as (text, font size, font name)
    pdf = fitz.open(filePath) # filePath is a string that contains the path to the pdf
    for page in pdf:
        dict = page.get_text("dict")
        blocks = dict["blocks"]
        for block in blocks:
            if "lines" in block.keys():
                spans = block['lines']
                for span in spans:
                    data = span['spans']
                    for lines in data:
                            results.append((lines['text'], lines['size'], lines['font']))
                            # lines['text'] -> string, lines['size'] -> font size, lines['font'] -> font name
    pdf.close()
    return results

In [41]:
output = scrape(pdf_path)
output

[('Using Convolutional Networks and Satellite Imagery to Identify',
  16.752822875976562,
  'LinBiolinumTB'),
 ('Paterns in Urban Environments at a Large Scale',
  16.752822875976562,
  'LinBiolinumTB'),
 ('Adrian Albert', 11.633963584899902, 'LinLibertineT'),
 ('∗', 8.53152847290039, 'LinLibertineT'),
 ('Massachusets Institute of Technology', 9.597465515136719, 'LinLibertineT'),
 ('Civil and Environmental Engineering', 9.690055847167969, 'LinLibertineT'),
 ('77 Massachusets Ave', 9.694904327392578, 'LinLibertineT'),
 ('Cambridge, MA 02139', 9.694904327392578, 'LinLibertineT'),
 ('adalbert@mit.edu', 9.694904327392578, 'LinLibertineT'),
 ('Jasleen Kaur', 11.633963584899902, 'LinLibertineT'),
 ('Philips Lighting Research North', 9.694904327392578, 'LinLibertineT'),
 ('America', 9.694904327392578, 'LinLibertineT'),
 ('2 Canal Park', 9.694904327392578, 'LinLibertineT'),
 ('Cambridge, MA 02141', 9.694904327392578, 'LinLibertineT'),
 ('jasleen.kaur1@philips.com', 9.694904327392578, 'LinLiber

In [42]:
## Validation of correct title extraction
if title is None or title == '':
      new_title = ''
      max_font_size = max(output, key=lambda x: x[1])[1]
      elements_with_max_font = [element for element in output if element[1] == max_font_size]

      print("PDF title is:\n")
      for element in elements_with_max_font:
            new_title += ' ' + element[0]
      print(new_title)
else:
      new_title = title
      print(new_title)

Using Convolutional Networks and Satellite Imagery to Identify Pa/t_terns in Urban Environments at a Large Scale


In [43]:
# packages required for text pre-processing
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords

from nltk.tokenize import sent_tokenize, word_tokenize

stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [44]:
# Tokenization, lower-case conversion and stop-word removal

def preprocess_text(text):

  sentences = sent_tokenize(text)  # Split into sentences
  tokens = [word_tokenize(sentence.lower()) for sentence in sentences]  # Tokenize each sentence
  preprocessed_tokens = []
  for sentence_tokens in tokens:
    filtered_tokens = [token for token in sentence_tokens if token not in stop_words]  # Remove stop words
    preprocessed_tokens.extend(filtered_tokens)
  return preprocessed_tokens

In [45]:
clean_title = preprocess_text(new_title)
print(clean_title)

['using', 'convolutional', 'networks', 'satellite', 'imagery', 'identify', 'pa/t_terns', 'urban', 'environments', 'large', 'scale']


In [46]:
# Removing punctuations from title

import string

def remove_punctuations(text):

  punctuations = string.punctuation + "/_*"

  no_punct_text = " ".join([c for c in text if c not in punctuations])

  return no_punct_text

clean_title = remove_punctuations(clean_title)
print(f"Text without punctuations: {clean_title}")

Text without punctuations: using convolutional networks satellite imagery identify pa/t_terns urban environments large scale


### Extraction of text from other sections of pdf

##### Extracting section headers using fontsize and fontname attributes

In [47]:
# Determining the font size and font name of 'Abstract' using its index position

abstract_index = next(i for i, item in enumerate(output) if item[0] == "Abstract " or item[0] == 'ABSTRACT ' or item[0] == "Abstract" or item[0] == 'ABSTRACT'  or item[0] == "Abstract: ")

font_size = output[abstract_index][1]
font_name = output[abstract_index][2]
print(font_size, font_name)

10.615971565246582 LinLibertineTB


In [48]:
# Filtering out only those items which have similar properties as 'Abstract' and considering them as other section headers

filtered_items = [item for item in output if item[1] == font_size and item[2] == font_name]
filtered_items = filtered_items[:-1] # Excluding references from headers
filtered_items

[('ABSTRACT', 10.615971565246582, 'LinLibertineTB'),
 ('CCS CONCEPTS', 10.615971565246582, 'LinLibertineTB'),
 ('KEYWORDS', 10.615971565246582, 'LinLibertineTB'),
 ('1', 10.615971565246582, 'LinLibertineTB'),
 ('INTRODUCTION', 10.615971565246582, 'LinLibertineTB'),
 ('2', 10.615971565246582, 'LinLibertineTB'),
 ('LITERATURE', 10.615971565246582, 'LinLibertineTB'),
 ('2.1', 10.615971565246582, 'LinLibertineTB'),
 ('Existing land use benchmark datasets',
  10.615971565246582,
  'LinLibertineTB'),
 ('3', 10.615971565246582, 'LinLibertineTB'),
 ('THE URBAN ENVIRONMENTS DATASET', 10.615971565246582, 'LinLibertineTB'),
 ('3.1', 10.615971565246582, 'LinLibertineTB'),
 ('Urban Atlas: a standard in land use analysis',
  10.615971565246582,
  'LinLibertineTB'),
 ('3.2', 10.615971565246582, 'LinLibertineTB'),
 ('Data sampling and acquisition', 10.615971565246582, 'LinLibertineTB'),
 ('4', 10.615971565246582, 'LinLibertineTB'),
 ('EXPERIMENTAL SETUP', 10.615971565246582, 'LinLibertineTB'),
 ('4.1'

In [49]:
# Removing unwanted parts of section headers (For eg. Author names, Acknowledgements, References etc.)

new_abstract_index = next((i for i, item in enumerate(filtered_items) if item[0] == 'Abstract 'or item[0] == 'ABSTRACT'
                           or item[0] == 'Abstract' or item[0] == 'ABSTRACT '), None)

new_items = filtered_items[new_abstract_index:] if new_abstract_index is not None else filtered_items

new_con_index = next((i for i, element in enumerate(new_items) if element[0] == 'CONCLUSIONS' or element[0] == 'CONCLUSIONS '
                      or element[0] == 'Conclusions' or element[0] == 'Conclusions ' or element[0] == 'Conclusion'
                      or element[0] == 'Conclusion '), None)

new_items = new_items[:new_con_index + 1]

new_items


[('ABSTRACT', 10.615971565246582, 'LinLibertineTB'),
 ('CCS CONCEPTS', 10.615971565246582, 'LinLibertineTB'),
 ('KEYWORDS', 10.615971565246582, 'LinLibertineTB'),
 ('1', 10.615971565246582, 'LinLibertineTB'),
 ('INTRODUCTION', 10.615971565246582, 'LinLibertineTB'),
 ('2', 10.615971565246582, 'LinLibertineTB'),
 ('LITERATURE', 10.615971565246582, 'LinLibertineTB'),
 ('2.1', 10.615971565246582, 'LinLibertineTB'),
 ('Existing land use benchmark datasets',
  10.615971565246582,
  'LinLibertineTB'),
 ('3', 10.615971565246582, 'LinLibertineTB'),
 ('THE URBAN ENVIRONMENTS DATASET', 10.615971565246582, 'LinLibertineTB'),
 ('3.1', 10.615971565246582, 'LinLibertineTB'),
 ('Urban Atlas: a standard in land use analysis',
  10.615971565246582,
  'LinLibertineTB'),
 ('3.2', 10.615971565246582, 'LinLibertineTB'),
 ('Data sampling and acquisition', 10.615971565246582, 'LinLibertineTB'),
 ('4', 10.615971565246582, 'LinLibertineTB'),
 ('EXPERIMENTAL SETUP', 10.615971565246582, 'LinLibertineTB'),
 ('4.1'

##### Formatting the headers to match the actual pdf headers

In [50]:
# Formatting the name of the headers to match the pattern given in original pdfs

output = []
current_element = ''

for i, item in enumerate(new_items):
    if i > 0 and item[0].isdigit() and not new_items[i - 1][0].isdigit():

        output.append(current_element.strip())
        current_element = item[0]
    else:
        current_element += ' ' + item[0]

# Add the last element to the output
if current_element:
    output.append(current_element.strip())

print(output)

['ABSTRACT CCS CONCEPTS KEYWORDS', '1 INTRODUCTION', '2 LITERATURE 2.1 Existing land use benchmark datasets', '3 THE URBAN ENVIRONMENTS DATASET 3.1 Urban Atlas: a standard in land use analysis 3.2 Data sampling and acquisition', '4 EXPERIMENTAL SETUP 4.1 Neural network architectures and training 4.2 Comparing urban environments', '5 RESULTS AND DISCUSSION 5.1 Classifcation results 5.2 Comparing urban environments', '6 CONCLUSIONS']


In [51]:
# Removing other subsections falling under 'Abstract' for less complication

new_output = []

if(len(output[0].split(' '))>1):
    words = output[0].split(' ')
    new_output.append(words[0])

else:
    new_output = output

new_output

['ABSTRACT']

In [52]:
# Making a cleaner list of all section headers for individual pdfs

for i in range(1,len(output)):
  if(output != new_output):
    element = output[i].split('.')[0][:-1]
    element_new = ''
    if(output[i].split('.')[0][-1].isdigit()):
        element_new = element
    else:
        element_new = output[i].split('.')[0]

    new_output.append(element_new.rstrip())
  else:
     pass

new_output

['ABSTRACT',
 '1 INTRODUCTION',
 '2 LITERATURE',
 '3 THE URBAN ENVIRONMENTS DATASET',
 '4 EXPERIMENTAL SETUP',
 '5 RESULTS AND DISCUSSION',
 '6 CONCLUSIONS']

### Extract the entire text of pdf

In [53]:
# Parse the entire text from individual pdf

def extract_text_from_pdf(pdf_file_path):
    """Extracts text from a PDF file and returns it as a string."""

    with open(pdf_file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        num_pages = len(pdf_reader.pages)

        full_text = ""
        for page_num in range(num_pages):
            page = pdf_reader.pages[page_num]
            page_text = page.extract_text()
            full_text += page_text

    return full_text

In [54]:
extracted_text = extract_text_from_pdf(pdf_path)
print(extracted_text)

Using Convolutional Networks and Satellite Imagery to Identify
Pa/t_terns in Urban Environments at a Large Scale
Adrian Albert∗
Massachuse/t_ts Institute of Technology
Civil and Environmental Engineering
77 Massachuse/t_ts Ave
Cambridge, MA 02139
adalbert@mit.eduJasleen Kaur
Philips Lighting Research North
America
2 Canal Park
Cambridge, MA 02141
jasleen.kaur1@philips.comMarta C. Gonz ´alez
Massachuse/t_ts Institute of Technology
Civil and Environmental Engineering
77 Massachuse/t_ts Ave
Cambridge, MA 02139
martag@mit.edu
ABSTRACT
Urban planning applications (energy audits, investment, etc.) re-
quire an understanding of built infrastructure and its environment,
i.e., both low-level, physical features (amount of vegetation, build-
ing area and geometry etc.), as well as higher-level concepts such
as land use classes (which encode expert understanding of socio-
economic end uses). /T_his kind of data is expensive and labor-
intensive to obtain, which limits its availability (particularl

In [55]:
# Omitting 'References' section from extracted text
def remove_references(pdf_text):
    references_pattern = re.compile(r'References\s*[\r\n]+.*', re.DOTALL | re.IGNORECASE)

    text_without_references = re.sub(references_pattern, '', pdf_text)

    return text_without_references

In [56]:
extracted_text = remove_references(extracted_text)
extracted_text

'Using Convolutional Networks and Satellite Imagery to Identify\nPa/t_terns in Urban Environments at a Large Scale\nAdrian Albert∗\nMassachuse/t_ts Institute of Technology\nCivil and Environmental Engineering\n77 Massachuse/t_ts Ave\nCambridge, MA 02139\nadalbert@mit.eduJasleen Kaur\nPhilips Lighting Research North\nAmerica\n2 Canal Park\nCambridge, MA 02141\njasleen.kaur1@philips.comMarta C. Gonz ´alez\nMassachuse/t_ts Institute of Technology\nCivil and Environmental Engineering\n77 Massachuse/t_ts Ave\nCambridge, MA 02139\nmartag@mit.edu\nABSTRACT\nUrban planning applications (energy audits, investment, etc.) re-\nquire an understanding of built infrastructure and its environment,\ni.e., both low-level, physical features (amount of vegetation, build-\ning area and geometry etc.), as well as higher-level concepts such\nas land use classes (which encode expert understanding of socio-\neconomic end uses). /T_his kind of data is expensive and labor-\nintensive to obtain, which limits its

In [57]:
# Removing non-ascii characters, urls from extracted text

text_without_non_ascii = re.sub(r"[^\x00-\x7F]", "", extracted_text)
text_without_non_ascii = re.sub(r",.-/:","",text_without_non_ascii)
cleaned_text = re.sub(r"h/t_tps?://[^\s]+","",text_without_non_ascii)

print(cleaned_text)

Using Convolutional Networks and Satellite Imagery to Identify
Pa/t_terns in Urban Environments at a Large Scale
Adrian Albert
Massachuse/t_ts Institute of Technology
Civil and Environmental Engineering
77 Massachuse/t_ts Ave
Cambridge, MA 02139
adalbert@mit.eduJasleen Kaur
Philips Lighting Research North
America
2 Canal Park
Cambridge, MA 02141
jasleen.kaur1@philips.comMarta C. Gonz alez
Massachuse/t_ts Institute of Technology
Civil and Environmental Engineering
77 Massachuse/t_ts Ave
Cambridge, MA 02139
martag@mit.edu
ABSTRACT
Urban planning applications (energy audits, investment, etc.) re-
quire an understanding of built infrastructure and its environment,
i.e., both low-level, physical features (amount of vegetation, build-
ing area and geometry etc.), as well as higher-level concepts such
as land use classes (which encode expert understanding of socio-
economic end uses). /T_his kind of data is expensive and labor-
intensive to obtain, which limits its availability (particularly 

In [58]:
# Copying the header names within a new variable

sections = new_output
len(sections)

7

In [59]:
# Extraction of text under individual sections of pdfs using the header names

section_extraction = []

# Initialize an empty list to store the sections found in the text
updated_sections = []

for i in range(len(sections)-1):
    start_index = cleaned_text.find(sections[i])
    end_index = cleaned_text.find(sections[i+1])

    if start_index != -1 and end_index != -1:
        extraction = cleaned_text[start_index:end_index].strip()
        print("Markers found in the text.",sections[i])
        section_extraction.append(extraction)
        updated_sections.append(sections[i])
    else:
        print("Markers not found in the text.",sections[i])


# Extract the last section separately
last_start_index = cleaned_text.find(sections[-1])
if last_start_index != -1:
    last_extraction = cleaned_text[last_start_index:].strip()
    print("Markers found in the text.", sections[-1])
    section_extraction.append(last_extraction)
    updated_sections.append(sections[-1])
else:
    print("Markers not found in the text.", sections[-1])

# Update the sections list with only the sections found in the text
sections = updated_sections

Markers found in the text. ABSTRACT
Markers found in the text. 1 INTRODUCTION
Markers found in the text. 2 LITERATURE
Markers found in the text. 3 THE URBAN ENVIRONMENTS DATASET
Markers found in the text. 4 EXPERIMENTAL SETUP
Markers found in the text. 5 RESULTS AND DISCUSSION
Markers found in the text. 6 CONCLUSIONS


In [60]:
sections

['ABSTRACT',
 '1 INTRODUCTION',
 '2 LITERATURE',
 '3 THE URBAN ENVIRONMENTS DATASET',
 '4 EXPERIMENTAL SETUP',
 '5 RESULTS AND DISCUSSION',
 '6 CONCLUSIONS']

In [61]:
section_extraction

['ABSTRACT\nUrban planning applications (energy audits, investment, etc.) re-\nquire an understanding of built infrastructure and its environment,\ni.e., both low-level, physical features (amount of vegetation, build-\ning area and geometry etc.), as well as higher-level concepts such\nas land use classes (which encode expert understanding of socio-\neconomic end uses). /T_his kind of data is expensive and labor-\nintensive to obtain, which limits its availability (particularly in\ndeveloping countries). We analyze pa/t_terns in land use in urban\nneighborhoods using large-scale satellite imagery data (which is\navailable worldwide from third-party providers) and state-of-the-\nart computer vision techniques based on deep convolutional neural\nnetworks. For supervision, given the limited availability of standard\nbenchmarks for remote-sensing data, we obtain ground truth land\nuse class labels carefully sampled from open-source surveys, in\nparticular the Urban Atlas land classi/f_icat

In [62]:
# Tokenization, Stop word removal from extracted pdf text
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(filtered_text)

# Process each elements in section_extraction

processed_text_final = [remove_stopwords(section) for section in section_extraction]
processed_text_final

['ABSTRACT Urban planning applications ( energy audits , investment , etc . ) re- quire understanding built infrastructure environment , i.e. , low-level , physical features ( amount vegetation , build- ing area geometry etc . ) , well higher-level concepts land use classes ( encode expert understanding socio- economic end uses ) . /T_his kind data expensive labor- intensive obtain , limits availability ( particularly developing countries ) . analyze pa/t_terns land use urban neighborhoods using large-scale satellite imagery data ( available worldwide third-party providers ) state-of-the- art computer vision techniques based deep convolutional neural networks . supervision , given limited availability standard benchmarks remote-sensing data , obtain ground truth land use class labels carefully sampled open-source surveys , particular Urban Atlas land classi/f_ication dataset 20 land use classes across 300 European cities . use data train compare deep architectures recently shown good p

### Applying LSA for summarizing the section extraction

In [63]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [64]:
# Vectorizing the extracted sentences
vectorizer = TfidfVectorizer()

tfidf_matrix = vectorizer.fit_transform(processed_text_final)

# 3. Create a TruncatedSVD object for LSA
lsa = TruncatedSVD(n_components = 7)

In [65]:
# 4. Perform LSA on the TF-IDF matrix

lsa_matrix = lsa.fit_transform(tfidf_matrix)
print(lsa_matrix)

[[ 0.64603997 -0.47136568  0.03123876  0.04658915  0.40592293 -0.37814123
  -0.22256712]
 [ 0.71837253 -0.35295683  0.01635817  0.01684558  0.00204705  0.09853592
   0.59084474]
 [ 0.6919466  -0.35811097  0.14093501 -0.021419   -0.20590926  0.47156586
  -0.32843872]
 [ 0.6792114   0.13962336 -0.26485975 -0.47040131 -0.3768704  -0.28917446
  -0.0457814 ]
 [ 0.50638211  0.43712987  0.73271486  0.00425183 -0.02674672 -0.11987031
   0.02283015]
 [ 0.59038982  0.52784118 -0.25551967 -0.18688852  0.44490679  0.27312397
  -0.00819559]
 [ 0.62458583  0.28506674 -0.27175444  0.64091578 -0.18313653 -0.09114582
  -0.0464696 ]]


In [66]:
# Using LSA for summarizing the text of each individual section

# Calculate LSA scores for each sentence
lsa_scores = np.sum(np.abs(lsa_matrix), axis=1)

# Group sentences by section and rank within each group
grouped_sentences = {}
for sentence, section, score in zip(processed_text_final, sections, lsa_scores):
    grouped_sentences.setdefault(section, []).append((score, sentence))

# Sentence tokenization function

def tokenize_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

# Tokenize sentences for each section
section_sentences = {section: tokenize_sentences(' '.join(sentence[1] for sentence in sentences)) for section, sentences in grouped_sentences.items()}

# Choose the top-ranked sentences from each section
num_top_sentences = 5
top_ranked_sentences = []

for section, sentences in grouped_sentences.items():
    # Sort sentences by score in descending order
    sentences = sorted(sentences, key=lambda x: x[0], reverse=True)

    # Choose the top-ranked sentences
    top_sentences = [sentence[1] for sentence in sentences[:num_top_sentences]]

    # Print the content of each sentence
    print(f"Section {section} sentences:\n", top_sentences)

Section ABSTRACT sentences:
 ['ABSTRACT Urban planning applications ( energy audits , investment , etc . ) re- quire understanding built infrastructure environment , i.e. , low-level , physical features ( amount vegetation , build- ing area geometry etc . ) , well higher-level concepts land use classes ( encode expert understanding socio- economic end uses ) . /T_his kind data expensive labor- intensive obtain , limits availability ( particularly developing countries ) . analyze pa/t_terns land use urban neighborhoods using large-scale satellite imagery data ( available worldwide third-party providers ) state-of-the- art computer vision techniques based deep convolutional neural networks . supervision , given limited availability standard benchmarks remote-sensing data , obtain ground truth land use class labels carefully sampled open-source surveys , particular Urban Atlas land classi/f_ication dataset 20 land use classes across 300 European cities . use data train compare deep archit

In [67]:
# Creating a list to store the top-5 sentences of each section

# Iterate through each section and extract the top-ranked sentences
for section, sentences in section_sentences.items():
    # Sort sentences by score in descending order
    sentences = sorted(sentences, key=lambda x: x[0], reverse=True)

    # Choose the top-ranked sentences
    top_sentences = [sentence for sentence in sentences[:num_top_sentences]]

    # Append the top sentences to the result list
    top_ranked_sentences.extend(top_sentences)

    print(f"Top-ranked sentences for Section {section}:\n" + ''.join(sentence + '\n' for sentence in top_sentences) + '\n')


Top-ranked sentences for Section ABSTRACT:
use data train compare deep architectures recently shown good per- formance standard computer vision tasks ( image classi/f_ication segmentation ) , including geospatial data .
supervision , given limited availability standard benchmarks remote-sensing data , obtain ground truth land use class labels carefully sampled open-source surveys , particular Urban Atlas land classi/f_ication dataset 20 land use classes across 300 European cities .
re- quire understanding built infrastructure environment , i.e.
make dataset available ma- chine learning researchers use remote-sensing applications .
analyze pa/t_terns land use urban neighborhoods using large-scale satellite imagery data ( available worldwide third-party providers ) state-of-the- art computer vision techniques based deep convolutional neural networks .


Top-ranked sentences for Section 1 INTRODUCTION:
uses , contact owner/author ( ) .
use features extracted model perform large-scale comp

In [68]:
## Creating a dictionary for stroing top-ranked sentences

top_ranked_sentences = {}

# Iterate through each section and extract the top-ranked sentences
for section, sentences in section_sentences.items():
    # Sort sentences by score in descending order
    sentences = sorted(sentences, key=lambda x: x[0], reverse=True)

    # Choose the top-ranked sentences
    top_sentences = [sentence for sentence in sentences[:num_top_sentences]]

    # Store the top sentences in the dictionary
    top_ranked_sentences[section] = top_sentences

    # Print the top-ranked sentences for each section
    print(f"Top-ranked sentences for Section {section}:\n" + ''.join(sentence + '\n' for sentence in top_sentences) + '\n')

Top-ranked sentences for Section ABSTRACT:
use data train compare deep architectures recently shown good per- formance standard computer vision tasks ( image classi/f_ication segmentation ) , including geospatial data .
supervision , given limited availability standard benchmarks remote-sensing data , obtain ground truth land use class labels carefully sampled open-source surveys , particular Urban Atlas land classi/f_ication dataset 20 land use classes across 300 European cities .
re- quire understanding built infrastructure environment , i.e.
make dataset available ma- chine learning researchers use remote-sensing applications .
analyze pa/t_terns land use urban neighborhoods using large-scale satellite imagery data ( available worldwide third-party providers ) state-of-the- art computer vision techniques based deep convolutional neural networks .


Top-ranked sentences for Section 1 INTRODUCTION:
uses , contact owner/author ( ) .
use features extracted model perform large-scale comp

### Measuring the similarity score of each section summaries with user query

#### Using TF-IDF embedding

In [69]:
# Similarity score of query with each section containing entire text in that section before applying LSA

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

def text_similarity(text1, text2):
    # Ensure that the input is a string
    text1 = str(text1)
    text2 = str(text2)

    # Tokenize and lemmatize the texts, applying lowercasing to individual words
    tokens1 = [word.lower() for word in sent_tokenize(text1)]
    tokens2 = [word.lower() for word in sent_tokenize(text2)]
    lemmatizer = WordNetLemmatizer()
    tokens1 = [lemmatizer.lemmatize(token) for token in tokens1]
    tokens2 = [lemmatizer.lemmatize(token) for token in tokens2]

    # Remove stopwords
    stop_words = stopwords.words('english')
    tokens1 = [token for token in tokens1 if token not in stop_words]
    tokens2 = [token for token in tokens2 if token not in stop_words]

    # Join tokens into strings
    text1_processed = ' '.join(tokens1)
    text2_processed = ' '.join(tokens2)

    # Create the TF-IDF vectors
    vectorizer = TfidfVectorizer()
    vector1 = vectorizer.fit_transform([text1_processed])
    vector2 = vectorizer.transform([text2_processed])

    # Calculate the cosine similarity (single value for whole texts)
    similarity = cosine_similarity(vector1, vector2)[0][0]

    return similarity

# Example usage:
text2 = 'Neural Machine Translation; Attention Mechanism, Language Translation.'

# Calculate similarity with each section header
similarity_scores = {}
for section in sections:
    similarity_score = text_similarity(processed_text_final[sections.index(section)], text2)
    similarity_scores[section] = similarity_score

# Display similarity scores
for section, score in similarity_scores.items():
      print(f"Similarity score with Section '{section}': {score}")

Similarity score with Section 'ABSTRACT': 0.10327955589886438
Similarity score with Section '1 INTRODUCTION': 0.02318071425053517
Similarity score with Section '2 LITERATURE': 0.0779496096850537
Similarity score with Section '3 THE URBAN ENVIRONMENTS DATASET': 0.0
Similarity score with Section '4 EXPERIMENTAL SETUP': 0.0588744809409464
Similarity score with Section '5 RESULTS AND DISCUSSION': 0.009253707430860446
Similarity score with Section '6 CONCLUSIONS': 0.0628005481813998


In [70]:
# Similarity score of query with each section containing only top 5 ranked sentences in that section after applying LSA

def text_similarity(text1, text2):
    # Ensure that the input is a string
    text1 = str(text1)
    text2 = str(text2)

    # Tokenize and lemmatize the texts, applying lowercasing to individual words
    tokens1 = [word.lower() for word in sent_tokenize(text1)]
    tokens2 = [word.lower() for word in sent_tokenize(text2)]
    lemmatizer = WordNetLemmatizer()
    tokens1 = [lemmatizer.lemmatize(token) for token in tokens1]
    tokens2 = [lemmatizer.lemmatize(token) for token in tokens2]

    # Remove stopwords
    stop_words = stopwords.words('english')
    tokens1 = [token for token in tokens1 if token not in stop_words]
    tokens2 = [token for token in tokens2 if token not in stop_words]

    # Join tokens into strings
    text1_processed = ' '.join(tokens1)
    text2_processed = ' '.join(tokens2)

    # Create the TF-IDF vectors
    vectorizer = TfidfVectorizer()
    vector1 = vectorizer.fit_transform([text1_processed])
    vector2 = vectorizer.transform([text2_processed])

    # Calculate the cosine similarity (single value for whole texts)
    similarity = cosine_similarity(vector1, vector2)[0][0]

    return similarity

# Example usage:
text2 = 'Neural Machine Translation; Attention Mechanism, Language Translation.'

# Calculate similarity with each section header
similarity_scores = {}
for section in sections:
    similarity_score = text_similarity(' '.join(top_ranked_sentences[section]), text2)
    similarity_scores[section] = similarity_score

# Display similarity scores
for section, score in similarity_scores.items():
    print(f"Similarity score with Section {section} after LSA': {score}")

Similarity score with Section ABSTRACT after LSA': 0.0764719112901873
Similarity score with Section 1 INTRODUCTION after LSA': 0.0
Similarity score with Section 2 LITERATURE after LSA': 0.14433756729740652
Similarity score with Section 3 THE URBAN ENVIRONMENTS DATASET after LSA': 0.0
Similarity score with Section 4 EXPERIMENTAL SETUP after LSA': 0.0
Similarity score with Section 5 RESULTS AND DISCUSSION after LSA': 0.0
Similarity score with Section 6 CONCLUSIONS after LSA': 0.0


In [71]:
# Getting average similarity score of the sections with entire text without LSA applied

# Example text 2
text2 = 'Neural Machine Translation; Attention Mechanism, Language Translation.'

# Calculate similarity scores for each section header
similarity_scores = []
for section_text in processed_text_final:
    similarity_score = text_similarity(section_text, text2)
    similarity_scores.append(similarity_score)

# Calculate the average similarity score
average_similarity = sum(similarity_scores) / len(similarity_scores)

print("Average Similarity Score without LSA:", average_similarity)

Average Similarity Score without LSA: 0.04790551662680855


In [72]:
# Getting average similarity score of section only with top 5 ranked sentences after LSA applied

def text_similarity(text1, text2):
    # Ensure that the input is a string
    text1 = str(text1)
    text2 = str(text2)

    # Tokenize and lemmatize the texts, applying lowercasing to individual words
    tokens1 = [word.lower() for word in sent_tokenize(text1)]
    tokens2 = [word.lower() for word in sent_tokenize(text2)]
    lemmatizer = WordNetLemmatizer()
    tokens1 = [lemmatizer.lemmatize(token) for token in tokens1]
    tokens2 = [lemmatizer.lemmatize(token) for token in tokens2]

    # Remove stopwords
    stop_words = stopwords.words('english')
    tokens1 = [token for token in tokens1 if token not in stop_words]
    tokens2 = [token for token in tokens2 if token not in stop_words]

    # Join tokens into strings
    text1_processed = ' '.join(tokens1)
    text2_processed = ' '.join(tokens2)

    # Create the TF-IDF vectors
    vectorizer = TfidfVectorizer()
    vector1 = vectorizer.fit_transform([text1_processed])
    vector2 = vectorizer.transform([text2_processed])

    # Calculate the cosine similarity (single value for whole texts)
    similarity = cosine_similarity(vector1, vector2)[0][0]

    return similarity

# Example usage:
text2 = 'Neural Machine Translation; Attention Mechanism, Language Translation.'

# Calculate similarity scores for each section header using top ranked sentences
similarity_scores = []
for section, top_sentences in top_ranked_sentences.items():
    section_text = ' '.join(top_sentences)
    similarity_score = text_similarity(section_text, text2)
    similarity_scores.append(similarity_score)

# Calculate the average similarity score
average_similarity = sum(similarity_scores) / len(similarity_scores)

print("Average Similarity Score with Top Ranked Sentences after LSA:", average_similarity)

Average Similarity Score with Top Ranked Sentences after LSA: 0.03154421122679912


In [73]:
!pip install prettytable



In [74]:
# Table for similarity score of sections without LSA

from prettytable import PrettyTable

# Example usage:
text2 = 'Neural Machine Translation; Attention Mechanism, Language Translation.'

# Calculate similarity with each section header
similarity_scores = {}
for section in sections:
   
    similarity_score = text_similarity(processed_text_final[sections.index(section)], text2)
    similarity_scores[section] = similarity_score

# Display similarity scores in a table
table = PrettyTable()
table.field_names = ["Section with entire pdf text using TF-IDF", "Similarity Score"]

for section, score in similarity_scores.items():
    table.add_row([section, f"{score:.6f}"])

# Calculate and add the average similarity score to the table
if similarity_scores.values != 0 :
    average_similarity = sum(similarity_scores.values()) / len(similarity_scores)
    table.add_row(["Average Similarity", f"{average_similarity:.6f}"])
else:
    pass

# Print the table
print(table)


+-------------------------------------------+------------------+
| Section with entire pdf text using TF-IDF | Similarity Score |
+-------------------------------------------+------------------+
|                  ABSTRACT                 |     0.103280     |
|               1 INTRODUCTION              |     0.023181     |
|                2 LITERATURE               |     0.077950     |
|      3 THE URBAN ENVIRONMENTS DATASET     |     0.000000     |
|            4 EXPERIMENTAL SETUP           |     0.058874     |
|          5 RESULTS AND DISCUSSION         |     0.009254     |
|               6 CONCLUSIONS               |     0.062801     |
|             Average Similarity            |     0.047906     |
+-------------------------------------------+------------------+


In [75]:
similarity_scores.values()

dict_values([0.10327955589886438, 0.02318071425053517, 0.0779496096850537, 0.0, 0.0588744809409464, 0.009253707430860446, 0.0628005481813998])

In [76]:
# Table for similarity score of sections with top ranked sentences of LSA

from prettytable import PrettyTable

# Example text 2
text2 = 'Neural Machine Translation; Attention Mechanism, Language Translation.'

# Calculate similarity scores for each section using top-ranked sentences
similarity_scores_top_ranked = {}
for section, top_sentences in top_ranked_sentences.items():
    section_text = ' '.join(top_sentences)
    similarity_score = text_similarity(section_text, text2)
    similarity_scores_top_ranked[section] = similarity_score

# Display similarity scores in a table for top-ranked sentences
table_top_ranked = PrettyTable()
table_top_ranked.field_names = ["Section with LSA for TF-IDF", "Similarity Score"]

for section, score in similarity_scores_top_ranked.items():
    table_top_ranked.add_row([section, f"{score:.6f}"])

# Calculate and add the average similarity score to the table
average_similarity_top_ranked = sum(similarity_scores_top_ranked.values()) / len(similarity_scores_top_ranked)
table_top_ranked.add_row(["Average Similarity (Top Ranked)", f"{average_similarity_top_ranked:.6f}"])

# Print the table for top-ranked sentences
print(table_top_ranked)

+----------------------------------+------------------+
|   Section with LSA for TF-IDF    | Similarity Score |
+----------------------------------+------------------+
|             ABSTRACT             |     0.076472     |
|          1 INTRODUCTION          |     0.000000     |
|           2 LITERATURE           |     0.144338     |
| 3 THE URBAN ENVIRONMENTS DATASET |     0.000000     |
|       4 EXPERIMENTAL SETUP       |     0.000000     |
|     5 RESULTS AND DISCUSSION     |     0.000000     |
|          6 CONCLUSIONS           |     0.000000     |
| Average Similarity (Top Ranked)  |     0.031544     |
+----------------------------------+------------------+


### Evaluation Metric for validating the performance of LSA summarization

In [77]:
!pip install nltk py-rouge



In [78]:
!pip install rouge-score



In [79]:
# Getting the name of the input pdf

import os
file_name = os.path.basename(pdf_path)
file_name_without_extension = os.path.splitext(file_name)[0]
file_name_without_extension

'satellite_imagery'

In [80]:
# Fetching reference summaries (created by ChatGPT) from an external excel file
import pandas as pd
import ast

data = pd.read_excel('reference_summaries.xlsx')

pdf_name = file_name_without_extension + '.pdf'

row = data[data['pdf_name'] == pdf_name]

if not row.empty:
    summary = row['Summaries'].iloc[0]
    try:
        summary_dict = ast.literal_eval(summary)
        print("Summary dictionary for", pdf_name, ":", summary_dict)
    except ValueError:
        print("Error: Summary string for", pdf_name, "is not in a valid dictionary format.")
else:
    print("PDF name", pdf_name, "not found in the Excel file.")

Summary dictionary for satellite_imagery.pdf : {'ABSTRACT': 'Urban planning applications, including energy audits and investments, necessitate a comprehensive understanding of built infrastructure and its environment, encompassing both low-level physical features and higher-level concepts like socio-economic land use classes. Obtaining such data is costly and labor-intensive, posing challenges, especially in developing countries. Our approach involves analyzing land use patterns in urban areas using large-scale satellite imagery and advanced computer vision techniques. We leverage deep convolutional neural networks, train models on a dataset derived from open-source surveys, specifically the Urban Atlas classification dataset, and make our findings and dataset accessible to machine learning researchers for remote-sensing applications.', '1 INTRODUCTION': 'Urban land use classification plays a crucial role in various applications, including urban planning, zoning, business permits, real

In [81]:
# Evaluation of section summaries without LSA applied and the reference summary has been taken from ChatGPT

from nltk.translate import meteor_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

# Define reference summaries for each section
reference_summaries = summary_dict

def calculate_scores_for_section(reference, hypothesis):
    reference_tokens = word_tokenize(reference)
    hypothesis_tokens = word_tokenize(hypothesis)

    smoothing_function = SmoothingFunction().method1  
    bleu_score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)

    meteor_score_value = meteor_score.meteor_score([reference_tokens], hypothesis_tokens)

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = scorer.score(reference, hypothesis)
    rouge1 = rouge_scores['rouge1'].fmeasure
    rouge2 = rouge_scores['rouge2'].fmeasure
    rougel = rouge_scores['rougeL'].fmeasure

    return bleu_score, meteor_score_value, rouge1, rouge2, rougel

# Initialize generated summary with the content of each section in top-ranked sentences
generated_summary = ""

# Calculate scores for each section using top-ranked sentences
bleu_scores_top_ranked = {}
meteor_scores_top_ranked = {}
rouge1_scores_top_ranked = {}
rouge2_scores_top_ranked = {}
rougel_scores_top_ranked = {}

for section, top_sentences in section_sentences.items():
    section_text = ' '.join(top_sentences)
    generated_summary += section_text + " "  # Include the content of each section

    # Use the corresponding reference summary for each section
    reference_summary_for_section = reference_summaries[section]

    bleu_score, meteor_score_val, rouge1, rouge2, rougel = calculate_scores_for_section(reference_summary_for_section, section_text)

    bleu_scores_top_ranked[section] = bleu_score
    meteor_scores_top_ranked[section] = meteor_score_val
    rouge1_scores_top_ranked[section] = rouge1
    rouge2_scores_top_ranked[section] = rouge2
    rougel_scores_top_ranked[section] = rougel

# Display scores for each section using top-ranked sentences
for section in section_sentences.keys():
    print(f"Scores for Section '{section}' (Sections have the entire text in the generated summary):")
    print("BLEU Score:", bleu_scores_top_ranked[section])
    print("METEOR Score:", meteor_scores_top_ranked[section])
    print("ROUGE-1 F-measure:", rouge1_scores_top_ranked[section])
    print("ROUGE-2 F-measure:", rouge2_scores_top_ranked[section])
    print("ROUGE-L F-measure:", rougel_scores_top_ranked[section])
    print()

# Display the generated summary
print("\nGenerated Summary with Content of Top-Ranked Sentences:")
print(generated_summary)


Scores for Section 'ABSTRACT' (Sections have the entire text in the generated summary):
BLEU Score: 0.06037853089860054
METEOR Score: 0.4013278008298755
ROUGE-1 F-measure: 0.41471571906354515
ROUGE-2 F-measure: 0.20202020202020202
ROUGE-L F-measure: 0.3612040133779264

Scores for Section '1 INTRODUCTION' (Sections have the entire text in the generated summary):
BLEU Score: 0.019231123036102856
METEOR Score: 0.3146509309106222
ROUGE-1 F-measure: 0.25232403718459495
ROUGE-2 F-measure: 0.07989347536617844
ROUGE-L F-measure: 0.18061088977423637

Scores for Section '2 LITERATURE' (Sections have the entire text in the generated summary):
BLEU Score: 0.017192562034481027
METEOR Score: 0.24684313636875915
ROUGE-1 F-measure: 0.1555806087936866
ROUGE-2 F-measure: 0.07231638418079096
ROUGE-L F-measure: 0.11048478015783539

Scores for Section '3 THE URBAN ENVIRONMENTS DATASET' (Sections have the entire text in the generated summary):
BLEU Score: 0.01639137916289511
METEOR Score: 0.2536201096413227

In [82]:
# Combine the scores from all dictionaries
combined_scores = {}
for section in bleu_scores_top_ranked.keys():
    combined_scores[section] = {
        "BLEU": bleu_scores_top_ranked.get(section, 0),
        "METEOR": meteor_scores_top_ranked.get(section, 0),
        "ROUGE-1": rouge1_scores_top_ranked.get(section, 0),
        "ROUGE-2": rouge2_scores_top_ranked.get(section, 0),
        "ROUGE-L": rougel_scores_top_ranked.get(section, 0)
    }

# Create a PrettyTable instance
table = PrettyTable()
table.field_names = ["Section with LSA", "BLEU", "METEOR", "ROUGE-1", "ROUGE-2", "ROUGE-L"]

# Populate the table with scores
for section, section_scores in combined_scores.items():
    bleu = "{:.3f}".format(section_scores["BLEU"])
    meteor = "{:.3f}".format(section_scores["METEOR"])
    rouge_1 = "{:.3f}".format(section_scores["ROUGE-1"])
    rouge_2 = "{:.3f}".format(section_scores["ROUGE-2"])
    rouge_l = "{:.3f}".format(section_scores["ROUGE-L"])
    
    table.add_row([section, bleu, meteor, rouge_1, rouge_2, rouge_l])

# Print the table
print(table)

+----------------------------------+-------+--------+---------+---------+---------+
|         Section with LSA         |  BLEU | METEOR | ROUGE-1 | ROUGE-2 | ROUGE-L |
+----------------------------------+-------+--------+---------+---------+---------+
|             ABSTRACT             | 0.060 | 0.401  |  0.415  |  0.202  |  0.361  |
|          1 INTRODUCTION          | 0.019 | 0.315  |  0.252  |  0.080  |  0.181  |
|           2 LITERATURE           | 0.017 | 0.247  |  0.156  |  0.072  |  0.110  |
| 3 THE URBAN ENVIRONMENTS DATASET | 0.016 | 0.254  |  0.185  |  0.086  |  0.126  |
|       4 EXPERIMENTAL SETUP       | 0.014 | 0.268  |  0.202  |  0.063  |  0.145  |
|     5 RESULTS AND DISCUSSION     | 0.008 | 0.170  |  0.075  |  0.029  |  0.047  |
|          6 CONCLUSIONS           | 0.001 | 0.097  |  0.041  |  0.000  |  0.030  |
+----------------------------------+-------+--------+---------+---------+---------+


In [83]:
# Evaluation of LSA applied summaries with reference summary of ChatGPT

# Define reference summaries for each section
reference_summaries = summary_dict

# Function to calculate scores for a section
def calculate_scores_for_section(reference, hypothesis):
    reference_tokens = word_tokenize(reference)
    hypothesis_tokens = word_tokenize(hypothesis)

    smoothing_function = SmoothingFunction().method1  # Define smoothing function
    bleu_score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
    meteor_score_val = meteor_score.meteor_score([reference_tokens], hypothesis_tokens)

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = scorer.score(reference, hypothesis)
    rouge1 = rouge_scores['rouge1'].fmeasure
    rouge2 = rouge_scores['rouge2'].fmeasure
    rougel = rouge_scores['rougeL'].fmeasure

    return bleu_score, meteor_score_val, rouge1, rouge2, rougel

# Example text 2
generated_summary = top_ranked_sentences  # Use top-ranked sentences directly

# Calculate scores for each section using top-ranked sentences
scores_top_ranked = {}

for section, sentences in generated_summary.items():
    section_text = ' '.join(sentences)

    # Use the corresponding reference summary for each section
    reference_summary_for_section = reference_summaries[section]

    bleu_score, meteor_score_val, rouge1, rouge2, rougel = calculate_scores_for_section(reference_summary_for_section, section_text)

    scores_top_ranked[section] = {
        'BLEU Score': bleu_score,
        'METEOR Score': meteor_score_val,
        'ROUGE-1 F-measure': rouge1,
        'ROUGE-2 F-measure': rouge2,
        'ROUGE-L F-measure': rougel
    }

# Display scores for each section using top-ranked sentences
for section, scores in scores_top_ranked.items():
    print(f"Scores for Section '{section}' (LSA has been applied on the generated summary):")
    for metric, value in scores.items():
        print(f"{metric}: {value}")
    print()

Scores for Section 'ABSTRACT' (LSA has been applied on the generated summary):
BLEU Score: 0.10040237466384212
METEOR Score: 0.35332523972179924
ROUGE-1 F-measure: 0.39069767441860465
ROUGE-2 F-measure: 0.17840375586854462
ROUGE-L F-measure: 0.2046511627906977

Scores for Section '1 INTRODUCTION' (LSA has been applied on the generated summary):
BLEU Score: 0.012679558352619334
METEOR Score: 0.15845843030641457
ROUGE-1 F-measure: 0.23770491803278687
ROUGE-2 F-measure: 0.05785123966942149
ROUGE-L F-measure: 0.11475409836065573

Scores for Section '2 LITERATURE' (LSA has been applied on the generated summary):
BLEU Score: 0.007905500418394633
METEOR Score: 0.1977129531134931
ROUGE-1 F-measure: 0.2222222222222222
ROUGE-2 F-measure: 0.09183673469387756
ROUGE-L F-measure: 0.12121212121212119

Scores for Section '3 THE URBAN ENVIRONMENTS DATASET' (LSA has been applied on the generated summary):
BLEU Score: 0.010595580732023213
METEOR Score: 0.161744997176133
ROUGE-1 F-measure: 0.2587064676616

In [91]:
# Create a PrettyTable instance
evaluation_table = PrettyTable()
evaluation_table.field_names = ["Section with LSA applied text", "BLEU", "METEOR", "ROUGE-1", "ROUGE-2", "ROUGE-L"]

# Populate the table with scores
for section, section_scores in scores_top_ranked.items():
    bleu = "{:.3f}".format(section_scores["BLEU Score"])
    meteor = "{:.3f}".format(section_scores["METEOR Score"])
    rouge_1 = "{:.3f}".format(section_scores["ROUGE-1 F-measure"])
    rouge_2 = "{:.3f}".format(section_scores["ROUGE-2 F-measure"])
    rouge_l = "{:.3f}".format(section_scores["ROUGE-L F-measure"])
    
    evaluation_table.add_row([section, bleu, meteor, rouge_1, rouge_2, rouge_l])

# Print the table
print(evaluation_table)

+----------------------------------+-------+--------+---------+---------+---------+
|  Section with LSA applied text   |  BLEU | METEOR | ROUGE-1 | ROUGE-2 | ROUGE-L |
+----------------------------------+-------+--------+---------+---------+---------+
|             ABSTRACT             | 0.100 | 0.353  |  0.391  |  0.178  |  0.205  |
|          1 INTRODUCTION          | 0.013 | 0.158  |  0.238  |  0.058  |  0.115  |
|           2 LITERATURE           | 0.008 | 0.198  |  0.222  |  0.092  |  0.121  |
| 3 THE URBAN ENVIRONMENTS DATASET | 0.011 | 0.162  |  0.259  |  0.040  |  0.100  |
|       4 EXPERIMENTAL SETUP       | 0.005 | 0.112  |  0.160  |  0.012  |  0.135  |
|     5 RESULTS AND DISCUSSION     | 0.005 | 0.120  |  0.178  |  0.053  |  0.094  |
|          6 CONCLUSIONS           | 0.003 | 0.096  |  0.062  |  0.000  |  0.047  |
+----------------------------------+-------+--------+---------+---------+---------+


##### GLOVE Embedding

In [92]:
!pip install -U spacy
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
     ---------------------------------------- 0.0/42.8 MB ? eta -:--:--
     ---------------------------------------- 0.1/42.8 MB 2.3 MB/s eta 0:00:19
     ---------------------------------------- 0.2/42.8 MB 2.2 MB/s eta 0:00:20
     ---------------------------------------- 0.4/42.8 MB 2.9 MB/s eta 0:00:15
      --------------------------------------- 0.7/42.8 MB 3.2 MB/s eta 0:00:14
      --------------------------------------- 0.9/42.8 MB 3.6 MB/s eta 0:00:12
     - -------------------------------------- 1.8/42.8 MB 5.9 MB/s eta 0:00:07
     -- ------------------------------------- 2.4/42.8 MB 6.9 MB/s eta 0:00:06
     --- ------------------------------------ 3.2/42.8 MB 8.3 MB/s eta 0:00:05
     --- ------------------------------------ 3.9/42.8 MB 9.2 MB/s eta 0:00:05
     ---- -------------------------------

### Similarity scores using GLOVE embeddings

In [105]:
# Using Glove for sections with entire text( Query is being matched with the sections but the sections have all the text)

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

# Load the pre-trained model with GloVe vectors
nlp = spacy.load("en_core_web_md")

def text_similarity_with_glove(text1, text2):
    # Ensure that the input is a string
    text1 = str(text1)
    text2 = str(text2)

    # Tokenize and lemmatize the texts, applying lowercasing to individual words
    tokens1 = [word.lower() for word in sent_tokenize(text1)]
    tokens2 = [word.lower() for word in sent_tokenize(text2)]
    lemmatizer = WordNetLemmatizer()
    tokens1 = [lemmatizer.lemmatize(token) for token in tokens1]
    tokens2 = [lemmatizer.lemmatize(token) for token in tokens2]

    # Remove stopwords
    stop_words = stopwords.words('english')
    tokens1 = [token for token in tokens1 if token not in stop_words]
    tokens2 = [token for token in tokens2 if token not in stop_words]

    # Join tokens into strings
    text1_processed = ' '.join(tokens1)
    text2_processed = ' '.join(tokens2)

    # Use spacy to get GloVe vectors for the processed texts
    vector1 = nlp(text1_processed).vector
    vector2 = nlp(text2_processed).vector

    # Calculate the cosine similarity
    similarity = cosine_similarity([vector1], [vector2])[0][0]

    return similarity

# Example usage:
text2 = 'Neural Machine Translation; Attention Mechanism, Language Translation.'

# Calculate similarity with each section header
similarity_scores_glove = {}
for section in sections:
    similarity_score = text_similarity_with_glove(processed_text_final[sections.index(section)], text2)
    similarity_scores_glove[section] = similarity_score

# Display similarity scores
for section, score in similarity_scores_glove.items():
    print(f"Similarity score with Section with entire text and Glove '{section}': {score}")

Similarity score with Section with entire text and Glove 'ABSTRACT': 0.8730466961860657
Similarity score with Section with entire text and Glove '1 INTRODUCTION': 0.8694714903831482
Similarity score with Section with entire text and Glove '2 LITERATURE': 0.8453741073608398
Similarity score with Section with entire text and Glove '3 THE URBAN ENVIRONMENTS DATASET': 0.8471739292144775
Similarity score with Section with entire text and Glove '4 EXPERIMENTAL SETUP': 0.8419233560562134
Similarity score with Section with entire text and Glove '5 RESULTS AND DISCUSSION': 0.8214588761329651
Similarity score with Section with entire text and Glove '6 CONCLUSIONS': 0.8650362491607666


In [106]:
# Using Glove with only top ranked sentences(Matching the query to the sections and LSA has been applied)

# Load the pre-trained model with GloVe vectors
nlp = spacy.load("en_core_web_md")

def text_similarity_with_glove(text1, text2):
    # Ensure that the input is a string
    text1 = str(text1)
    text2 = str(text2)

    # Tokenize and lemmatize the texts, applying lowercasing to individual words
    tokens1 = [word.lower() for word in sent_tokenize(text1)]
    tokens2 = [word.lower() for word in sent_tokenize(text2)]
    lemmatizer = WordNetLemmatizer()
    tokens1 = [lemmatizer.lemmatize(token) for token in tokens1]
    tokens2 = [lemmatizer.lemmatize(token) for token in tokens2]

    # Remove stopwords
    stop_words = stopwords.words('english')
    tokens1 = [token for token in tokens1 if token not in stop_words]
    tokens2 = [token for token in tokens2 if token not in stop_words]

    # Join tokens into strings
    text1_processed = ' '.join(tokens1)
    text2_processed = ' '.join(tokens2)

    # Use spacy to get GloVe vectors for the processed texts
    vector1 = nlp(text1_processed).vector
    vector2 = nlp(text2_processed).vector

    # Calculate the cosine similarity
    similarity = cosine_similarity([vector1], [vector2])[0][0]

    return similarity

# Example usage:
text2 = 'Neural Machine Translation; Attention Mechanism, Language Translation.'

# Calculate similarity with each section header
similarity_scores_glove = {}
for section, top_sentence in zip(sections, top_ranked_sentences):
    similarity_score = text_similarity_with_glove(top_sentence, text2)
    similarity_scores_glove[section] = similarity_score

# Display similarity scores
for section, score in similarity_scores_glove.items():
    print(f"Similarity score with Section using LSA and Glove '{section}': {score}")

Similarity score with Section using LSA and Glove 'ABSTRACT': 0.5400400757789612
Similarity score with Section using LSA and Glove '1 INTRODUCTION': 0.24933794140815735
Similarity score with Section using LSA and Glove '2 LITERATURE': 0.2600635290145874
Similarity score with Section using LSA and Glove '3 THE URBAN ENVIRONMENTS DATASET': 0.606190025806427
Similarity score with Section using LSA and Glove '4 EXPERIMENTAL SETUP': 0.2615669071674347
Similarity score with Section using LSA and Glove '5 RESULTS AND DISCUSSION': 0.5232053995132446
Similarity score with Section using LSA and Glove '6 CONCLUSIONS': 0.2009713351726532


### Measuring Similarity Scores of user query with Title of pdf

##### GLOVE embedding

In [107]:
import spacy

# Load the spaCy model with GloVe vectors
nlp = spacy.load("en_core_web_md")

# Your text2
text2 = 'Neural Machine Translation; Attention Mechanism, Language Translation.'

# Function to calculate similarity score between title and text2 using GloVe vectors
def glove_similarity(title, text2):
    # Ensure that the input is a string
    title = str(title)
    text2 = str(text2)

    # Process the texts using spaCy
    doc_title = nlp(title)
    doc_text2 = nlp(text2)

    # Calculate the similarity between doc_title and doc_text2
    similarity = doc_title.similarity(doc_text2)

    return similarity

# Calculate similarity between clean_title and text2
similarity_score = glove_similarity(clean_title, text2)

# Display similarity score
print(f"Similarity score with Title '{clean_title}' using GloVe: {similarity_score}")

Similarity score with Title 'using convolutional networks satellite imagery identify pa/t_terns urban environments large scale' using GloVe: 0.66612272316635


##### TF-IDF embedding

In [108]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_md")

# Your text2
text2 = 'Neural Machine Translation; Attention Mechanism, Language Translation.'

# Function to calculate similarity score between title and text2 using TF-IDF
def tfidf_similarity(title, text2):
    # Ensure that the input is a string
    title = str(title)
    text2 = str(text2)

    # Process the texts using spaCy
    doc_title = nlp(title)
    doc_text2 = nlp(text2)

    # Create the TF-IDF vectors
    vectorizer = TfidfVectorizer()
    vector_title = vectorizer.fit_transform([title])
    vector_text2 = vectorizer.transform([text2])

    # Calculate the cosine similarity
    similarity = cosine_similarity(vector_title, vector_text2)[0][0]

    return similarity

# Calculate similarity between clean_title and text2 using TF-IDF
similarity_score_tfidf = tfidf_similarity(clean_title, text2)

# Display similarity score
print(f"Similarity score with Title '{clean_title}' using TF-IDF: {similarity_score_tfidf}")

Similarity score with Title 'using convolutional networks satellite imagery identify pa/t_terns urban environments large scale' using TF-IDF: 0.0


### Using BART for Summarization

In [109]:
from transformers import BartTokenizer, BartForConditionalGeneration
import spacy

# Load spaCy English model with word embeddings
nlp = spacy.load('en_core_web_md')

# Load the model and tokenizer
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

def extract_sentences_from_sections(sections):
    section_sentences = {}

    for idx, text in enumerate(sections):
        # Use spaCy to extract sentences
        doc = nlp(text)

        # Extract individual sentences
        sentences = [sent.text for sent in doc.sents]

        # Store sentences in the dictionary
        section_sentences[f"Section {idx + 1}"] = sentences

    return section_sentences

def extract_section_name(section_text):
    section_name = section_text.split('\n')[0].strip()
    return section_name

def summarize_sentences(sentences, max_summary_length=1000):
    # Concatenate selected sentences
    concatenated_text = ' '.join(sentences)

    # Encode and generate summary
    inputs = tokenizer.encode("summarize: " + concatenated_text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs, max_length=max_summary_length, min_length=int(max_summary_length/5),
                                 length_penalty=10.0, num_beams=4, early_stopping=True)

    # Decode and return the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def select_top_sentences(section_sentences, num_top_sentences=5):
    # Sort sentences by some criterion (e.g., semantic similarity, importance, etc.)
    # Here, we simply sort by length for demonstration purposes
    sorted_sentences = sorted(section_sentences, key=len, reverse=True)

    # Select the top sentences
    top_sentences = sorted_sentences[:num_top_sentences]

    return top_sentences

# Extract sentences from sections
section_sentences = extract_sentences_from_sections(section_extraction)

# Generate summaries for each section based on semantic similarity
section_summaries = {}
for section, sentences in section_sentences.items():
    # Extract the section name using the heuristic
    section_name = extract_section_name('\n'.join(sentences))

    # Select the top sentences based on some criterion
    top_sentences = select_top_sentences(sentences, num_top_sentences=5)

    # Generate summary for the selected sentences
    section_summary = summarize_sentences(top_sentences)

    # Store the section summary
    section_summaries[section_name] = section_summary

    # Print the section summary with the section name
    print(f"Summary for {section_name}:\n{section_summary}\n")


Summary for ABSTRACT:
We analyze pa/t_terns in land use in urban neighborhoods using large-scale satellite imagery data and computer vision techniques based on deep convolutional neuralnetworks. We use this data to train and compare deep architectures which have recently shown good per-formance on standard computer vision tasks (image classi/f_ication and segmentation), including on geospatial data. CCS CONCEPTS: Computer vision; Neural net-agicallyworks; Applied computing Environmental sciences;KEYWORDS: Land use, satellite imagery,convolutional networks, land use class i/f/ication, convolutionian networks. The paper is based on the Urban Atlas land use dataset of 20 land use classes across 300 European cities. For more information, visit the CCS ConcepTS website. The study was published in the open-source journal, The Open Data Project (ODP) (http://www.opendataproject.org/doi/full/10.1177/1556/1555/1/2/2.1).

Summary for 1 INTRODUCTION:
The methods presented here allow for automated

In [110]:
# Define your query
query = "Neural Machine Translation; Attention Mechanism, Language Translation."

# Calculate similarity with each section header
similarity_scores = {}
for section, summary in section_summaries.items():
    # Calculate similarity score between the summary and the query
    similarity_score = text_similarity(summary, query)

    # Store the similarity score for each section
    similarity_scores[section] = similarity_score

# Display similarity scores
for section, score in similarity_scores.items():
    print(f"Similarity score with '{section}' using summaries from BART: {score}")


Similarity score with 'ABSTRACT' using summaries from BART: 0.059549133417541374
Similarity score with '1 INTRODUCTION' using summaries from BART: 0.0
Similarity score with '2 LITERATURE' using summaries from BART: 0.05735393346764046
Similarity score with '3 THE URBAN ENVIRONMENTS DATASET' using summaries from BART: 0.0
Similarity score with '4 EXPERIMENTAL SETUP' using summaries from BART: 0.0
Similarity score with '5 RESULTS AND DISCUSSION' using summaries from BART: 0.12171612389003696
Similarity score with '6 CONCLUSIONS' using summaries from BART: 0.0


In [111]:
# Calculate the average similarity score
total_similarity_score = sum(similarity_scores.values())
average_similarity_score = total_similarity_score / len(similarity_scores)

print(f"Average similarity score for the query '{query}': {average_similarity_score}")

Average similarity score for the query 'Neural Machine Translation; Attention Mechanism, Language Translation.': 0.034088455825031255


In [112]:
# Calculate the average similarity score for BART summaries
average_similarity_bart = sum(similarity_scores.values()) / len(similarity_scores)

# Table for similarity score of sections with top ranked sentences
table_combined = PrettyTable()
table_combined.field_names = ["Section", "Similarity Score (BART)"]

# Add similarity scores from BART summaries to the table
for section, score in similarity_scores.items():
    table_combined.add_row([section, f"{score:.6f}"])

# Add the average similarity score for BART summaries to the table
table_combined.add_row(["Average Similarity (BART)", f"{average_similarity_bart:.6f}"])

# Print the table
print(table_combined)

+----------------------------------+-------------------------+
|             Section              | Similarity Score (BART) |
+----------------------------------+-------------------------+
|             ABSTRACT             |         0.059549        |
|          1 INTRODUCTION          |         0.000000        |
|           2 LITERATURE           |         0.057354        |
| 3 THE URBAN ENVIRONMENTS DATASET |         0.000000        |
|       4 EXPERIMENTAL SETUP       |         0.000000        |
|     5 RESULTS AND DISCUSSION     |         0.121716        |
|          6 CONCLUSIONS           |         0.000000        |
|    Average Similarity (BART)     |         0.034088        |
+----------------------------------+-------------------------+


In [113]:
# Define reference summaries for each section
reference_summaries = summary_dict

# Update reference_summaries keys to match the format in generated_summaries_bart
updated_reference_summaries = {f"Section {idx + 1}": summary for idx, summary in enumerate(reference_summaries.values())}

# Function to calculate scores for a section
def calculate_scores_for_section(reference, hypothesis):
    reference_tokens = word_tokenize(reference)
    hypothesis_tokens = word_tokenize(hypothesis)

    smoothing_function = SmoothingFunction().method1  # Define smoothing function
    bleu_score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
    meteor_score_val = meteor_score.meteor_score([reference_tokens], hypothesis_tokens)

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = scorer.score(reference, hypothesis)
    rouge1 = rouge_scores['rouge1'].fmeasure
    rouge2 = rouge_scores['rouge2'].fmeasure
    rougel = rouge_scores['rougeL'].fmeasure

    return bleu_score, meteor_score_val, rouge1, rouge2, rougel

# Example text 2
generated_summary = section_summaries  # Use section summaries generated by BART

# Calculate scores for each section using BART-generated summaries
scores_bart_generated = {}

for section, summary in generated_summary.items():
    section_text = summary  # Use the summary directly

    # Use the corresponding reference summary for each section
    reference_summary_for_section = reference_summaries[section]

    bleu_score, meteor_score_val, rouge1, rouge2, rougel = calculate_scores_for_section(reference_summary_for_section, section_text)

    scores_bart_generated[section] = {
        'BLEU Score': bleu_score,
        'METEOR Score': meteor_score_val,
        'ROUGE-1 F-measure': rouge1,
        'ROUGE-2 F-measure': rouge2,
        'ROUGE-L F-measure': rougel
    }

# Display scores for each section using BART-generated summaries
for section, scores in scores_bart_generated.items():
    print(f"Scores for Section '{section}' (BART-Generated Summaries):")
    for metric, value in scores.items():
        print(f"{metric}: {value}")
    print()


Scores for Section 'ABSTRACT' (BART-Generated Summaries):
BLEU Score: 0.053867343892231835
METEOR Score: 0.2955224974575262
ROUGE-1 F-measure: 0.32653061224489793
ROUGE-2 F-measure: 0.11522633744855966
ROUGE-L F-measure: 0.19591836734693877

Scores for Section '1 INTRODUCTION' (BART-Generated Summaries):
BLEU Score: 0.010092669246207891
METEOR Score: 0.17663883735312308
ROUGE-1 F-measure: 0.22560975609756098
ROUGE-2 F-measure: 0.012269938650306749
ROUGE-L F-measure: 0.11585365853658537

Scores for Section '2 LITERATURE' (BART-Generated Summaries):
BLEU Score: 0.033258349877348985
METEOR Score: 0.26694227917969493
ROUGE-1 F-measure: 0.3356643356643356
ROUGE-2 F-measure: 0.04225352112676056
ROUGE-L F-measure: 0.14685314685314688

Scores for Section '3 THE URBAN ENVIRONMENTS DATASET' (BART-Generated Summaries):
BLEU Score: 0.03526036620620024
METEOR Score: 0.21540139018131008
ROUGE-1 F-measure: 0.32081911262798635
ROUGE-2 F-measure: 0.061855670103092786
ROUGE-L F-measure: 0.17064846416382

In [114]:
# Create a PrettyTable instance
evaluation_table = PrettyTable()
evaluation_table.field_names = ["Section with BART applied text", "BLEU", "METEOR", "ROUGE-1", "ROUGE-2", "ROUGE-L"]

# Populate the table with scores
for section, section_scores in scores_bart_generated.items():
    bleu = "{:.3f}".format(section_scores["BLEU Score"])
    meteor = "{:.3f}".format(section_scores["METEOR Score"])
    rouge_1 = "{:.3f}".format(section_scores["ROUGE-1 F-measure"])
    rouge_2 = "{:.3f}".format(section_scores["ROUGE-2 F-measure"])
    rouge_l = "{:.3f}".format(section_scores["ROUGE-L F-measure"])
    
    evaluation_table.add_row([section, bleu, meteor, rouge_1, rouge_2, rouge_l])

# Print the table
print(evaluation_table)

+----------------------------------+-------+--------+---------+---------+---------+
|  Section with BART applied text  |  BLEU | METEOR | ROUGE-1 | ROUGE-2 | ROUGE-L |
+----------------------------------+-------+--------+---------+---------+---------+
|             ABSTRACT             | 0.054 | 0.296  |  0.327  |  0.115  |  0.196  |
|          1 INTRODUCTION          | 0.010 | 0.177  |  0.226  |  0.012  |  0.116  |
|           2 LITERATURE           | 0.033 | 0.267  |  0.336  |  0.042  |  0.147  |
| 3 THE URBAN ENVIRONMENTS DATASET | 0.035 | 0.215  |  0.321  |  0.062  |  0.171  |
|       4 EXPERIMENTAL SETUP       | 0.058 | 0.324  |  0.430  |  0.119  |  0.252  |
|     5 RESULTS AND DISCUSSION     | 0.008 | 0.153  |  0.206  |  0.024  |  0.103  |
|          6 CONCLUSIONS           | 0.005 | 0.225  |  0.160  |  0.022  |  0.096  |
+----------------------------------+-------+--------+---------+---------+---------+


In [115]:
# Calculate average similarity score for top-ranked sentences of LSA
average_similarity_top_ranked = sum(similarity_scores_top_ranked.values()) / len(similarity_scores_top_ranked)

# Table for similarity score of sections with top ranked sentences of LSA
table_top_ranked = PrettyTable()
table_top_ranked.field_names = ["Section", "Similarity Score"]

for section, score in similarity_scores_top_ranked.items():
    table_top_ranked.add_row([section, f"{score:.6f}"])

# Add average similarity score for top-ranked sentences of LSA to the table
table_top_ranked.add_row(["Average Similarity (LSA)", f"{average_similarity_top_ranked:.6f}"])

# Calculate average similarity score for BART summaries
average_similarity_bart = sum(similarity_scores.values()) / len(similarity_scores)

# Table for similarity score of sections with BART summaries
table_combined = PrettyTable()
table_combined.field_names = ["Section", "Similarity Score (BART)"]

# Add similarity scores from BART summaries to the table
for section, score in similarity_scores.items():
    table_combined.add_row([section, f"{score:.6f}"])

# Add the average similarity score for BART summaries to the table
table_combined.add_row(["Average Similarity (BART)", f"{average_similarity_bart:.6f}"])

# Print the tables
print(table_top_ranked)
print("\n")  # Separate the tables with a newline
print(table_combined)

# Write tables to CSV file
with open('lsa_scores.csv', 'w') as file:
    file.write(str(table_top_ranked))
    file.write("\n\n")  # Add a newline between the tables
    file.write(str(table_combined))


+----------------------------------+------------------+
|             Section              | Similarity Score |
+----------------------------------+------------------+
|             ABSTRACT             |     0.076472     |
|          1 INTRODUCTION          |     0.000000     |
|           2 LITERATURE           |     0.144338     |
| 3 THE URBAN ENVIRONMENTS DATASET |     0.000000     |
|       4 EXPERIMENTAL SETUP       |     0.000000     |
|     5 RESULTS AND DISCUSSION     |     0.000000     |
|          6 CONCLUSIONS           |     0.000000     |
|     Average Similarity (LSA)     |     0.031544     |
+----------------------------------+------------------+


+----------------------------------+-------------------------+
|             Section              | Similarity Score (BART) |
+----------------------------------+-------------------------+
|             ABSTRACT             |         0.059549        |
|          1 INTRODUCTION          |         0.000000        |
|          

### Converting the similarity output into a Python package, Convert and show into CSV

In [103]:
pip install openpyxl




In [116]:
import openpyxl
from openpyxl import Workbook
import os

def save_scores_to_excel(csv_file_path, similarity_scores_top_ranked_lsa, similarity_scores_bart):
    try:
        # Extract base name of the input file
        file_name = os.path.basename(pdf_path)
        file_name_without_extension = os.path.splitext(file_name)[0]
        new_sheet_name = f"{file_name_without_extension}_output"

        # Load existing Excel file or create a new one
        try:
            wb = openpyxl.load_workbook(csv_file_path)
        except FileNotFoundError:
            wb = Workbook()
            # Remove the default 'Sheet' if it exists
            default_sheet = wb['Sheet']
            wb.remove(default_sheet)
        
        # Create a new sheet for the current data
        ws = wb.create_sheet(title = new_sheet_name)
        
        # Write the LSA similarity scores to the worksheet
        ws.append(['Table for LSA'])
        ws.append(['Section', 'Similarity Score(LSA)'])
        ws.append(['Title',f"{similarity_score_tfidf:.6f}"])
        for section, score in similarity_scores_top_ranked.items():
            ws.append([section, f"{score:.6f}"])
        ws.append(["Average Similarity (LSA)", f"{similarity_scores_top_ranked_lsa:.6f}"])
        ws.append([])
        
        # Write the BART similarity scores to the worksheet
        ws.append(['Table for BART summaries'])
        ws.append(['Section', 'Similarity Score (BART)'])
    
        for section, score in similarity_scores.items():
            ws.append([section, f"{score:.6f}"])
        ws.append(["Average Similarity (BART)", f"{similarity_scores_bart:.6f}"])
        
        # Save the Excel file
        wb.save(csv_file_path)
        
        print(f"Data saved successfully to a new sheet '{new_sheet_name}' in the Excel file.")
        
    except PermissionError:
        print(f"Permission denied: Unable to save Excel file '{csv_file_path}'. Please check your permissions.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

csv_file_path = 'output.xlsx'
save_scores_to_excel(csv_file_path, average_similarity_top_ranked, average_similarity_bart)

Data saved successfully to a new sheet 'satellite_imagery_output' in the Excel file.
