# pdf_reader.py

In [1]:
# Importing libraries
import os
import re
import PyPDF2
from dotenv import load_dotenv, find_dotenv

In [2]:
# Load the variables defined in the .env file located by find_dotenv()
load_dotenv(find_dotenv())

# Path of the PDF to read, taken from the `pdf_path` environment variable
# (os.environ[...] raises KeyError if the variable is not set)
path = os.environ['pdf_path']
path

'D:/SD/Projects/ILAB-Doc-Chatbot/data/ReAL_Plan.pdf'

In [13]:
# Reading pdf

# Extracting Table of Contents
def extract_ToC(pdf_pth, page):
    """Return the extracted text of one PDF page as a list of lines.

    Args:
        pdf_pth: Path of the PDF file; it is opened in binary mode.
        page: Index of the page to read, used directly as
            ``pdf_reader.pages[page]``.

    Returns:
        list: The page text split with ``str.splitlines()``, in order.
        Nothing is filtered here, so headings, page numbers and footer
        lines found on the page are returned as well.
    """
    with open(pdf_pth, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Extract inside the ``with`` block so the file handle is still
        # open while the page content is read.
        text = pdf_reader.pages[page].extract_text()

    # splitlines() already builds a fresh list; no element-wise copy needed.
    return text.splitlines()
    
pdf_pth = path
# 5 is the index of the contents page in this particular PDF; it is passed
# straight through to the reader's page list. The returned lines still
# include non-entry text (the 'Contents' heading, a bare page number and the
# footer), as the output below shows.
toc = extract_ToC(pdf_pth, 5)
toc

['Contents',
 'FOREWORD  1',
 'ABBREVIATIONS 2',
 'Recovery and Accelerated Learning (ReAL)  5',
 'in Schools in Nepal 5',
 'Nepal’s Context 5',
 'Challenges 8',
 'Opportunities 9',
 'The Road Map for Recovery and Accelerated Learning 11',
 'Background 11',
 'Goal 12',
 'Objectives 12',
 'Strategies 12',
 '1. Comprehensive assessment of student’s learning level and system’s capacity.  12',
 '2. Redefining measurable “learning skills” and the pedagogy 13',
 '3. Strategy for Learning Recovery 16',
 '4. Strategies for Accelerated Learning 19',
 '5. Implementation Strategies 19',
 '6. Implementation Arrangements 25',
 'Estimated Cost of ReAL Plan for First Phase of Activities 27',
 'Tentative Schedule – Plan of activities 29',
 '4',
 'Recovery and Accelerated Learning (ReAL) Plan (2023-2028) | ']

In [14]:
# Removing unnecessary contents: drop the last two extracted lines, which for
# this page are the bare page number ('4') and the running footer text.
# NOTE(review): the slice is specific to this page's layout.
toc = toc[:-2]
toc

['Contents',
 'FOREWORD  1',
 'ABBREVIATIONS 2',
 'Recovery and Accelerated Learning (ReAL)  5',
 'in Schools in Nepal 5',
 'Nepal’s Context 5',
 'Challenges 8',
 'Opportunities 9',
 'The Road Map for Recovery and Accelerated Learning 11',
 'Background 11',
 'Goal 12',
 'Objectives 12',
 'Strategies 12',
 '1. Comprehensive assessment of student’s learning level and system’s capacity.  12',
 '2. Redefining measurable “learning skills” and the pedagogy 13',
 '3. Strategy for Learning Recovery 16',
 '4. Strategies for Accelerated Learning 19',
 '5. Implementation Strategies 19',
 '6. Implementation Arrangements 25',
 'Estimated Cost of ReAL Plan for First Phase of Activities 27',
 'Tentative Schedule – Plan of activities 29']

In [15]:
import re

# Parsing table of contents

# A ToC line is "<topic><separator><page number>". The separator is either
# whitespace or a run of dot leaders, so "Intro ..... 3" and "Intro.....3"
# are both recognised.
_TOC_LINE_RE = re.compile(r'^(.*?)(?:\s+|\.{2,}\s*)(\d+)\s*$')
# Two or more consecutive dots are treated as leaders and removed from topics.
_DOT_LEADER_RE = re.compile(r'\.{2,}')


def parse_toc(toc_ls):
    """Convert raw table-of-contents lines into ``(topic, page)`` tuples.

    Args:
        toc_ls: Iterable of text lines, e.g. the list returned by
            ``extract_ToC``.

    Returns:
        list[tuple[str, int]]: One tuple per line that ends in a page
        number, in input order. The topic has dot leaders removed and
        surrounding whitespace stripped; the page is converted to ``int``.

    Lines that do not end in a page number (such as a ``'Contents'``
    heading) are skipped, and so are lines whose topic is empty after
    clean-up (an indented bare page number, or leaders followed by a
    number), because they carry no heading text.
    """
    toc_entries = []

    for entry in toc_ls:
        match = _TOC_LINE_RE.match(entry)
        if not match:
            continue

        topic = _DOT_LEADER_RE.sub('', match.group(1)).strip()
        if not topic:
            # Avoid emitting ('', page) for lines that contain no title.
            continue

        toc_entries.append((topic, int(match.group(2))))

    return toc_entries

# Convert the raw ToC lines into (topic, page) tuples; lines that do not end
# in a page number (e.g. the 'Contents' heading) are left out.
toc_entries = parse_toc(toc)
toc_entries

[('FOREWORD', 1),
 ('ABBREVIATIONS', 2),
 ('Recovery and Accelerated Learning (ReAL)', 5),
 ('in Schools in Nepal', 5),
 ('Nepal’s Context', 5),
 ('Challenges', 8),
 ('Opportunities', 9),
 ('The Road Map for Recovery and Accelerated Learning', 11),
 ('Background', 11),
 ('Goal', 12),
 ('Objectives', 12),
 ('Strategies', 12),
 ('1. Comprehensive assessment of student’s learning level and system’s capacity.',
  12),
 ('2. Redefining measurable “learning skills” and the pedagogy', 13),
 ('3. Strategy for Learning Recovery', 16),
 ('4. Strategies for Accelerated Learning', 19),
 ('5. Implementation Strategies', 19),
 ('6. Implementation Arrangements', 25),
 ('Estimated Cost of ReAL Plan for First Phase of Activities', 27),
 ('Tentative Schedule – Plan of activities', 29)]

# text_preprocessing.py

In [1]:
from pdf_reader import toc_entries

# Display the (topic, page) ToC entries imported from pdf_reader.
# NOTE(review): this cell only shows the entries; extracting technical
# requirements from the PDF is presumably the next step — confirm.
toc_entries

[('FOREWORD', 1),
 ('ABBREVIATIONS', 2),
 ('Recovery and Accelerated Learning (ReAL)', 5),
 ('in Schools in Nepal', 5),
 ('Nepal’s Context', 5),
 ('Challenges', 8),
 ('Opportunities', 9),
 ('The Road Map for Recovery and Accelerated Learning', 11),
 ('Background', 11),
 ('Goal', 12),
 ('Objectives', 12),
 ('Strategies', 12),
 ('1. Comprehensive assessment of student’s learning level and system’s capacity.',
  12),
 ('2. Redefining measurable “learning skills” and the pedagogy', 13),
 ('3. Strategy for Learning Recovery', 16),
 ('4. Strategies for Accelerated Learning', 19),
 ('5. Implementation Strategies', 19),
 ('6. Implementation Arrangements', 25),
 ('Estimated Cost of ReAL Plan for First Phase of Activities', 27),
 ('Tentative Schedule – Plan of activities', 29)]