In [None]:
import pypdf

In [None]:
def clean_texts(pdf_texts: list[str], substrings_to_remove: list[str] = [], delete_list: list[int] = []) -> list[str]:
    """
    Strip unwanted substrings from every text, then drop texts by position.

    Parameters:
    - pdf_texts (list of str): Texts to clean (the callers pass one string per PDF page).
    - substrings_to_remove (list of str): Substrings deleted from each text.
      They are applied in list order, so an earlier removal can change what a
      later substring matches.
    - delete_list (list of int): Zero-based positions in pdf_texts whose texts
      are left out of the result. Positions that are negative or past the end
      are ignored, and a position listed more than once is dropped only once.

    Returns:
    - list of str: A new list of cleaned texts; the arguments are not modified.
    """
    cleaned_texts = []
    for text in pdf_texts:
        for substring in substrings_to_remove:
            text = text.replace(substring, "")
        cleaned_texts.append(text)

    # Filter by membership instead of deleting in place: deleting index i twice
    # would also remove the element that slid into slot i after the first
    # delete. A set makes repeated positions harmless, and because enumerate()
    # only yields 0..len-1, negative or out-of-range positions never match.
    positions_to_drop = set(delete_list)
    return [
        text
        for position, text in enumerate(cleaned_texts)
        if position not in positions_to_drop
    ]

In [None]:
# Open data/SocialScience1_Eng_1.pdf and collect the stripped text of each page,
# keeping only pages whose text is non-empty after stripping.
reader = pypdf.PdfReader("data/SocialScience1_Eng_1.pdf")
pdf_texts = [
    page_text
    for page_text in (page.extract_text().strip() for page in reader.pages)
    if page_text
]
substrings_to_remove = [
    "Social Science IX\n",
    "Social Science I",
    "1234567890",
    "\x01",
    "..",
    "Standard\n",
    "□",
    "1212121 2123456789\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 
12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 
23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1212121 2123456789\n1212121 2123456789\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 
23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1 23456789012121212 12345678 9\n1212121 2123456789\n"
]

# Remove the listed fragments from every page, then leave out the entries at
# positions 0-3 and 5 of the filtered page list.
# NOTE(review): presumably those positions are front matter — confirm against the PDF.
ss1 = clean_texts(
    pdf_texts,
    substrings_to_remove,
    delete_list=[0, 1, 2, 3, 5],
)
ss1

In [None]:
# Open data/SocialScience1_Eng_2.pdf and collect the stripped text of each page,
# keeping only pages whose text is non-empty after stripping.
reader = pypdf.PdfReader("data/SocialScience1_Eng_2.pdf")
pdf_texts = [
    page_text
    for page_text in (page.extract_text().strip() for page in reader.pages)
    if page_text
]
substrings_to_remove = [
    "Social Science IX\nStandard\n",
    "Social Science I X\nStandard\n",
    "\uf034",
    "\x01",
    "1234567890",
    "23456789012121212",
    "12345678",
    "9\n1",
    "□",
    "\n1 9",
    "1212121 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n12121212\n12121212 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n12121212\n",
    "1212121 2                                                                                                                                                                                                                                                                                        21212122121212                                                                                                                                                                                                        21212129\n"]
l = len(pdf_texts)  # number of non-empty pages; used for end-relative positions
# Leave out entries 0-3 and 5 from the front, plus l-2 down to l-6 from the back.
# NOTE(review): l-1 is not listed, so the final entry is kept — confirm this is intended.
ss2 = clean_texts(
    pdf_texts,
    substrings_to_remove,
    delete_list=[0, 1, 2, 3, 5] + [l - offset for offset in range(2, 7)],
)
ss2

In [None]:
# Extract the per-page text of data/SocialScience2_Eng_1.pdf, clean it and show it.
# The path uses a forward slash like the other cells: a backslash before "S"
# is an unrecognised string escape and the resulting path only resolves on Windows.
reader = pypdf.PdfReader("data/SocialScience2_Eng_1.pdf")
pdf_texts = [p.extract_text().strip() for p in reader.pages]
pdf_texts = [text for text in pdf_texts if text]  # Filter out empty text
# Literal fragments removed from every page, applied in this order.
substrings_to_remove = [
    "Standard X\n",
    "\nSocial Science II\n",
    "\uf034",
    "\x01",
    ".."
]
l = len(pdf_texts)  # number of non-empty pages; used for end-relative positions
# Leave out entries 0-3 and 5 from the front and the last five entries.
# NOTE(review): ss2 is also assigned by the other textbook cells in this
# notebook, so running this cell replaces that value — confirm whether a
# distinct name is wanted.
ss2 = clean_texts(pdf_texts, substrings_to_remove, delete_list=[0,1,2,3,5, l-1, l-2, l-3, l-4, l-5])
ss2

In [None]:
# Extract the per-page text of data/SS2_English_2.pdf, clean it and show it.
# The path uses a forward slash like the other cells: a backslash before "S"
# is an unrecognised string escape and the resulting path only resolves on Windows.
reader = pypdf.PdfReader("data/SS2_English_2.pdf")
pdf_texts = [p.extract_text().strip() for p in reader.pages]
pdf_texts = [text for text in pdf_texts if text]  # Filter out empty text
# Literal fragments removed from every page, applied in this order.
substrings_to_remove = [
    "\nStandard - X\n",
    "Social science - II\n",
    "\nSocial Science II\n",
    "St andard X\n",
    "\uf034",
    "\x01",
    ".."
]
l = len(pdf_texts)  # number of non-empty pages; used for end-relative positions
# Leave out entries 0-3 and 5 from the front and the last seven entries.
# NOTE(review): ss2 is also assigned by the other textbook cells in this
# notebook, so running this cell replaces that value — confirm whether a
# distinct name is wanted.
ss2 = clean_texts(pdf_texts, substrings_to_remove, delete_list=[0,1,2,3,5, l-1, l-2, l-3, l-4, l-5, l-6, l-7])
ss2

In [None]:
# Extract per-page text from Part 1 of the 10th Standard English textbook PDF,
# keeping only pages whose text is non-empty after stripping.
reader = pypdf.PdfReader("data/SCERT Kerala State Syllabus 10th Standard English Textbooks Part 1.pdf")
pdf_texts = [
    page_text
    for page_text in (page.extract_text().strip() for page in reader.pages)
    if page_text
]
# Literal fragments removed from every page, applied in this order.
substrings_to_remove = ["Std X\n", "\nEnglish\n", "__", ".."]
l = len(pdf_texts)  # page count after filtering; not used by the call below
# Clean every page and leave out the first four entries.
# NOTE(review): ss2 is also assigned by the other textbook cells in this
# notebook, so running this cell replaces that value — confirm intended.
ss2 = clean_texts(pdf_texts, substrings_to_remove, delete_list=list(range(4)))
ss2

In [None]:
# Extract per-page text from Part 2 of the 10th Standard English textbook PDF,
# keeping only pages whose text is non-empty after stripping.
reader = pypdf.PdfReader("data/SCERT Kerala State Syllabus 10th Standard English Textbooks Part 2.pdf")
pdf_texts = [
    page_text
    for page_text in (page.extract_text().strip() for page in reader.pages)
    if page_text
]
# Literal fragments removed from every page, applied in this order.
substrings_to_remove = ["Std X\n", "English\n", "__", ".."]
l = len(pdf_texts)  # page count after filtering; not used by the call below
# Clean every page and leave out the first four entries.
# NOTE(review): ss2 is also assigned by the other textbook cells in this
# notebook, so running this cell replaces that value — confirm intended.
ss2 = clean_texts(pdf_texts, substrings_to_remove, delete_list=list(range(4)))
ss2

In [64]:
# Show ss2 again; it holds whatever the most recently executed cleaning cell assigned.
ss2

['CONTENTS\nUnit IV Flights of  Fancy 111 - 141\nThe Scholarship Jacket (Short Story) 113\n Poetry (Poem) 124\n The Never Never Nest (One-Act Play) 129\nUnit V Ray of Hope 142 - 176\nVanka (Short Story) 144\n Mother to Son (Poem) 157\nThe Castaway (Short Story) 162',
 "• Right to freedom of speech and\nexpression.\n• Right to life and liberty.\n• Right to maximum survival and\ndevelopment.\n• Right to be respected and accepted\nregardless of caste, creed and colour.\n• Right to protection and care against\nphysical, mental and sexual abuse.\n• Right to participation.\n• Protection from child labour and\nhazardous work.\n• Protection against child marriage.\n• Right to know one’s culture and live\naccordingly.\n• Protection against neglect.\n• Right to free and compulsory\neducation.\n• Right to learn, rest and leisure.\n• Right to parental and societal care,\nand protection.\nMajor Responsibilities\n• Protect school and public facilities.\n• Observe punctuality in learning\nand activit

In [65]:
"\n\n".join(ss2)

