# Document Loaders

In [8]:
import os
def prepare_filesdir_path():
    """Return the relative path of the directory that holds the sample files."""
    path_parts = ("resources", "no-code-files")
    return os.path.join(*path_parts)

def prepare_resource_path(filename):
    """Return the path of ``filename`` inside the sample-files directory."""
    return os.path.join(prepare_filesdir_path(), filename)

def prepare_pdf_path(filename):
    """Return the path of ``filename`` inside the ``pdfs`` sub-folder of the sample-files directory."""
    pdf_dir = os.path.join(prepare_filesdir_path(), "pdfs")
    return os.path.join(pdf_dir, filename)

def format_mupdf_str_date(date_str):
    """Convert a PDF date string into ``YYYY-MM-DD HH-MM-SS``.

    PDF metadata dates (e.g. the ``creationDate``/``modDate`` values shown in
    the PyMuPDF loader metadata) look like ``D:20141117092635-06'00'``: a
    ``D:`` prefix, a 14-digit local timestamp, and an optional UTC offset
    introduced by ``+``, ``-`` or ``Z``. The offset is discarded and the local
    timestamp is reformatted unchanged.

    Args:
        date_str: PDF date string, e.g. ``"D:20240825093443+05'00'"``.

    Returns:
        The timestamp formatted with ``"%Y-%m-%d %H-%M-%S"``.

    Raises:
        ValueError: if the timestamp part is not exactly ``YYYYMMDDHHMMSS``
            (this includes the empty string that some PDFs report).
    """
    from datetime import datetime

    # Drop the leading "D:" marker only when it is actually there.
    timestamp = date_str[2:] if date_str.startswith("D:") else date_str

    timestamp = __remove_zone_info_if_available(timestamp)
    parsed = datetime.strptime(timestamp, "%Y%m%d%H%M%S")
    return parsed.strftime("%Y-%m-%d %H-%M-%S")

def __remove_zone_info_if_available(date_str):
    """Return ``date_str`` cut at the first UTC-offset marker.

    The offset may start with ``+``, ``-`` or ``Z``; checking only ``-`` would
    leave offsets such as ``+05'00'`` attached and make parsing fail.
    """
    for index, char in enumerate(date_str):
        if char in "+-Z":
            return date_str[:index]
    return date_str
        


### TextLoader

In [90]:
from langchain_community.document_loaders import TextLoader

In [14]:
# Load a Markdown file from the working directory and print the list of
# Documents that load() returns.
loader = TextLoader("loader-functionality-to-focus-on.md")
document = loader.load()
print(document)

[Document(metadata={'source': 'loader-functionality-to-focus-on.md'}, page_content='# For each document loader in LangChain, you should focus on the following aspects:\n\n### Supported File Types:\nIdentify what file formats the loader can handle (e.g., PDFs, Word docs, CSVs).\n\n### Configuration Options:\nLearn about the customizable parameters or settings available for each loader (e.g., handling large files, extracting metadata, pagination).\n\n### Parsing and Extraction:\nUnderstand how the loader extracts text or data from the document and how it handles complex structures like tables, images, or embedded files.\n\n### Efficiency and Performance:\nExplore how the loader manages memory, speed of loading, and processing large documents or datasets.\n\n### Integration with Other Tools:\nCheck how the loader integrates with other tools or services, such as cloud storage, databases, or web APIs.\n\n### Error Handling:\nInvestigate how the loader deals with corrupted files, missing dat

In [23]:
# Count the blank-line-separated sections in the loaded text.
len(document[0].page_content.split('\n\n'))

10

In [28]:
# Try a plain .txt file with the same loader.
loader = TextLoader('requirements.txt')
loader.load()

[Document(metadata={'source': 'requirements.txt'}, page_content='ipykernel')]

In [29]:
# Try a file that has no extension at all.
loader = TextLoader('file')
loader.load()

[Document(metadata={'source': 'file'}, page_content='how are you. This is a file with no extension.')]

In [31]:
# Load a quote file from the sample-files directory via the path helper.
loader = TextLoader(prepare_resource_path('11-quote.txt'))
loader.load()

[Document(metadata={'source': 'resources\\no-code-files\\11-quote.txt'}, page_content='Never Say Tomorrow.\nDo it today.')]

In [33]:
# Iterate over the Documents produced by lazy_load() and print each one.
# NOTE(review): the captured output shows a single Document holding the whole
# file, so `chunk` is presumably not a partial piece of the text — confirm.
loader = TextLoader(prepare_resource_path('09-50-quotes.txt'))
docs = loader.lazy_load()

for chunk in docs:
    print(chunk)

page_content='1. "If you want to achieve greatness stop asking for permission." --Anonymous 2. "Things work out best for those who make the best of how things work out." --John Wooden 3. "To live a creative life, we must lose our fear of being wrong." --Anonymous 4. "If you are not willing to risk the usual you will have to settle for the ordinary." --Jim Rohn 5. "Trust because you are willing to accept the risk, not because it's safe or certain." --Anonymous 6. "Take up one idea. Make that one idea your life--think of it, dream of it, live on that idea. Let the 
brain, muscles, nerves, every part of your body, be full of that idea, and just leave every other idea 
alone. This is the way to success." --Swami Vivekananda 7. "All our dreams can come true if we have the courage to pursue them." --Walt Disney 8. "Good things come to people who wait, but better things come to those who go out and get them." --
Anonymous 9. "If you do what you always did, you will get what you always got." -

- This loader can only load plain-text files, e.g. a .txt, .md, or no-extension file containing text.
- It can't load pdf, docx, csv, or png files.
- In the metadata, it only gives the source (the file path).

### CSV Loader

#### CSVLoader

In [107]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# source_column='author' makes each row's 'author' value the `source` entry of
# that row's Document metadata (see the metadata output further down).
loader = CSVLoader(prepare_resource_path("07-quote-dateset.csv"), source_column='author')
docs = loader.load()

In [92]:
# Each CSV row is treated as a separate Document, so this is the number of rows loaded.
len(docs)

75966

In [103]:
docs[0].page_content

"quote: Age is an issue of mind over matter. If you don't mind, it doesn't matter.\nauthor: Mark Twain"

In [99]:
type(docs[0].page_content)

str

In [105]:
# Locate the column name 'quote' in the text; index 0 means page_content starts with it.
docs[0].page_content.find('quote')

0

In [102]:
docs[0].metadata

{'source': 'Mark Twain', 'row': 0}

- This loader loads each row as a separate document.
- If `source_column` is given, the source of each document (row) is set to that row's value in the specified column.

#### UnstructuredCSVLoader

In [108]:
from langchain_community.document_loaders.csv_loader import UnstructuredCSVLoader

# not studied right now.

### PDF Loader

#### PyPDFLoader

In [119]:
from langchain_community.document_loaders.pdf import PyPDFLoader

# NOTE(review): this cell reads the PDF from the sample-files root, while the
# PyMuPDF section reads the same filename through prepare_pdf_path (the pdfs/
# sub-folder) — confirm where the file lives before re-running.
loader = PyPDFLoader(prepare_resource_path("01-motivational-quotes.pdf"))
docs = loader.load()

In [125]:
docs[0].metadata

{'source': 'resources\\no-code-files\\01-motivational-quotes.pdf', 'page': 0}

In [124]:
docs[0].page_content

'100 Motivational Quotes That Will Inspire You to Succeed  \nEveryone needs some inspiration, and these motivational quotes will give you the edge you \nneed to create your success. So read on and let them inspire you . \nBy Lolly Daskal  \nAs leaders,  managers, and bosses, we must realize that everything we think actually matters. If we are \nseeking success, we must think successful, inspiring, and motivating thoughts.  \nRead on to find the words of wisdom that will motivate you in building your business, leading your life , \ncreating success,  achieving your goals, and overcoming your fears.   Here are quotes —100 of them —that \nwill inspire you r success.   \n1. "If you want to achieve greatness  stop asking for permission." --Anonymous  \n2. "Things work out best for those who make the best of how things work out." --John Wooden  \n3. "To live a creative life, we must lose  our fear of being wrong." --Anonymous  \n4. "If you are not willing to risk the usual you will have to s

In [130]:
# Try a PDF that mixes images and multi-column text; extract_images=True asks
# the loader to also extract text from the embedded images.
# NOTE(review): path is built with prepare_resource_path here but with
# prepare_pdf_path for the same file in the PyMuPDF section — confirm location.
loader = PyPDFLoader(prepare_resource_path("02-quotes-with-mix-format.pdf"), extract_images=True)
docs = loader.load()

In [131]:
docs[0].page_content

'Slow and s teady wins the race.  \n \nNever give up.  \nIts about the decis ive moment.  \nPerfectionis\nnotattainable,\nbutifwechase\nperfection we can\ncatch excellence.\nVINCELOMBARDI\nBRIANTRACY'

In [132]:
docs[1].page_content

' \n \nPrepare for the real life.  \n \nTable -quote -1 Table -quote -2 \nTable -quote -3 Table -quote -4 \n \nIfyoucamchange\nyourmind,you\ncamchange\nyour lhios,\n=- %9[LL1] 3LAAB8Iifeiswvhat\nhappenstous\nwhileweare\ninalkingother\nplaus."It\'sthe possibility\nofhavinga dream\ncometruethat\nmakeslife\ninteresting.\nPAULO COEHLO\nTOBAT'

- This loader treats each page as a separate document.
- It also gives the page number in the metadata.
- It sometimes adds extra spaces in the text, e.g. between the characters of a word.
- Uses RapidOCR-Runtime (a DL-based library) to extract text from images when extract_images=True.
- It does extract text from images and handles tables. Extraction is fairly good but not perfect. Table handling is not perfect either.

#### PDFMinerLoader

In [14]:
from langchain_community.document_loaders import PDFMinerLoader

# Load the contract/drawing PDF with PDFMinerLoader; the result is kept in `pages`.
loader = PDFMinerLoader(prepare_pdf_path("07-contract.pdf"))
pages = loader.load()

In [15]:
pages[0].page_content

'1\n\n2\n\n3\n\n4\n\n5\n\n6\n\n7\n\n8\n\n9\n\nEQUIPMENT TAGGING LEGEND\n\nFURNACE HPU\n\nRE-BCO201-MCC01\n\nCHECK VALVE\n\nFILTER\n\nCOMPONENT NUMBER\n(NUMERIC [2 DIGIT])\n\nCOMPONENT DESCRIPTION FOR SUB EQUIPMENT\n\nEQUIPMENT NUMBER DESIGNATION\n(NUMERIC [3 OR 4 DIGIT])\n\nEQUIPMENT NUMBER DESIGNATION\n\nPROCESS AREA\n\nFIXED DISPLACEMENT PUMP\n\nVARIABLE DISPLACEMENT,\nPRESSURE COMPENSATED PUMP\n\nBREATHER / FILTER\n\nHEAT EXCHANGER (COOLER)\n\nIMMERSION HEATER\n\nCYLINDER\n\nPRESSURE RELIEF VALVE\n\nDIFFERENTIAL AREA RELIEF VALVE\n\nFLOW CONTROL VALVE\n\n3-WAY BYPASS VALVE\n\nDIRECTIONAL VALVE, 4-WAY, 3-POSITION\nFLOAT CENTER\n\nDIRECTIONAL VALVE, 4-WAY, 3-POSITION\nTANDEM CENTER, OPEN CROSS-OVER PORTING\n\nDIRECTIONAL AIR VALVE, 4-WAY, SINGLE SOLENOID,\n2-POSITION, AIR RETURN, SPRING ASSISTED\n\nPILOT OPERATED\nDUAL CHECK VALVES\n\nA\n\nB\n\nC\n\nD\n\nE\n\nF\n\n         - PRELIMINARY -\nNOT FOR CONSTRUCTION\n\nA\n\nREV\n\nISSUED FOR DESIGN\n\nN. IMEL\nDESIGN BY\n\nT. HULL\nCHECKED 

In [18]:
# Show the text from the first occurrence of "ISSUED" to the end.
# find() returns -1 when the marker is absent; the slice would then yield only
# the last character.
pages[0].page_content[pages[0].page_content.find("ISSUED"):]

'ISSUED FOR DESIGN\n\nN. IMEL\nDESIGN BY\n\nT. HULL\nCHECKED BY\n\n07-05-23\nDATE\n\nTHIS DRAWING IS PROPERTY OF ENVIVA, INC. AND IS NOT TO BE\nREPRODUCED, COPIED OR USED FOR ANY PURPOSE OTHER\nTHAN CONSTRUCTION OF THIS PROJECT WITHOUT WRITTEN\nCONSENT OF ENVIVA, INC.\n\nENVIVA\n\nEPES WOOD PELLET FACILITY\n\nI\n\nWE IT\n\nK\n\nS\nI\nNC\n\nE\n\n4\n\n8 8\n\n1\n\n®\n\nCOMMON PLANT SYSTEMS\nP&ID LEGEND SHEET 7 OF 7\nP&ID\n\nENGINEER/DESIGN\nORIGINATOR\n\nT. HULL\n\nLEAD ENG\n\nENG MGR\n\nPROJ MGR\n\nC. HERETH\n\nE. SKIBBE\n\nR. MCNIFF\n\nDRAWING NUMBER\n\n00-01-D-009\n\n\x0c'

- Extracts even minute details (from images), e.g. the quote, author, and publisher from a blurry image, preserving details such as newlines.
- Handles tables well, i.e. both textual and image tables. It just doesn't provide any key or marker to indicate a table; cells are only separated by \n etc.
- Handles multiple columns one by one.
- Beautifully extracts details from large files.
- Awesomely extracts details from contracts (tried on a drawing).

- Can't handle a pdf of images.
- Can't handle Urdu.


#### PyMuPDF Loader

##### Test 1: Simple text pdf

In [3]:
from langchain_community.document_loaders import PyMuPDFLoader

# Load the plain-text quotes PDF with PyMuPDFLoader and display the Documents;
# the output below shows per-page Documents with detailed file metadata.
loader = PyMuPDFLoader(prepare_pdf_path("01-motivational-quotes.pdf"))
docs = loader.load()
docs

[Document(metadata={'source': 'resources\\no-code-files\\pdfs\\01-motivational-quotes.pdf', 'file_path': 'resources\\no-code-files\\pdfs\\01-motivational-quotes.pdf', 'page': 0, 'total_pages': 6, 'format': 'PDF 1.5', 'title': '', 'author': 'Larry', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word 2010', 'producer': 'Microsoft® Word 2010', 'creationDate': "D:20141117092635-06'00'", 'modDate': "D:20141117092635-06'00'", 'trapped': ''}, page_content='100 Motivational Quotes That Will Inspire You to Succeed \nEveryone needs some inspiration, and these motivational quotes will give you the edge you \nneed to create your success. So read on and let them inspire you. \nBy Lolly Daskal \nAs leaders, managers, and bosses, we must realize that everything we think actually matters. If we are \nseeking success, we must think successful, inspiring, and motivating thoughts. \nRead on to find the words of wisdom that will motivate you in building your business, leading your life, \ncreating

In [4]:
docs[1].metadata

{'source': 'resources\\no-code-files\\pdfs\\01-motivational-quotes.pdf',
 'file_path': 'resources\\no-code-files\\pdfs\\01-motivational-quotes.pdf',
 'page': 1,
 'total_pages': 6,
 'format': 'PDF 1.5',
 'title': '',
 'author': 'Larry',
 'subject': '',
 'keywords': '',
 'creator': 'Microsoft® Word 2010',
 'producer': 'Microsoft® Word 2010',
 'creationDate': "D:20141117092635-06'00'",
 'modDate': "D:20141117092635-06'00'",
 'trapped': ''}

In [9]:
# creationDate is a PDF date string (e.g. "D:20141117092635-06'00'"); convert it to a readable form.
format_mupdf_str_date(docs[1].metadata['creationDate'])

'2014-11-17 09-26-35'

In [10]:
docs[1].page_content

'17. "I have not failed. I\'ve just found 10,000 ways that won\'t work." --Thomas A. Edison \n18. "If you don\'t value your time, neither will others. Stop giving away your time and talents--start \ncharging for it." --Kim Garst \n19. "A successful man is one who can lay a firm foundation with the bricks others have thrown at him." -\n-David Brinkley \n20. "No one can make you feel inferior without your consent." --Eleanor Roosevelt \n21. "The whole secret of a successful life is to find out what is one\'s destiny to do, and then do it." --\nHenry Ford \n22. "If you\'re going through hell keep going." --Winston Churchill \n23. "The ones who are crazy enough to think they can change the world, are the ones who do." --\nAnonymous \n24. "Don\'t raise your voice, improve your argument." --Anonymous \n25. "What seems to us as bitter trials are often blessings in disguise." --Oscar Wilde \n26. "The meaning of life is to find your gift. The purpose of life is to give it away." --Anonymous \n2

##### Test 2: pdf having table with images

In [13]:
# Try the mixed-format PDF (text, table, images) with image text extraction enabled.
loader = PyMuPDFLoader(prepare_pdf_path("02-quotes-with-mix-format.pdf"), extract_images=True)
docs = loader.load()
docs

[Document(metadata={'source': 'resources\\no-code-files\\pdfs\\02-quotes-with-mix-format.pdf', 'file_path': 'resources\\no-code-files\\pdfs\\02-quotes-with-mix-format.pdf', 'page': 0, 'total_pages': 2, 'format': 'PDF 1.7', 'title': '', 'author': 'Shaukat ali khan', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word LTSC', 'producer': 'Microsoft® Word LTSC', 'creationDate': "D:20240825093443+05'00'", 'modDate': "D:20240825093443+05'00'", 'trapped': ''}, page_content='Slow and steady wins the race. \n \nNever give up. \nIts about the decisive moment. \nPerfection is\nnotattainable,\nbutifwechase\nperfection we can\ncatch\n excellence.\nVINCE LOMBARDI\nBRIANTRACY'),
 Document(metadata={'source': 'resources\\no-code-files\\pdfs\\02-quotes-with-mix-format.pdf', 'file_path': 'resources\\no-code-files\\pdfs\\02-quotes-with-mix-format.pdf', 'page': 1, 'total_pages': 2, 'format': 'PDF 1.7', 'title': '', 'author': 'Shaukat ali khan', 'subject': '', 'keywords': '', 'creator': 'Microsoft® 

##### Test 3: pdf having table

In [14]:
# Petition PDF; its metadata reports 'CamScanner' as the author.
loader = PyMuPDFLoader(prepare_pdf_path("03-petition-images.pdf"), extract_images=True)
docs = loader.load()
docs

[Document(metadata={'source': 'resources\\no-code-files\\pdfs\\03-petition-images.pdf', 'file_path': 'resources\\no-code-files\\pdfs\\03-petition-images.pdf', 'page': 0, 'total_pages': 4, 'format': 'PDF 1.7', 'title': 'Hearing before learned Member, Federal Land Commission on 27-08-024', 'author': 'CamScanner', 'subject': 'Hearing before learned Member, Federal Land Commission on 27-08-024', 'keywords': '', 'creator': '', 'producer': 'intsig.com pdf producer', 'creationDate': '', 'modDate': '', 'trapped': ''}, page_content='CamScanner\ncSU.M.S\nCOMMISSION,ISLAMABAD\nApp. No.564-564/M/FLC/2024\nIslamabad, the 5th August, 2024\nFrom:\nReader to Member\nFederal Land Commission\nIslamabad.\nTo\nThe Province of Punjab through Secretary (Colonies),\nLRS\nPunjab, Lahore.\nThe Senior Member/Chief Land Commissioner, Punjab,\nLahore.\nAudiucnal Ceniuty Commissione\nThe Additional Deputy Commissioner (Revenue)\n(Revenue)Altock\nDeputy Land Commissioner,\nAttock.\nTitle:\nAPP. NO.564/M/FLC/2024 - 

In [18]:
# Bank challan form; its metadata reports 'Crystal Reports' as the creator.
loader = PyMuPDFLoader(prepare_pdf_path("05-bank-challan.pdf"), extract_images=True)
docs = loader.load()
docs

[Document(metadata={'source': 'resources\\no-code-files\\pdfs\\05-bank-challan.pdf', 'file_path': 'resources\\no-code-files\\pdfs\\05-bank-challan.pdf', 'page': 0, 'total_pages': 2, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Crystal Reports', 'producer': 'Powered By Crystal', 'creationDate': '', 'modDate': '', 'trapped': ''}, page_content='To Be Filled in By the Remitter\nTo Be Filled in By the Remitter\nHead of \nAccount\nOrder of \nBank\nTo Be Filled in By \nthe Department \nOfficer or the \nTreasury\nAmount\nTerritory\nName or \ndesignation \nand address of \nthe Person on \nwhose behalf \nmoney is Paid\nProvisional/ Centre\nTreasury/Sub treasury\n(Cashier in service centre)\nTo Be Filled in By \nthe Department \nOfficer or the \nTreasury\nAmount\nOrder of \nBank\nHead of \nAccount\nTerritory\nName or \ndesignation \nand address \nof the Person \non whose \nbehalf money \nis Paid\nProvisional/ Centre\nTreasury/Sub treasury\n(Cashier in

In [19]:
# Land record (fard) PDF; the extracted text below contains Urdu script.
loader = PyMuPDFLoader(prepare_pdf_path("06-land-fard.pdf"), extract_images=True)
docs = loader.load()
docs

[Document(metadata={'source': 'resources\\no-code-files\\pdfs\\06-land-fard.pdf', 'file_path': 'resources\\no-code-files\\pdfs\\06-land-fard.pdf', 'page': 0, 'total_pages': 10, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Crystal Reports', 'producer': 'Powered By Crystal', 'creationDate': '', 'modDate': '', 'trapped': ''}, page_content='ورق ن\nسل\nض\nت\nطف/پ\nمل\nی کہ\nھٹی\nتھ6پی گھیبب\nی\nاٹ2019-20\n۱\n۲\n۳\n۳\n۳\n۳\n۳\n۴\n۵\n۶\n۷\n۸\n۹\n۱۰\n ن کھیبوٹٹ\n مل\n وٹن\nن کھیھ\nکشر\n   م ل\nکائ\nن مم کشر م احال\n  ھ\nن مم مل م ن مم وال/خون قم     اور سن نع حق مل\nراہ مت\n مک کت م مص\nح\n مص رق مب\nح\n ن6زرع نسمس ٹ\n/\n1\n10\n رق و ق زم ک وار اور\nوٹن کشر و کھیبوٹٹ \nمان کھیھ\nمل\nکب ن\nک م\nن مم ندار (اگ کئ ہ)- 1\n   شح مل زم- 2\n مل م تت\nمل و حب\n لن ج کشر\nادا کنھم ہ شح \nاور مل\n وسئ آبش\nم ن مم چہ راجہ \nوغہ\n)ب  (م مدی\nن رج حاران زم\nنارد\n ن خہ م ن مم\nاگ کئ ہ   \n)(ال)(ج)(د)(ہ)(و\nOMMKLNNMOBAGHMOLKOMMK\nKPKKKKKECIANGNJCKKPKK\nKMIKCFHJIE

In [21]:
# Contract/drawing PDF, the same file that the PDFMinerLoader section loads.
loader = PyMuPDFLoader(prepare_pdf_path("07-contract.pdf"), extract_images=True)
docs = loader.load()
docs

[Document(metadata={'source': 'resources\\no-code-files\\pdfs\\07-contract.pdf', 'file_path': 'resources\\no-code-files\\pdfs\\07-contract.pdf', 'page': 0, 'total_pages': 1, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'PyPDF2', 'creationDate': '', 'modDate': '', 'trapped': ''}, page_content='E\nN\nC\nW\nE\nI\nT\nK\nI\n4\n1\n8\n8\nI\nS\n®\nA\nB\nC\nD\nE\nF\n9\n8\n7\n6\n5\n4\n3\n2\n1\nENVIVA\nEPES WOOD PELLET FACILITY\nENGINEER/DESIGN\nORIGINATOR\nLEAD ENG\nENG MGR\nPROJ MGR\nDRAWING NUMBER\nT. HULL\n07-05-23\nN. IMEL\nISSUED FOR DESIGN\nA\nDESIGN BY\nCHECKED BY\nREV\nDATE\nTHIS DRAWING IS PROPERTY OF ENVIVA, INC. AND IS NOT TO BE\nREPRODUCED, COPIED OR USED FOR ANY PURPOSE OTHER\nTHAN CONSTRUCTION OF THIS PROJECT WITHOUT WRITTEN\nCONSENT OF ENVIVA, INC.\nE. SKIBBE\nC. HERETH\nT. HULL\nR. MCNIFF\nCOMMON PLANT SYSTEMS\nP&ID LEGEND SHEET 7 OF 7\nP&ID\n00-01-D-009\n         - PRELIMINARY -\nNOT FOR CONSTRUCTION\nPROCESS AREA\nCOM

In [None]:
# PDF with multiple font styles, to see whether any style information is returned.
loader = PyMuPDFLoader(prepare_pdf_path("08-font-styles.pdf"), extract_images=True)
docs = loader.load()
docs

[Document(metadata={'source': 'resources\\no-code-files\\pdfs\\08-font-styles.pdf', 'file_path': 'resources\\no-code-files\\pdfs\\08-font-styles.pdf', 'page': 0, 'total_pages': 1, 'format': 'PDF 1.7', 'title': '', 'author': 'Shaukat ali khan', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word LTSC', 'producer': 'Microsoft® Word LTSC', 'creationDate': "D:20240825164928+05'00'", 'modDate': "D:20240825164928+05'00'", 'trapped': ''}, page_content='This is a pdf with \nMultile font styles \nAnd formats \n')]

In [16]:
# Stamp PDF, loaded with image text extraction enabled.
loader = PyMuPDFLoader(prepare_pdf_path("09-stamp.pdf"), extract_images=True)
docs = loader.load()
docs

[Document(metadata={'source': 'resources\\no-code-files\\pdfs\\09-stamp.pdf', 'file_path': 'resources\\no-code-files\\pdfs\\09-stamp.pdf', 'page': 0, 'total_pages': 1, 'format': 'PDF 1.7', 'title': '', 'author': 'Shaukat ali khan', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word LTSC', 'producer': 'Microsoft® Word LTSC', 'creationDate': "D:20240825165003+05'00'", 'modDate': "D:20240825165003+05'00'", 'trapped': ''}, page_content=' \nMMR\nReceived\n11/10/2023\nMRM')]

In [17]:
# PDF containing a data table, to see how the cell values come out.
loader = PyMuPDFLoader(prepare_pdf_path("10-table.pdf"), extract_images=True)
docs = loader.load()
docs

[Document(metadata={'source': 'resources\\no-code-files\\pdfs\\10-table.pdf', 'file_path': 'resources\\no-code-files\\pdfs\\10-table.pdf', 'page': 0, 'total_pages': 1, 'format': 'PDF 1.6', 'title': '', 'author': 'Mary', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 9.0 for Word', 'producer': 'Adobe PDF Library 9.0', 'creationDate': "D:20110123144232-05'00'", 'modDate': "D:20140304212414-05'00'", 'trapped': ''}, page_content='Example table \nThis is an example of a data table. \nDisability \nCategory \nParticipants \nBallots \nCompleted \nBallots \nIncomplete/ \nTerminated \nResults \nAccuracy \nTime to \ncomplete \nBlind \n5 \n1 \n4 \n34.5%, n=1 \n1199 sec, n=1 \nLow Vision \n5 \n2 \n3 \n98.3% n=2 \n(97.7%, n=3) \n1716 sec, n=3 \n(1934 sec, n=2) \nDexterity \n5 \n4 \n1 \n98.3%, n=4 \n1672.1 sec, n=4 \nMobility \n3 \n3 \n0 \n95.4%, n=3 \n1416 sec, n=3 \n \n')]

- Provides more info related to the pdf file.
- Works well for tables (extracts values as str).
- Extracts text from images well, like PdfMiner. Use this when pdf file details are also required. Takes more time than PdfMiner.
- Sometimes provides the title of the pdf when it contains images containing text. Good for extracting text from pdfs containing images.
- Processes the challan well. It doesn't provide positional information, but the required values can be extracted from the provided text.

- If the pdf has an image and a table on top, then this loader is not recommended. Using PdfMiner is a better option.
- Doesn't provide information regarding the font styles.

#### UnstructuredPDFLoader

In [45]:
from langchain_community.document_loaders import UnstructuredPDFLoader

# Load the quotes PDF with UnstructuredPDFLoader; the output below shows only
# `source` in the metadata.
loader = UnstructuredPDFLoader(prepare_pdf_path("01-motivational-quotes.pdf"))
docs = loader.load()
docs

[Document(metadata={'source': 'resources\\no-code-files\\pdfs\\01-motivational-quotes.pdf'}, page_content='100 Motivational Quotes That Will Inspire You to Succeed\n\nEveryone needs some inspiration, and these motivational quotes will give you the edge you need to create your success. So read on and let them inspire you.\n\nBy Lolly Daskal\n\nAs leaders, managers, and bosses, we must realize that everything we think actually matters. If we are seeking success, we must think successful, inspiring, and motivating thoughts.\n\nRead on to find the words of wisdom that will motivate you in building your business, leading your life, creating success, achieving your goals, and overcoming your fears. Here are quotes—100 of them—that will inspire your success.\n\n1. "If you want to achieve greatness stop asking for permission." --Anonymous\n\n2. "Things work out best for those who make the best of how things work out." --John Wooden\n\n3. "To live a creative life, we must lose our fear of being

#### PDFPlumber

In [31]:
from langchain_community.document_loaders import PDFPlumberLoader

# Load the contract/drawing PDF with PDFPlumberLoader and display the Documents.
loader = PDFPlumberLoader(prepare_pdf_path("07-contract.pdf"))
docs = loader.load()
docs

[Document(metadata={'source': 'resources\\no-code-files\\pdfs\\07-contract.pdf', 'file_path': 'resources\\no-code-files\\pdfs\\07-contract.pdf', 'page': 0, 'total_pages': 1, 'Producer': 'PyPDF2'}, page_content='1 2 3 4 5 6 7 8 9\nEQUIPMENT TAGGING LEGEND FURNACE HPU\nRE-BCO201-MCC01\nCHECK VALVE FILTER\nFIXED DISPLACEMENT PUMP\nCOMPONENT NUMBER\n(NUMERIC [2 DIGIT])\nPRESSURE RELIEF VALVE\nCOMPONENT DESCRIPTION FOR SUB EQUIPMENT A\nBREATHER / FILTER\nEQUIPMENT NUMBER DESIGNATION\n(NUMERIC [3 OR 4 DIGIT])\nVARIABLE DISPLACEMENT,\nEQUIPMENT NUMBER DESIGNATION PRESSURE COMPENSATED PUMP\nDIFFERENTIAL AREA RELIEF VALVE\nPROCESS AREA\nHEAT EXCHANGER (COOLER)\nFLOW CONTROL VALVE\nIMMERSION HEATER\n3-WAY BYPASS VALVE\nB\nDIRECTIONAL VALVE, 4-WAY, 3-POSITION\nFLOAT CENTER\nCYLINDER\nDIRECTIONAL VALVE, 4-WAY, 3-POSITION\nTANDEM CENTER, OPEN CROSS-OVER PORTING\nDIRECTIONAL AIR VALVE, 4-WAY, SINGLE SOLENOID,\n2-POSITION, AIR RETURN, SPRING ASSISTED\nC\nPILOT OPERATED\nDUAL CHECK VALVES\n- PRELIMINA

In [32]:
docs[0].metadata

{'source': 'resources\\no-code-files\\pdfs\\07-contract.pdf',
 'file_path': 'resources\\no-code-files\\pdfs\\07-contract.pdf',
 'page': 0,
 'total_pages': 1,
 'Producer': 'PyPDF2'}

In [33]:
docs[0].page_content

'1 2 3 4 5 6 7 8 9\nEQUIPMENT TAGGING LEGEND FURNACE HPU\nRE-BCO201-MCC01\nCHECK VALVE FILTER\nFIXED DISPLACEMENT PUMP\nCOMPONENT NUMBER\n(NUMERIC [2 DIGIT])\nPRESSURE RELIEF VALVE\nCOMPONENT DESCRIPTION FOR SUB EQUIPMENT A\nBREATHER / FILTER\nEQUIPMENT NUMBER DESIGNATION\n(NUMERIC [3 OR 4 DIGIT])\nVARIABLE DISPLACEMENT,\nEQUIPMENT NUMBER DESIGNATION PRESSURE COMPENSATED PUMP\nDIFFERENTIAL AREA RELIEF VALVE\nPROCESS AREA\nHEAT EXCHANGER (COOLER)\nFLOW CONTROL VALVE\nIMMERSION HEATER\n3-WAY BYPASS VALVE\nB\nDIRECTIONAL VALVE, 4-WAY, 3-POSITION\nFLOAT CENTER\nCYLINDER\nDIRECTIONAL VALVE, 4-WAY, 3-POSITION\nTANDEM CENTER, OPEN CROSS-OVER PORTING\nDIRECTIONAL AIR VALVE, 4-WAY, SINGLE SOLENOID,\n2-POSITION, AIR RETURN, SPRING ASSISTED\nC\nPILOT OPERATED\nDUAL CHECK VALVES\n- PRELIMINARY -\nNOT FOR CONSTRUCTION\nD\nISSUED FOR DESIGN\nA\nN. IMEL T. HULL 07-05-23\nREV DESIGN BY CHECKED BY DATE\nTHIS DRAWING IS PROPERTY OF ENVIVA, INC. AND IS NOT TO BE\nREPRODUCED, COPIED OR USED FOR ANY PURPOS

In [34]:
# Font-styles PDF with PDFPlumberLoader; the metadata keys are capitalised in
# this loader's output (Author, Creator, CreationDate, ...).
loader = PDFPlumberLoader(prepare_pdf_path("08-font-styles.pdf"))
loader.load()

[Document(metadata={'source': 'resources\\no-code-files\\pdfs\\08-font-styles.pdf', 'file_path': 'resources\\no-code-files\\pdfs\\08-font-styles.pdf', 'page': 0, 'total_pages': 1, 'Author': 'Shaukat ali khan', 'Creator': 'Microsoft® Word LTSC', 'CreationDate': "D:20240825164928+05'00'", 'ModDate': "D:20240825164928+05'00'", 'Producer': 'Microsoft® Word LTSC'}, page_content='This is a pdf with\nMultile font styles\nAnd formats\n')]

#### Trying to extract voterlist

In [48]:
from langchain_community.document_loaders import PDFMinerLoader

loader = PDFMinerLoader(prepare_pdf_path("voter-list.pdf"), extract_images=True)
try:
    docs = loader.load()
except ValueError as error:
    # Known failure for this file: with extract_images=True, loading raises
    # "cannot reshape array of size ... into shape (...)". Show the error
    # instead of aborting a Restart & Run All, and reset `docs` so a stale
    # result from an earlier cell is not mistaken for this file's output.
    docs = []
    print(f"ValueError: {error}")
docs

ValueError: cannot reshape array of size 481216 into shape (2336,1648,newaxis)

- Handles tabular information more effectively.
- No info. about font.

#### PDFium2

In [39]:
from langchain_community.document_loaders import PyPDFium2Loader

# Load the bank challan with PyPDFium2Loader; its output below carries only
# `source` and `page` in the metadata.
loader = PyPDFium2Loader(prepare_pdf_path("05-bank-challan.pdf"))
docs = loader.load()
docs



[Document(metadata={'source': 'resources\\no-code-files\\pdfs\\05-bank-challan.pdf', 'page': 0}, page_content='To Be Filled in By the Remitter To Be Filled in By the Remitter\r\nHead of \r\nAccount\r\nOrder of \r\nBank\r\nTo Be Filled in By \r\nthe Department \r\nOfficer or the \r\nTreasury\r\nTerritory Amount\r\nName or \r\ndesignation \r\nand address of \r\nthe Person on \r\nwhose behalf \r\nmoney is Paid\r\nProvisional/ Centre\r\nTreasury/Sub treasury\r\n(Cashier in service centre)\r\nTo Be Filled in By \r\nthe Department \r\nOfficer or the \r\nTreasury\r\nAmount\r\nOrder of \r\nBank\r\nHead of \r\nAccount\r\nTerritory\r\nName or \r\ndesignation \r\nand address \r\nof the Person \r\non whose \r\nbehalf money \r\nis Paid\r\nProvisional/ Centre\r\nTreasury/Sub treasury\r\n(Cashier in service centre)\r\nHead of \r\nAccount\r\nOrder of \r\nBank\r\nName or \r\ndesignation \r\nand address \r\nof the Person \r\non whose \r\nbehalf money \r\nis Paid\r\nTerritory\r\nTo Be Filled in By the Re

In [40]:
docs[0].page_content

'To Be Filled in By the Remitter To Be Filled in By the Remitter\r\nHead of \r\nAccount\r\nOrder of \r\nBank\r\nTo Be Filled in By \r\nthe Department \r\nOfficer or the \r\nTreasury\r\nTerritory Amount\r\nName or \r\ndesignation \r\nand address of \r\nthe Person on \r\nwhose behalf \r\nmoney is Paid\r\nProvisional/ Centre\r\nTreasury/Sub treasury\r\n(Cashier in service centre)\r\nTo Be Filled in By \r\nthe Department \r\nOfficer or the \r\nTreasury\r\nAmount\r\nOrder of \r\nBank\r\nHead of \r\nAccount\r\nTerritory\r\nName or \r\ndesignation \r\nand address \r\nof the Person \r\non whose \r\nbehalf money \r\nis Paid\r\nProvisional/ Centre\r\nTreasury/Sub treasury\r\n(Cashier in service centre)\r\nHead of \r\nAccount\r\nOrder of \r\nBank\r\nName or \r\ndesignation \r\nand address \r\nof the Person \r\non whose \r\nbehalf money \r\nis Paid\r\nTerritory\r\nTo Be Filled in By the Remitter\r\nAmount\r\nTo Be Filled in By \r\nthe Department \r\nOfficer or the \r\nTreasury\r\nChallan Form No.3

- Not that good at pdf image table processing

### Check for Urdu and Other languages
None of the explored loaders works well for Urdu.

In [42]:
from langchain_community.document_loaders import PDFMinerLoader

# Check how PDFMinerLoader copes with an Urdu PDF.
loader = PDFMinerLoader(prepare_pdf_path("urdu.pdf"))
docs = loader.load()
docs

[Document(metadata={'source': 'resources\\no-code-files\\pdfs\\urdu.pdf'}, page_content='  ہیگ یکریڈفر لہلمو جرا(/ˈheɪɡəl/;[19] منجر: [ˈɡe:ɔɐ̯ k ˈvɪlhɛlm ˈfʁi:dʁɪç ˈhe:ɡəl]; فلسفی منجر یکا )1831  ,14 مبرنو  – 1770  ,27 گستا   منجر  روا\n\nلیتمثا عظمیابر  روا اہو رمشہو بہت میں رود پنےا یہ ۔تھا شخص ہما کی تییاتجز  روا فلسفہ یتاور ۔اہو بتثا ثرابا بھی میں یتاور[20]   بیمغر لیکن ،ہے ہتار شخصیت عزمتنا یکا ہیگ چہگرا\n\nفلسفہ  ۔ہے ہشد تسلیم متقا و قد علمی کی سا میں \n\n\x0c')]

In [43]:
docs[0].page_content

'  ہیگ یکریڈفر لہلمو جرا(/ˈheɪɡəl/;[19] منجر: [ˈɡe:ɔɐ̯ k ˈvɪlhɛlm ˈfʁi:dʁɪç ˈhe:ɡəl]; فلسفی منجر یکا )1831  ,14 مبرنو  – 1770  ,27 گستا   منجر  روا\n\nلیتمثا عظمیابر  روا اہو رمشہو بہت میں رود پنےا یہ ۔تھا شخص ہما کی تییاتجز  روا فلسفہ یتاور ۔اہو بتثا ثرابا بھی میں یتاور[20]   بیمغر لیکن ،ہے ہتار شخصیت عزمتنا یکا ہیگ چہگرا\n\nفلسفہ  ۔ہے ہشد تسلیم متقا و قد علمی کی سا میں \n\n\x0c'