### Document Loaders in LangChain

##### TextLoader

In [19]:
from langchain.document_loaders import TextLoader

In [20]:
# Load the news article from a plain-text file. The path is an ordinary
# relative string (not a raw string), so it is resolved against the
# notebook's current working directory.
# TextLoader is already imported in the previous cell.
loader = TextLoader("nvda_news_1.txt")
data = loader.load()

# Display the loaded documents as the cell's result, like the CSV cells do.
data




In [21]:
type(loader)

langchain_community.document_loaders.text.TextLoader

In [22]:
loader.file_path

'nvda_news_1.txt'

### CSV Loader

In [23]:
from langchain.document_loaders.csv_loader import CSVLoader

In [24]:
# Load the CSV file. As the output below shows, every row becomes one Document:
# page_content holds the row as "column: value" lines, and metadata records
# the source file plus the row index.
loader = CSVLoader(file_path="Top Indian Places to Visit.csv")
data = loader.load()
data

[Document(metadata={'source': 'Top Indian Places to Visit.csv', 'row': 0}, page_content=': 0\nZone: Northern\nState: Delhi\nCity: Delhi\nName: India Gate\nType: War Memorial\nEstablishment Year: 1921\ntime needed to visit in hrs: 0.5\nGoogle review rating: 4.6\nEntrance Fee in INR: 0\nAirport with 50km Radius: Yes\nWeekly Off: None\nSignificance: Historical\nDSLR Allowed: Yes\nNumber of google review in lakhs: 2.6\nBest Time to visit: Evening'),
 Document(metadata={'source': 'Top Indian Places to Visit.csv', 'row': 1}, page_content=": 1\nZone: Northern\nState: Delhi\nCity: Delhi\nName: Humayun's Tomb\nType: Tomb\nEstablishment Year: 1572\ntime needed to visit in hrs: 2.0\nGoogle review rating: 4.5\nEntrance Fee in INR: 30\nAirport with 50km Radius: Yes\nWeekly Off: None\nSignificance: Historical\nDSLR Allowed: Yes\nNumber of google review in lakhs: 0.4\nBest Time to visit: Afternoon"),
 Document(metadata={'source': 'Top Indian Places to Visit.csv', 'row': 2}, page_content=': 2\nZone: N

In [25]:
data[0]

Document(metadata={'source': 'Top Indian Places to Visit.csv', 'row': 0}, page_content=': 0\nZone: Northern\nState: Delhi\nCity: Delhi\nName: India Gate\nType: War Memorial\nEstablishment Year: 1921\ntime needed to visit in hrs: 0.5\nGoogle review rating: 4.6\nEntrance Fee in INR: 0\nAirport with 50km Radius: Yes\nWeekly Off: None\nSignificance: Historical\nDSLR Allowed: Yes\nNumber of google review in lakhs: 2.6\nBest Time to visit: Evening')

In [26]:
data[0].metadata

{'source': 'Top Indian Places to Visit.csv', 'row': 0}

In [27]:
# With source_column set, each Document's metadata["source"] is taken from that
# column's value in the row (here "Best Time to visit") instead of the file path.
loader = CSVLoader(file_path="Top Indian Places to Visit.csv", source_column="Best Time to visit")
data = loader.load()
data

[Document(metadata={'source': 'Evening', 'row': 0}, page_content=': 0\nZone: Northern\nState: Delhi\nCity: Delhi\nName: India Gate\nType: War Memorial\nEstablishment Year: 1921\ntime needed to visit in hrs: 0.5\nGoogle review rating: 4.6\nEntrance Fee in INR: 0\nAirport with 50km Radius: Yes\nWeekly Off: None\nSignificance: Historical\nDSLR Allowed: Yes\nNumber of google review in lakhs: 2.6\nBest Time to visit: Evening'),
 Document(metadata={'source': 'Afternoon', 'row': 1}, page_content=": 1\nZone: Northern\nState: Delhi\nCity: Delhi\nName: Humayun's Tomb\nType: Tomb\nEstablishment Year: 1572\ntime needed to visit in hrs: 2.0\nGoogle review rating: 4.5\nEntrance Fee in INR: 30\nAirport with 50km Radius: Yes\nWeekly Off: None\nSignificance: Historical\nDSLR Allowed: Yes\nNumber of google review in lakhs: 0.4\nBest Time to visit: Afternoon"),
 Document(metadata={'source': 'Afternoon', 'row': 2}, page_content=': 2\nZone: Northern\nState: Delhi\nCity: Delhi\nName: Akshardham Temple\nType

In [28]:
data[0].page_content

': 0\nZone: Northern\nState: Delhi\nCity: Delhi\nName: India Gate\nType: War Memorial\nEstablishment Year: 1921\ntime needed to visit in hrs: 0.5\nGoogle review rating: 4.6\nEntrance Fee in INR: 0\nAirport with 50km Radius: Yes\nWeekly Off: None\nSignificance: Historical\nDSLR Allowed: Yes\nNumber of google review in lakhs: 2.6\nBest Time to visit: Evening'

In [29]:
data[0].metadata

{'source': 'Evening', 'row': 0}

### UnstructuredURLLoader
LangChain's `UnstructuredURLLoader` internally uses the `unstructured` Python library to load content from URLs.

In [30]:
#installing necessary libraries, libmagic is used for file type detection
!pip3 install unstructured libmagic python-magic python-magic-bin



In [31]:
from langchain.document_loaders import UnstructuredURLLoader

In [32]:
# Build a loader over two web pages; loading it yields one Document per URL.
loader = UnstructuredURLLoader(
    urls=[
        "https://www.moneycontrol.com/news/business/economy/indias-formal-employment-had-a-better-showing-in-the-first-half-of-fy25-12875554.html",
        "https://www.happyinshape.com/cruiseship/?utm_campaign=9gncksdb&utm_source=Taboola&utm_medium=native&utm_term=network18media-moneycontrolenglish&utm_content=dISSQ0VrYxVwW_TSHF_MqJ5PKjbvZun94ANdBMWk_Hs=&tclid=GiDalu84T1UyOLoeY4EtIOB_dcQJp5lElOmDPOwyAz0jISCy92YoocWojtLAoczHATCrpFs#",
    ]
)

In [33]:
data=loader.load()
len(data)

2

In [34]:
data[0].page_content[0:1000]

'English\n\nHindi\n\nGujarati\n\nSpecials\n\nHello, Login\n\nHello, Login\n\nLog-inor Sign-Up\n\nMy Account\n\nMy Profile\n\nMy Portfolio\n\nMy Watchlist\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nMy Profile\n\nMy PRO\n\nMy Portfolio\n\nMy Watchlist\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nLogout\n\nLoans up to ₹15 LAKHS\n\nFixed Deposits\n\nCredit CardsLifetime Free\n\nCredit Score\n\nChat with Us\n\nDownload App\n\nFollow us on:\n\nGo Ad-Free\n\nMy Alerts\n\n>->MC_ENG_DESKTOP/MC_ENG_NEWS/MC_ENG_ECONOMY_AS/MC_ENG_ROS_NWS_ECO_AS_ATF_728\n\nGo PRO @₹99 PRO\n\nAdvertisement\n\nRemove Ad\n\nBusiness\n\nMarkets\n\nStocks\n\nEconomy\n\nCompanies\n\nTrends\n\nIPO\n\nOpinion\n\nEV Special\n\nHomeNewsBusinessEconomyIndia’s formal employment had a better showing in the first half of FY25\n\nTrending Topics\n\nNTPC Green Energy Share PriceNTPC Green Stock PricePine Labs IPOAdani BondsC2C Advanced Systems IPO GMP\n\nIndia’s formal employment had a better showing in the first half of FY2

In [35]:
data[1].page_content[0:1000]

'Cruise Ship Encounters Pirates – But What One Brave Passenger Does Next Stuns Them All!\n\nBy Kaius August 23, 2024\n\nNathan had been enjoying his time on the Ocean Delight, sipping a piña colada while chatting with a few men at the bar. His wife, Samantha, was taking a gardening class on board, a part of the special itinerary for their 30th anniversary.\n\nAs Nathan leaned on the railing, his gaze drifting over the endless ocean, something caught his eye that made him curious. Three small boats were speeding toward the ship. At first, he dismissed them as ordinary fishermen, but as they drew closer, a sense of unease gripped him.\n\nNathan’s instincts, honed from years of military service, kicked in.“All hands on deck!” he shouted, his voice carrying across the deck as passengers and crew got startled. Pirates, armed and ready for confrontation, were approaching fast. But Nathan had a plan forming in his mind—one that could turn the tables on these attackers.\n\nNathan was a retired

In [36]:
data[0].metadata

{'source': 'https://www.moneycontrol.com/news/business/economy/indias-formal-employment-had-a-better-showing-in-the-first-half-of-fy25-12875554.html'}

In [37]:
data[1].metadata

{'source': 'https://www.happyinshape.com/cruiseship/?utm_campaign=9gncksdb&utm_source=Taboola&utm_medium=native&utm_term=network18media-moneycontrolenglish&utm_content=dISSQ0VrYxVwW_TSHF_MqJ5PKjbvZun94ANdBMWk_Hs=&tclid=GiDalu84T1UyOLoeY4EtIOB_dcQJp5lElOmDPOwyAz0jISCy92YoocWojtLAoczHATCrpFs#'}

### Text Splitters

In [50]:
# Taking some random text from wikipedia

text = """Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan. 
It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine. 
Set in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for humankind.

Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007 and was originally set to be directed by Steven Spielberg. 
Kip Thorne, a Caltech theoretical physicist and 2017 Nobel laureate in Physics,[4] was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar. 
Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm. Principal photography began in late 2013 and took place in Alberta, Iceland, and Los Angeles. 
Interstellar uses extensive practical and miniature effects, and the company Double Negative created additional digital effects.

Interstellar premiered in Los Angeles on October 26, 2014. In the United States, it was first released on film stock, expanding to venues using digital projectors. The film received generally positive reviews from critics and grossed over $677 million worldwide ($715 million after subsequent re-releases), making it the tenth-highest-grossing film of 2014. 
It has been praised by astronomers for its scientific accuracy and portrayal of theoretical astrophysics.[5][6][7] Interstellar was nominated for five awards at the 87th Academy Awards, winning Best Visual Effects, and received numerous other accolades."""

#### Manual approach of splitting the text into chunks

In [51]:
# Say the LLM's limit is 100 (counting characters here as a rough stand-in for tokens);
# in that case the simplest thing we can do is slice the string.
# Note that the slice cuts the last word in half.

text[0:100]

'Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher N'

In [56]:
# But we want complete words, and we want to do this for the entire text - maybe we can use Python's split function.
# Splitting on a single space only: newline characters stay attached to the neighbouring words.

words = text.split(" ")
len(words)

264

In [58]:
# Greedily pack words into chunks: append words (each followed by a space)
# and close the current chunk as soon as it grows past 200 characters.
# The length check runs *after* the word is added, so every flushed chunk is
# slightly longer than 200 characters and ends with a trailing space.
chunks = []

s = ""
for word in words:
    s += word + " "
    if len(s) > 200:
        chunks.append(s)
        s = ""

# Keep the leftover words as the final chunk, but skip it when the last word
# has just flushed the buffer, so that no empty chunk is appended.
if s:
    chunks.append(s)

In [61]:
chunks[:2]

['Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan. \nIt stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt ',
 'Damon, and Michael Caine. \nSet in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in ']

**Splitting data into chunks can be done in native Python, but it is a tedious process. You may also need to experiment with various delimiters iteratively to ensure that each chunk does not exceed the token limit of the respective LLM.**

**Langchain provides a better way through text splitter classes.**

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Split the text on newlines. chunk_size is the desired maximum chunk length and
# chunk_overlap is the amount of text shared between consecutive chunks (none here).
splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size=200,
    chunk_overlap=0
)

In [62]:
from langchain.text_splitter import CharacterTextSplitter


In [66]:
# Newline-based splitter: pieces are merged towards a chunk size of 200 with no overlap.
# A single line longer than 200 cannot be broken further by this splitter.
splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=200,
    chunk_overlap=0,
)

In [70]:
chunks=splitter.split_text(text)
len(chunks)

Created a chunk of size 210, which is longer than the specified 200
Created a chunk of size 208, which is longer than the specified 200
Created a chunk of size 358, which is longer than the specified 200


9

In [71]:
len(chunks[0])

105

In [68]:
chunks

['Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan.',
 'It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine.',
 'Set in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for humankind.',
 'Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007 and was originally set to be directed by Steven Spielberg.',
 'Kip Thorne, a Caltech theoretical physicist and 2017 Nobel laureate in Physics,[4] was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar.',
 'Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm. Principal photography began in late 2013 and t

In [74]:
for chunk in chunks:
    print(len(chunk))

105
120
210
181
197
207
128
357
253


*As you can see, although we specified a chunk size of 200, the split was based only on `\n`, so some of the resulting chunks are larger than 200 characters.*



*Another LangChain class, `RecursiveCharacterTextSplitter`, splits the text recursively based on a list of separators. Let's see how it works.*

### RecursiveCharacterTextSplitter

In [75]:
text

'Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan. \nIt stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine. \nSet in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for humankind.\n\nBrothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007 and was originally set to be directed by Steven Spielberg. \nKip Thorne, a Caltech theoretical physicist and 2017 Nobel laureate in Physics,[4] was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar. \nCinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm. Principal photography began in late 2013 and took place 

In [78]:
from langchain.text_splitter import RecursiveCharacterTextSplitter



In [79]:
# Recursive splitter: separators are tried in order, falling back to the next one
# whenever a piece is still larger than chunk_size.
r_splitter = RecursiveCharacterTextSplitter(
    separators = ["\n\n", "\n", " "],  # Separators to try, coarsest first (the library default is ["\n\n", "\n", " ", ""])
    chunk_size = 200,  # Maximum size of each chunk, as measured by length_function
    chunk_overlap  = 0,  # Size of the overlap between consecutive chunks, used to preserve context
    length_function = len  # Function used to measure size; `len` counts characters, but any token counter can be passed
)

In [81]:
# Split with the recursive splitter (this rebinds `chunks`) and print each chunk's length
# to check it against the chunk_size of 200.
chunks=r_splitter.split_text(text)


for chunk in chunks:
    print(len(chunk))

105
120
199
10
181
197
198
8
128
191
165
198
54


### Let's understand how exactly it formed these chunks

In [86]:
first_split=text.split("\n\n")[0]
first_split

'Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan. \nIt stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine. \nSet in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for humankind.'

In [84]:
len(first_split)

439

*The recursive text splitter uses a list of separators, i.e. separators = ["\n\n", "\n", " "]*

*So now it will first split using \n\n and then if the resulting chunk size is greater than the chunk_size parameter which is 200 in our case, then it will use the next separator which is \n.*

In [87]:
second_split=first_split.split("\n")
second_split

['Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan. ',
 'It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine. ',
 'Set in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for humankind.']

In [88]:
for split in second_split:
    print(len(split))

106
121
210


*Third split exceeds chunk size 200. Now it will further try to split that using the third separator which is ' ' (space)*

In [89]:
second_split[2]

'Set in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for humankind.'