# Text Loading

In [3]:
from langchain.document_loaders import TextLoader

In [None]:
loader = TextLoader("test_data-nvidia_news.txt")
loader

<langchain_community.document_loaders.text.TextLoader at 0x242e8aaf640>

In [10]:
# Checking if the loader is properly loading the test data
data = loader.load()
data



In [14]:
data[0].metadata

{'source': 'test_data-nvidia_news.txt'}

In [None]:
# Testing the possiblity of using CSV to load data
from langchain.document_loaders.csv_loader import CSVLoader

loader_csv = CSVLoader("sample.csv")
data_csv = loader_csv.load()
len(data_csv)

9

In [20]:
data_csv[1].metadata

{'source': 'sample.csv', 'row': 1}

In [None]:
# Testing if we can change the meta data when dealing with CSVs
loader_csv = CSVLoader("sample.csv", source_column="title")
data_csv = loader_csv.load()
data_csv[1].metadata

{'source': 'Doctor Strange in the Multiverse of Madness', 'row': 1}

In [25]:
# Seeing if we can directly load data from a URL
from langchain.document_loaders import UnstructuredURLLoader

urls = [
    "https://www.moneycontrol.com/news/business/banks/hdfc-bank-re-appoints-sanmoy-chakrabarti-as-chief-risk-officer-11259771.html",
    "https://www.moneycontrol.com/news/business/markets/market-corrects-post-rbi-ups-inflation-forecast-icrr-bet-on-these-top-10-rate-sensitive-stocks-ideas-11142611.html"
]

loader_urls = UnstructuredURLLoader(urls=urls)
data_urls = loader_urls.load()
len(data_urls)

2

In [24]:
data_urls

[Document(metadata={'source': 'https://www.moneycontrol.com/news/business/banks/hdfc-bank-re-appoints-sanmoy-chakrabarti-as-chief-risk-officer-11259771.html'}, page_content='English\n\nHindi\n\nGujarati\n\nSpecials\n\nHello, Login\n\nHello, Login\n\nLog-inor Sign-Up\n\nMy Account\n\nMy Profile\n\nMy Portfolio\n\nMy Watchlist\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nMy Profile\n\nMy PRO\n\nMy Portfolio\n\nMy Watchlist\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nLogout\n\nLoans up to ₹50 LAKHS\n\nFixed Deposits\n\nCredit CardsLifetime Free\n\nCredit Score\n\nChat with Us\n\nDownload App\n\nFollow us on:\n\nNetwork 18\n\nGo Ad-Free\n\nMy Alerts\n\n>->MC_ENG_DESKTOP/MC_ENG_NEWS/MC_ENG_BUSINESS_AS/MC_ENG_ROS_NWS_BUS_AS_ATF_728\n\nMoneycontrol\n\nGo PRO@₹1/dayPRO\n\nMoneycontrol PRO\n\nAdvertisement\n\nRemove Ad\n\nBusiness\n\nMarkets\n\nStocks\n\nEconomy\n\nCompanies\n\nTrends\n\nIPO\n\nOpinion\n\nEV Special\n\nHomeNewsBusinessBanksHDFC Bank re-appoints Sanmoy Chakrabarti as Chief 

# Text Splitting, Merging and Overlapping

In [34]:
from langchain.text_splitter import CharacterTextSplitter

splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=200,
    chunk_overlap=0
)

chunks = splitter.split_text(data[0].page_content)



In [None]:
splitter = CharacterTextSplitter(
    separator=".",
    chunk_size=200,
    chunk_overlap=0
)

chunks = splitter.split_text(data[0].page_content)



In [39]:
splitter = CharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=0
)

chunks = splitter.split_text(data[0].page_content)



In [None]:
# Since CharacterTextSplitter is unable to effectively split the data withoutcreating chunks larger than specified
# using RecursiveCharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=20
)

chunks = splitter.split_text(data[0].page_content)

In [47]:
all(len(chunk) <= 200 for chunk in chunks)

True

# Vector Database