# Text Loading

In [1]:
from langchain.document_loaders import TextLoader

In [2]:
# Point a TextLoader at the local sample article; the bare name on the last
# line echoes the loader object so the cell shows what was built.
loader = TextLoader(file_path="test_data-nvidia_news.txt")
loader

<langchain_community.document_loaders.text.TextLoader at 0x1a797923190>

In [3]:
# Sanity check: run the loader against the test file and display whatever it
# returns. Later cells index the result (data[0]) and read `.metadata` and
# `.page_content` from its first element.
data = loader.load()
data



In [4]:
# Inspect the metadata attached to the first loaded document
data[0].metadata

{'source': 'test_data-nvidia_news.txt'}

In [5]:
# Testing the possibility of loading data from a CSV file
from langchain.document_loaders.csv_loader import CSVLoader

loader_csv = CSVLoader(file_path="sample.csv")
data_csv = loader_csv.load()

# Number of documents the loader produced from the file
len(data_csv)

9

In [6]:
# Inspect the metadata of the second document loaded from the CSV
data_csv[1].metadata

{'source': 'sample.csv', 'row': 1}

In [7]:
# Check whether the CSV metadata can be customised: reload the same file, this
# time naming the "title" column as the source column, then look at the second
# document's metadata again.
loader_csv = CSVLoader(file_path="sample.csv", source_column="title")
data_csv = loader_csv.load()
data_csv[1].metadata

{'source': 'Doctor Strange in the Multiverse of Madness', 'row': 1}

In [8]:
# Check whether documents can be loaded straight from web URLs
from langchain.document_loaders import UnstructuredURLLoader

# Two news articles used as test pages
urls = [
    "https://www.moneycontrol.com/news/business/banks/hdfc-bank-re-appoints-sanmoy-chakrabarti-as-chief-risk-officer-11259771.html",
    "https://www.moneycontrol.com/news/business/markets/market-corrects-post-rbi-ups-inflation-forecast-icrr-bet-on-these-top-10-rate-sensitive-stocks-ideas-11142611.html",
]

loader_urls = UnstructuredURLLoader(urls)
data_urls = loader_urls.load()

# Number of documents that came back from the loader
len(data_urls)

2

In [9]:
# Display the documents loaded from the URLs to see what text was extracted
data_urls

[Document(metadata={'source': 'https://www.moneycontrol.com/news/business/banks/hdfc-bank-re-appoints-sanmoy-chakrabarti-as-chief-risk-officer-11259771.html'}, page_content='English\n\nHindi\n\nGujarati\n\nSpecials\n\nHello, Login\n\nHello, Login\n\nLog-inor Sign-Up\n\nMy Account\n\nMy Profile\n\nMy Portfolio\n\nMy Watchlist\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nMy Profile\n\nMy PRO\n\nMy Portfolio\n\nMy Watchlist\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nLogout\n\nLoans up to ₹50 LAKHS\n\nFixed Deposits\n\nCredit CardsLifetime Free\n\nCredit Score\n\nChat with Us\n\nDownload App\n\nFollow us on:\n\nNetwork 18\n\nGo Ad-Free\n\nMy Alerts\n\n>->MC_ENG_DESKTOP/MC_ENG_NEWS/MC_ENG_BUSINESS_AS/MC_ENG_ROS_NWS_BUS_AS_ATF_728\n\nMoneycontrol\n\nGo PRO@₹1/dayPRO\n\nMoneycontrol PRO\n\nAdvertisement\n\nRemove Ad\n\nBusiness\n\nMarkets\n\nStocks\n\nEconomy\n\nCompanies\n\nTrends\n\nIPO\n\nOpinion\n\nEV Special\n\nHomeNewsBusinessBanksHDFC Bank re-appoints Sanmoy Chakrabarti as Chief 

# Text Splitting, Merging and Overlapping

In [10]:
from langchain.text_splitter import CharacterTextSplitter

# First attempt: split the article on single newlines, asking for chunks of
# 200 characters with no overlap between neighbouring chunks.
splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=0, separator="\n")

chunks = splitter.split_text(data[0].page_content)

Created a chunk of size 419, which is longer than the specified 200
Created a chunk of size 502, which is longer than the specified 200
Created a chunk of size 274, which is longer than the specified 200
Created a chunk of size 415, which is longer than the specified 200
Created a chunk of size 219, which is longer than the specified 200
Created a chunk of size 367, which is longer than the specified 200
Created a chunk of size 284, which is longer than the specified 200
Created a chunk of size 239, which is longer than the specified 200
Created a chunk of size 296, which is longer than the specified 200
Created a chunk of size 294, which is longer than the specified 200


In [11]:
# Second attempt: same size settings, but split on periods instead of newlines.
splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=0, separator=".")

chunks = splitter.split_text(data[0].page_content)

Created a chunk of size 212, which is longer than the specified 200
Created a chunk of size 269, which is longer than the specified 200
Created a chunk of size 377, which is longer than the specified 200
Created a chunk of size 231, which is longer than the specified 200
Created a chunk of size 259, which is longer than the specified 200
Created a chunk of size 245, which is longer than the specified 200


In [39]:
# Third attempt: no separator is passed, so the splitter falls back to its
# own default; size and overlap settings are unchanged.
splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=200)

chunks = splitter.split_text(data[0].page_content)



In [12]:
# CharacterTextSplitter could not split the data without creating chunks
# larger than the requested size, so switch to RecursiveCharacterTextSplitter.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Same 200-character target, now with a 20-character overlap between chunks
splitter = RecursiveCharacterTextSplitter(chunk_overlap=20, chunk_size=200)

chunks = splitter.split_text(data[0].page_content)

In [13]:
# True when no chunk exceeds the 200-character limit requested above
not any(len(piece) > 200 for piece in chunks)

True

# Vector Database

In [14]:
import pandas as pd

# Read the sample movie data into a DataFrame and display it
dataframe = pd.read_csv(filepath_or_buffer="sample.csv")
dataframe

Unnamed: 0,movie_id,title,industry,release_year,imdb_rating,studio,language_id,budget,revenue,unit,currency
0,101,K.G.F: Chapter 2,Bollywood,2022,8.4,Hombale Films,3,1,12.5,Billions,INR
1,102,Doctor Strange in the Multiverse of Madness,Hollywood,2022,7.0,Marvel Studios,5,200,954.8,Millions,USD
2,103,Thor: The Dark World,Hollywood,2013,6.8,Marvel Studios,5,165,644.8,Millions,USD
3,104,Thor: Ragnarok,Hollywood,2017,7.9,Marvel Studios,5,180,854,Millions,USD
4,105,Thor: Love and Thunder,Hollywood,2022,6.8,Marvel Studios,5,250,670,Millions,USD
5,106,Sholay,Bollywood,1975,8.1,United Producers,1,Not Available,Not Available,Not Available,Not Available
6,107,Dilwale Dulhania Le Jayenge,Bollywood,1995,8.0,Yash Raj Films,1,400,2000,Millions,INR
7,108,3 Idiots,Bollywood,2009,8.4,Vinod Chopra Films,1,550,4000,Millions,INR
8,109,Kabhi Khushi Kabhie Gham,Bollywood,2001,7.4,Dharma Productions,1,390,1360,Millions,INR


In [15]:
# Encoding test data for vector database
from sentence_transformers import SentenceTransformer

encoder = SentenceTransformer("all-mpnet-base-v2")

# Pass the titles as a plain list of strings rather than a pandas Series.
# encode() is documented for str / list[str]; a Series only works while the
# frame keeps its default 0..n-1 index, so an explicit list is the safer input.
vectors = encoder.encode(dataframe["title"].tolist())

# One embedding row per title
vectors.shape

  from .autonotebook import tqdm as notebook_tqdm


(9, 768)

In [16]:
# Build a local, in-memory vector database with faiss
import faiss

# The flat L2 index is sized by the embedding width (last axis of the 2-D
# `vectors` array); every title embedding is then added to it.
index = faiss.IndexFlatL2(vectors.shape[-1])
index.add(vectors)

In [17]:
# Free-text query to look up in the index
query = "darkness"
# Embed the query with the same encoder that produced the title vectors
search_vector = encoder.encode(query)
# Show the shape of the query embedding (a single string was encoded)
search_vector.shape

(768,)

In [18]:
import numpy as np

# Reshape the query embedding into a single-row 2-D array before searching,
# then ask the index for the three nearest entries.
svec = np.reshape(np.array(search_vector), (1, -1))
index.search(svec, k=3)

(array([[1.2229491, 1.4725052, 1.4774926]], dtype=float32), array([[2, 3, 4]]))

# Answer Retrieval

In [None]:
import os
import langchain
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.vectorstores import FAISS

from dotenv import load_dotenv

In [50]:
# Loading Google API Key
# load_dotenv() pulls variables from a local .env file (if present) into the
# process environment; the key is stored under GENAI_API_KEY and copied to
# GOOGLE_API_KEY below.
load_dotenv()

# Fail early with a clear message: assigning None into os.environ would raise
# an opaque "TypeError: str expected, not NoneType".
if not os.environ.get("GENAI_API_KEY"):
    raise RuntimeError(
        "GENAI_API_KEY is not set. Add it to your environment or .env file "
        "before running this cell."
    )
os.environ["GOOGLE_API_KEY"] = os.environ["GENAI_API_KEY"]

# Initializing embeddings model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Initializing LLM
# temperature=0 is set explicitly; max_retries=2 caps the retry count.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-preview-04-17",
    temperature=0,
    max_retries=2,
)

In [None]:
# Retrieving news information from the web
urls = [
    "https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html",
    "https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html",
]

loader = UnstructuredURLLoader(urls)
data = loader.load()

# Number of documents loaded from the URLs
len(data)

2

In [None]:
# Split the loaded articles into chunks of 1000 characters, with a
# 200-character overlap between consecutive chunks
splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)

# `data` holds documents rather than raw strings, so split_documents is used
# here instead of split_text
chunks = splitter.split_documents(data)

In [46]:
# Peek at the first chunk produced by the splitter
chunks[0]

Document(metadata={'source': 'https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html'}, page_content='English\n\nHindi\n\nGujarati\n\nSpecials\n\nHello, Login\n\nHello, Login\n\nLog-inor Sign-Up\n\nMy Account\n\nMy Profile\n\nMy Portfolio\n\nMy Watchlist\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nMy Profile\n\nMy PRO\n\nMy Portfolio\n\nMy Watchlist\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nLogout\n\nLoans up to ₹50 LAKHS')

In [51]:
# Embed the chunks and store them in a FAISS vector index
vector_index = FAISS.from_documents(documents=chunks, embedding=embeddings)

# Build the question-answering chain on top of the LLM, using the vector
# index as its retriever
chain = RetrievalQAWithSourcesChain.from_llm(
    llm=llm,
    retriever=vector_index.as_retriever(),
)

# Display the chain object
chain



In [None]:
# Test query for the input articles from web urls
test_query = "What is the price of Tiago ICNG?"

# Debug mode is left off; flip this to True to inspect how the result is generated
langchain.debug = False

# Run the chain through invoke(); calling the chain object directly
# (chain({...})) is deprecated in current LangChain releases. The returned
# dict is displayed as the cell output.
chain.invoke({"question": test_query})

{'question': 'What is the price of Tiago ICNG?',
 'answer': 'The Tiago iCNG is priced between Rs 6.55 lakh and Rs 8.1 lakh.\n',
 'sources': 'https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html'}