In [21]:
pip install -U langchain-community

Note: you may need to restart the kernel to use updated packages.


In [22]:
from langchain.document_loaders import TextLoader

In [23]:
from langchain.document_loaders import UnstructuredURLLoader

In [24]:
!pip3 install unstructured libmagic python-magic python-magic-bin



In [32]:
# Sample news articles to pull in as documents.
article_urls = [
    "https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html",
    "https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html",
]

# Build the loader for those URLs, load them, and report how many documents came back.
loader = UnstructuredURLLoader(urls=article_urls)
data = loader.load()
len(data)

2

In [33]:
# Inspect the metadata attached to the first loaded document
# (for this loader it records the URL the document was loaded from).
data[0].metadata

{'source': 'https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html'}

In [27]:
from langchain.text_splitter import CharacterTextSplitter


In [15]:
# Sample passage for the text-splitter demos below. The \n escapes inside the
# literal become real newline characters, which the splitters use as separators.
text="""It may measure less than 50 square miles and have a population that doesn’t even crack a million, but San Francisco justly ranks as one of the greatest cities in the world. \n Famous for grand-dame Victorians, cable cars, a dynamic waterfront, and a soaring golden bridge, this city truly has it all. \n With trend-defining cuisine ranging from Michelin-starred dining to outrageous food trucks; world-renowned symphony, ballet, theater, and opera; plus almost boundless outdoor adventures, San Francisco justifiably \n stands out as one of the ultimate must-visit cities on any traveler’s wish list. The hardest part may be deciding where to go first. (Well, that and packing for the city’s famously unpredictable weather.) The Golden Gate Bridge is one of the city’s most iconic landmarks; you can walk or bike across the span to the Marin Headlands. Or stay on the San Francisco side and stroll over to the San Francisco Palace of Fine Arts, the Presidio, or Lands End, a rugged, windswept playground where you can watch for whales and check out the ruins of the Sutro Baths."""

In [16]:
# Split on newline characters only, targeting chunks of up to 200 characters
# with no overlap between neighbouring chunks.
splitter = CharacterTextSplitter(separator="\n", chunk_size=200, chunk_overlap=0)

# Run the splitter over the sample passage and report the number of chunks produced.
chunks = splitter.split_text(text)
len(chunks)

Created a chunk of size 214, which is longer than the specified 200


4

In [17]:
# Print the character count of every chunk, one per line.
for chunk_length in map(len, chunks):
    print(chunk_length)

172
124
212
558


In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Recursive splitter: separators are listed from coarsest to finest
# (blank line, newline, single space); chunk length is measured with len().
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=0,
    length_function=len,
    separators=["\n\n", "\n", " "],
)

# Split the same sample passage and print each chunk's character count.
r_chunks = r_splitter.split_text(text)
for piece_length in map(len, r_chunks):
    print(piece_length)

172
124
190
21
194
198
164


In [12]:
!pip install faiss-cpu
!pip install sentence-transformers

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp313-cp313-win_amd64.whl.metadata (4.5 kB)
Downloading faiss_cpu-1.10.0-cp313-cp313-win_amd64.whl (13.7 MB)
   ---------------------------------------- 0.0/13.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/13.7 MB ? eta -:--:--
    --------------------------------------- 0.3/13.7 MB ? eta -:--:--
   - -------------------------------------- 0.5/13.7 MB 891.8 kB/s eta 0:00:15
   - -------------------------------------- 0.5/13.7 MB 891.8 kB/s eta 0:00:15
   -- ------------------------------------- 0.8/13.7 MB 849.7 kB/s eta 0:00:16
   --- ------------------------------------ 1.0/13.7 MB 929.5 kB/s eta 0:00:14
   --- ------------------------------------ 1.0/13.7 MB 929.5 kB/s eta 0:00:14
   --- ------------------------------------ 1.3/13.7 MB 780.7 kB/s eta 0:00:16
   --- ------------------------------------ 1.3/13.7 MB 780.7 kB/s eta 0:00:16
   ---- ----------------------------------- 1.6/13.7 MB 830.1 kB/s eta 0:00:

In [13]:
from sentence_transformers import SentenceTransformer

# Small sample corpus to embed.
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]

# Load the pretrained sentence-embedding model by name.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode the whole corpus in one call and show the shape of the resulting matrix.
embeddings = model.encode(sentences)
print(embeddings.shape)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

(3, 384)


In [15]:
import faiss

# Show the embedding matrix and its dimensionality (number of columns).
print(embeddings)
embedding_dim = embeddings.shape[1]
print(embedding_dim)

# Build a flat index that compares vectors by Euclidean (L2) distance.
# IndexFlatL2 performs an exact, brute-force search over the stored vectors.
# The dimension is taken from the embeddings themselves instead of a hard-coded 384,
# so swapping in a model with a different output size cannot cause a mismatch in add().
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

[[ 0.01919576  0.12008542  0.15959832 ... -0.00536283 -0.081095
   0.05021336]
 [-0.01869037  0.0415187   0.07431546 ...  0.00486596 -0.06190438
   0.03187513]
 [ 0.13650198  0.08227321 -0.02526165 ...  0.08762047  0.03045842
  -0.01075751]]
384


In [17]:
# Embed the search query with the same `model` object used for the corpus sentences.
query = "How is the weather?"
# Encoding a single string (rather than a list) gives a 1-D vector; see the shape below.
query_vector = model.encode(query)
query_vector.shape

(384,)

In [21]:
import numpy as np

# The index search takes a 2-D array (one row per query), so turn the
# 1-D query vector into a single-row matrix.
two_dim_vector_query = np.reshape(np.array(query_vector), (1, -1))
two_dim_vector_query.shape

(1, 384)

In [22]:
# k is the number of nearest neighbours to retrieve.
# search() returns a pair: the distances to each hit, followed by the positions
# of those hits among the vectors that were added to the index.
distances, I = index.search(two_dim_vector_query, k=2)
print(distances, I)

[[0.673712   0.88967365]] [[0 1]]
