## Document Loaders

#### PDF Loader

In [13]:
from langchain_community.document_loaders import PyPDFLoader

In [14]:
# Loader for the book PDF, addressed by a path relative to the working directory.
loader = PyPDFLoader("IS_Book.pdf")

In [15]:
# Read the PDF into Document objects (indexed by position further down).
# The "Ignoring wrong pointing object" lines in the output are printed during this call.
pdf_pages = loader.load()

Ignoring wrong pointing object 0 0 (offset 0)
Ignoring wrong pointing object 1052 0 (offset 0)
Ignoring wrong pointing object 1071 0 (offset 0)
Ignoring wrong pointing object 1347 0 (offset 0)
Ignoring wrong pointing object 1852 0 (offset 0)


In [16]:
# Peek at the first loaded page: a Document with file metadata and the page text.
pdf_pages[0]

Document(metadata={'producer': 'Mac OS X 10.4.10 Quartz PDFContext', 'creator': 'NeoOffice', 'creationdate': '2007-10-16T17:41:55-04:00', 'author': 'Richard Watson', 'moddate': '2007-10-16T17:41:55-04:00', 'title': 'IS book.odm', 'source': 'IS_Book.pdf', 'total_pages': 221, 'page': 0, 'page_label': '1'}, page_content='Information SystemsRichard T. Watson (editor)University of Georgia\nCopyright © 2007 by the Global Text Project\nThis book is licensed under a Creative Commons Attribution 3.0 License')

#### YouTube Video Loader

In [17]:
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers.audio import FasterWhisperParser
from langchain_community.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader

In [18]:
# Video whose audio is downloaded and transcribed in the next cell.
url = "https://www.youtube.com/watch?v=ijAMJqtDJlo"

In [19]:
# Fetch the video's audio into a local folder, then turn it into Documents
# by running the audio through the Whisper-based parser.
video_dir = "docs/youtube/"
audio_source = YoutubeAudioLoader(urls=[url], save_dir=video_dir)
transcriber = FasterWhisperParser()
loader = GenericLoader(audio_source, transcriber)
docs = loader.load()

[youtube] Extracting URL: https://www.youtube.com/watch?v=ijAMJqtDJlo
[youtube] ijAMJqtDJlo: Downloading webpage




[youtube] ijAMJqtDJlo: Downloading android vr player API JSON
[info] ijAMJqtDJlo: Downloading 1 format(s): 140
[download] docs\youtube\8｜ Introduction to Information Systems, Business applications, Information Systems and E Business.m4a has already been downloaded
[download] 100% of    8.37MiB
[ExtractAudio] Not converting audio docs\youtube\8｜ Introduction to Information Systems, Business applications, Information Systems and E Business.m4a; file is already in target format m4a


In [20]:
# First transcript segment; its metadata records the source audio file,
# the segment timestamps and the detected language.
docs[0]

Document(metadata={'source': 'docs\\youtube\\8｜ Introduction to Information Systems, Business applications, Information Systems and E Business.m4a', 'timestamps': '[0.00s -> 12.94s]', 'language': 'hi', 'probability': '100%'}, page_content=' Seo')

In [21]:
# One flat list: every PDF page first, followed by every transcript segment.
combined_docs = [*pdf_pages, *docs]

In [22]:
# Sanity check: the PDF pages were concatenated first, so element 0 is the first PDF page.
combined_docs[0]

Document(metadata={'producer': 'Mac OS X 10.4.10 Quartz PDFContext', 'creator': 'NeoOffice', 'creationdate': '2007-10-16T17:41:55-04:00', 'author': 'Richard Watson', 'moddate': '2007-10-16T17:41:55-04:00', 'title': 'IS book.odm', 'source': 'IS_Book.pdf', 'total_pages': 221, 'page': 0, 'page_label': '1'}, page_content='Information SystemsRichard T. Watson (editor)University of Georgia\nCopyright © 2007 by the Global Text Project\nThis book is licensed under a Creative Commons Attribution 3.0 License')

In [23]:
# Total number of documents (PDF pages plus transcript segments).
len(combined_docs)

483

#### Chunking

In [24]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [25]:
# Split every loaded document into overlapping chunks; these chunks are what
# gets embedded and stored in the vector database later on.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=20,
)
texts = text_splitter.split_documents(combined_docs)

In [26]:
# Number of chunks produced by the splitter.
len(texts)

1334

#### Embeddings

In [1]:
from langchain_community.embeddings import HuggingFaceEmbeddings

In [2]:
# Hugging Face model that backs every embedding computed in this section.
EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-large"
ml_embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

  ml_embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")
  from .autonotebook import tqdm as notebook_tqdm
Loading weights: 100%|█████████████████████| 391/391 [00:01<00:00, 312.70it/s, Materializing param=pooler.dense.weight]
[1mXLMRobertaModel LOAD REPORT[0m from: intfloat/multilingual-e5-large
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [3]:
# Three probe sentences: the first two are near-identical, the third is unrelated.
text1, text2, text3 = (
    "Hello I am Windows",
    "Hello I am Linux",
    "Tesla is Perfect",
)

In [4]:
# Embed the probe sentences one after another, in the same order as they were defined.
embedding1, embedding2, embedding3 = (
    ml_embeddings.embed_query(sentence) for sentence in (text1, text2, text3)
)

In [5]:
# Show the dimensionality of each embedding on a single line.
print(*(len(vector) for vector in (embedding1, embedding2, embedding3)))

1024 1024 1024


In [6]:
import numpy as np

In [7]:
# Dot product of the two near-identical sentences (text1 vs text2).
# NOTE(review): this reads as cosine similarity only if the embeddings are unit length — confirm.
np.dot(embedding1,embedding2)

np.float64(0.9368696165766722)

In [8]:
# Dot product of text2 and the unrelated sentence text3, for comparison with the pair above.
np.dot(embedding2,embedding3)

np.float64(0.8179120134146922)

In [9]:
# Dot product of text1 and the unrelated sentence text3.
np.dot(embedding1,embedding3)

np.float64(0.8146183240731355)

In [10]:
from langchain_community.vectorstores import Chroma

In [27]:
# Embed every chunk and store it in a Chroma index persisted on disk.
# The directory is project-relative, like the other paths in this notebook,
# rather than a folder at the filesystem root.
# NOTE(review): re-running this cell against an existing directory presumably
# adds the same chunks again — confirm, and clear the directory first if so.
vector_db = Chroma.from_documents(
    documents=texts,
    embedding=ml_embeddings,
    persist_directory="db/chroma/",
)

In [28]:
# Retrieve the stored chunks that are most similar to the question.
vector_db.similarity_search("What is Information system")

[Document(metadata={'source': 'docs\\youtube\\8｜ Introduction to Information Systems, Business applications, Information Systems and E Business.m4a', 'timestamps': '[392.36s -> 394.36s]', 'language': 'hi', 'probability': '100%'}, page_content='So, what is the information system?'),
 Document(metadata={'timestamps': '[527.36s -> 529.36s]', 'source': 'docs\\youtube\\8｜ Introduction to Information Systems, Business applications, Information Systems and E Business.m4a', 'language': 'hi', 'probability': '100%'}, page_content='So, what is the information system?'),
 Document(metadata={'source': 'docs\\youtube\\8｜ Introduction to Information Systems, Business applications, Information Systems and E Business.m4a', 'timestamps': '[142.64s -> 144.32s]', 'probability': '100%', 'language': 'hi'}, page_content='what will happen to the information system'),
 Document(metadata={'source': 'docs\\youtube\\8｜ Introduction to Information Systems, Business applications, Information Systems and E Business.

In [29]:
# Same kind of lookup, but each hit comes back as a (Document, score) pair.
# NOTE(review): in the output the best matches carry the smallest scores, so the
# score looks like a distance rather than a similarity — confirm.
vector_db.similarity_search_with_score("What is Information Systems")

[(Document(metadata={'probability': '100%', 'language': 'hi', 'timestamps': '[392.36s -> 394.36s]', 'source': 'docs\\youtube\\8｜ Introduction to Information Systems, Business applications, Information Systems and E Business.m4a'}, page_content='So, what is the information system?'),
  0.15758031606674194),
 (Document(metadata={'probability': '100%', 'source': 'docs\\youtube\\8｜ Introduction to Information Systems, Business applications, Information Systems and E Business.m4a', 'timestamps': '[527.36s -> 529.36s]', 'language': 'hi'}, page_content='So, what is the information system?'),
  0.15758031606674194),
 (Document(metadata={'source': 'docs\\youtube\\8｜ Introduction to Information Systems, Business applications, Information Systems and E Business.m4a', 'language': 'hi', 'probability': '100%', 'timestamps': '[271.16s -> 273.08s]'}, page_content='there are some information systems'),
  0.2061801254749298),
 (Document(metadata={'language': 'hi', 'probability': '100%', 'timestamps': '[

#### Retrieval

In [30]:
pip install langgraph

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.2.2 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
