In [1]:
import os
import openai
import sys
# Add the directory two levels above the working directory to the import search path.
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

# Indexing os.environ (instead of .get) raises KeyError right here when
# OPENAI_API_KEY is not set, rather than failing later on the first API call.
openai.api_key  = os.environ['OPENAI_API_KEY']

In [2]:
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings

# Load PDF
# Every source PDF lives in one folder; edit PDF_DIR to point the notebook at
# a different copy of the data instead of touching each path.
PDF_DIR = "/Users/reem/Desktop/Langchain_tutorials"
PDF_FILES = ["covered topics.pdf", "llm.pdf"]  # two distinct documents

loaders = [PyPDFLoader(f"{PDF_DIR}/{name}") for name in PDF_FILES]

# Concatenate whatever each loader returns, in the order the files are listed.
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [3]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter

# chunk_size caps the length of each chunk and chunk_overlap is how much
# neighbouring chunks share (presumably measured in characters - confirm
# against the installed LangChain version).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

In [4]:
# Run the splitter over every loaded document; `splits` is what gets embedded
# and stored in the vector database further down.
splits = text_splitter.split_documents(docs)

In [5]:
# Number of chunks produced by the splitter (544 in the recorded run).
len(splits)

544

## Embeddings

Let's take our splits and embed them.

In [6]:
# One import is enough: the notebook's other cells already use this path for
# the same class, so the second, duplicate import was redundant.
from langchain.embeddings import OpenAIEmbeddings

# Embedding client built with default settings; reused for the sentence
# similarity demo and for populating the vector store.
embedding = OpenAIEmbeddings()

  embedding = OpenAIEmbeddings()


In [7]:
# Two sentences about liking dogs/canines and one about the weather,
# used as inputs for the embedding comparison.
sentence1, sentence2, sentence3 = (
    "i like dogs",
    "i like canines",
    "the weather is ugly outside",
)

In [9]:
# Embed the three sentences in order, one embed_query call per sentence.
embedding1, embedding2, embedding3 = map(
    embedding.embed_query,
    (sentence1, sentence2, sentence3),
)

In [10]:
import numpy as np

In [11]:
# Raw dot product of the "dogs" and "canines" embeddings. The recorded value
# agrees to four decimals with the cosine similarity printed later in the
# notebook, which suggests these vectors are (close to) unit length.
np.dot(embedding1, embedding2)

0.9631510802407719

In [12]:
# Raw dot product of the "dogs" and "weather" embeddings.
np.dot(embedding1, embedding3)

0.7702031204123156

In [13]:
# Raw dot product of the "canines" and "weather" embeddings.
np.dot(embedding2, embedding3)

0.7590539714454778

In [14]:
# using cosine similarity
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Args:
        a: 1-D sequence or array of numbers.
        b: 1-D sequence or array of numbers with the same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``. When either vector has zero length
        the angle is undefined; 0.0 is returned in that case instead of the
        ``nan`` (plus RuntimeWarning) that the bare division would produce.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        # Keep the numpy scalar type so every return path has the same type.
        return np.float64(0.0)
    return np.dot(a, b) / denom

# Compare similarities between sentences
sim_1_2 = cosine_similarity(embedding1, embedding2)
sim_1_3 = cosine_similarity(embedding1, embedding3)
sim_2_3 = cosine_similarity(embedding2, embedding3)

# Build each label from the sentence variables themselves so the printed text
# cannot drift from the strings that were actually embedded.
for left, right, score in (
    (sentence1, sentence2, sim_1_2),
    (sentence1, sentence3, sim_1_3),
    (sentence2, sentence3, sim_2_3),
):
    print(f"Similarity between '{left}' and '{right}': {score:.4f}")

Similarity between 'i like dogs' and 'i like canines': 0.9632
Similarity between 'i like dogs' and 'the weather is ugly outside': 0.7702
Similarity between 'i like canines' and 'the weather is ugly outside': 0.7591


## Vectorstores

In [15]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
import chromadb

In [16]:
# Relative path handed to Chroma.from_documents as persist_directory when the
# vector database is created below.
persist_directory = 'docs/chroma/'

In [17]:
import shutil

# remove old database files if any
# Pure-Python removal works on every platform and takes the path from
# persist_directory, so the directory that is wiped is always the one the
# vector store is about to write to. A missing directory is not an error.
shutil.rmtree(persist_directory, ignore_errors=True)

In [18]:
# Create the vector database
# Builds a Chroma store from the split chunks using the `embedding` client
# created earlier, with persist_directory as its storage location.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
    )

In [19]:
# Sanity check: the recorded count (544) matches len(splits) from earlier.
# NOTE(review): `_collection` has a leading underscore, i.e. it is a private
# attribute of the wrapper and may change between library versions.
print(vectordb._collection.count())

544


In [20]:
# Query text used for the similarity search in the next cell.
q= "what are the projects? "

In [21]:
# Retrieve k=2 results for the query from the vector store.
# NOTE(review): this rebinds `docs`, which earlier held the pages loaded from
# the PDFs. Re-running the split cell after this one would split the search
# hits instead of the source documents; a distinct name would avoid that.
docs= vectordb.similarity_search(q, k=2)

In [25]:
# Show the text of the first returned hit.
print(docs[0].page_content)

4 
Projects 
• Customized Object Detection Using YOLO: 
- Defining Program Pipeline 
- Data Collection And Labeling Using RoboFlow 
- Data Annotation And Augmentation Using RoboFlow 
- Training The Pre-Trained Model On Customized Dataset 
- Utilizing(epochs, Precision, Recall) 
- Testing And Deploying The Model 
Reference: 
https://drive.google.com/file/d/1qEmj6giDQuqDh1LnE54mXFXjTUxauXkj/vi
ew?usp=share_link 
 
• Face Detection And Tracking: 
- Defining Project Pipeline And Requirements 
- Custom Dataset Processing (Collection, Labelling, Annotation) 
- Fine Tuning Pre-Trained Model (YOLOv11) 
- Testing The Model Then Deploying it  
Reference: 
https://drive.google.com/file/d/1lhWVmCKeEFongyZQT9SWpgKzpNdBqKzI/
view?usp=share_link 
 
• RAG Chatbot Using DeepSeek-r1: 
- Defining The Pipeline 
- Installing And Importing Needed Libraries (Ollama, Gradio, 
LangChain, ChromaDB)
