### Exercise 1

In [16]:
from sentence_transformers import SentenceTransformer
import numpy as np

print("Libraries imported successfully")

Libraries imported successfully


In [17]:
# Sample document
sentences = [
    "The dog is playing in the park",
    "A puppy is running outside",
    "The cat is sleeping on the couch",
    "Python is a programming language",
    "Machine learning models need data",
    "I love coding in Python"
]

# Load a small, fast embedding model
print("Loading embedding model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("✅ Model loaded!")



Loading embedding model...
✅ Model loaded!


### Generate Embeddings

In [18]:
# Generate embedding
embeddings = model.encode(sentences)

# print(f"Original text: {sentences}")
print(f"Embedding shape: {embeddings.shape}")
print(f"Embedding type: {type(embeddings)}")
print(f"\nFirst 10 values: {embeddings[:10]}")

Embedding shape: (6, 384)
Embedding type: <class 'numpy.ndarray'>

First 10 values: [[ 0.04757566 -0.07015254  0.06429747 ...  0.07358795  0.01249521
   0.01645607]
 [-0.0252566   0.03054357  0.05249949 ...  0.01063142  0.01476685
   0.10832039]
 [ 0.1220389  -0.04751379 -0.00115911 ...  0.08472571  0.06573965
   0.0092331 ]
 [-0.03537082  0.038165   -0.04126012 ...  0.11130317  0.19625439
  -0.02897431]
 [ 0.01665926 -0.04558857  0.02346508 ...  0.02717928 -0.03379644
  -0.05370044]
 [-0.06430853  0.0156419  -0.04678493 ...  0.15115134  0.10791415
  -0.04270948]]


### Calculate Similarity scores

In [19]:
# Function for cosine similarity
def cosine_similarity(vec1, vec2):
    """
    Calculate cosine similarity between two vectors.
    
    Returns a score between -1 and 1 (higher = more similar)
    """
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    return dot_product / (norm1 * norm2)

print("Similarity function ready!")


# Compare first sentence to all others
print("Comparing to: 'The dog is playing in the park'\n")
for i, sentence in enumerate(sentences):
    similarity = cosine_similarity(embeddings[0], embeddings[i])
    print(f"Similarity to '{sentence}'")
    print(f"Score: {similarity:.3f}\n")



Similarity function ready!
Comparing to: 'The dog is playing in the park'

Similarity to 'The dog is playing in the park'
Score: 1.000

Similarity to 'A puppy is running outside'
Score: 0.398

Similarity to 'The cat is sleeping on the couch'
Score: 0.071

Similarity to 'Python is a programming language'
Score: 0.099

Similarity to 'Machine learning models need data'
Score: -0.005

Similarity to 'I love coding in Python'
Score: 0.090



In [20]:
# Compare fourth sentence to all others
print("Comparing to: 'TPython is a programming language'\n")
for i, sentence in enumerate(sentences):
    similarity = cosine_similarity(embeddings[3], embeddings[i])
    print(f"Similarity to '{sentence}'")
    print(f"Score: {similarity:.3f}\n")

Comparing to: 'TPython is a programming language'

Similarity to 'The dog is playing in the park'
Score: 0.099

Similarity to 'A puppy is running outside'
Score: 0.040

Similarity to 'The cat is sleeping on the couch'
Score: 0.020

Similarity to 'Python is a programming language'
Score: 1.000

Similarity to 'Machine learning models need data'
Score: 0.113

Similarity to 'I love coding in Python'
Score: 0.730



## Exercise 2: Chunk size impact on Retrieval

In [21]:
# Document to work on
document = """
Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to
the natural intelligence displayed by humans and animals. Leading AI textbooks define
the field as the study of intelligent agents: any device that perceives its environment
and takes actions that maximize its chance of successfully achieving its goals.

Machine learning is a subset of artificial intelligence that focuses on the use of data
and algorithms to imitate the way that humans learn, gradually improving its accuracy.
Machine learning is an important component of the growing field of data science.

Deep learning is part of a broader family of machine learning methods based on artificial
neural networks with representation learning. Learning can be supervised, semi-supervised
or unsupervised. Deep learning architectures such as deep neural networks, deep belief
networks, recurrent neural networks and convolutional neural networks have been applied
to fields including computer vision, speech recognition, natural language processing,
machine translation, and bioinformatics.

Natural language processing is a subfield of linguistics, computer science, and artificial
intelligence concerned with the interactions between computers and human language, in
particular how to program computers to process and analyze large amounts of natural
language data. Challenges in natural language processing frequently involve speech
recognition, natural language understanding, and natural language generation.
"""


In [22]:
class SimpleRetriever:
  def __init__(self, model_name='all-MiniLM-L6-v2'):
    """
    Initialize retriever with embedding model.
    """
    self.model = SentenceTransformer(model_name)
    self.chunks = []
    self.embeddings = None

  def add_documents(self, documents, chunk_size=500):
    """
    Add documents to the retriever (chunks and embeds them).
    """

    # Simple chunking (from Module 2)
    for doc in documents:
      words = doc.split()
      for i in range(0, len(words), chunk_size):
        chunk = ' '.join(words[i:i+chunk_size])
        self.chunks.append(chunk)

    # Generate embeddings
    print(f"Embedding {len(self.chunks)} chunks ...")
    self.embeddings = self.model.encode(self.chunks)
    print(f"✅ Ready! {len(self.chunks)} chunks indexed")

  def search(self, query, top_k=3):
    """
    Search for relevant chunks
    """
    # Embed query
    query_embedding = self.model.encode(query)

    # Calculate Similarities
    similarities = []
    for i, chunk_emb in enumerate(self.embeddings):
      sim = cosine_similarity(query_embedding, chunk_emb)
      similarities.append((self.chunks[i], sim))

    # Sort and return top k
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:top_k]
  
print("SimpleRetriever class ready!")

    

SimpleRetriever class ready!


### Small chunks

In [30]:

# Create retriever and add documents
retriever = SimpleRetriever()
retriever.add_documents(document, chunk_size=100)

# Test searches
test_queries = [
    "What is Machine learning",
    "How can i classify Natural Language Processing"
]

for query in test_queries:
    print(f"\n{'='*80}")
    print(f"Query: {query}")
    print(f"{'='*80}")
    results = retriever.search(query, top_k=3)
    for i, (chunk, score) in enumerate(results, 1):
        print(f"\nResult {i} (Score: {score:.3f}):")
        print(chunk.strip())

Embedding 1294 chunks ...
✅ Ready! 1294 chunks indexed

Query: What is Machine learning

Result 1 (Score: 0.140):
k

Result 2 (Score: 0.140):
k

Result 3 (Score: 0.140):
k

Query: How can i classify Natural Language Processing

Result 1 (Score: 0.118):
A

Result 2 (Score: 0.118):
a

Result 3 (Score: 0.118):
A


In [28]:
# Document to work on
document2 = """
Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to
the natural intelligence displayed by humans and animals. Leading AI textbooks define
the field as the study of intelligent agents: any device that perceives its environment
and takes actions that maximize its chance of successfully achieving its goals.

Machine learning is a subset of artificial intelligence that focuses on the use of data
and algorithms to imitate the way that humans learn, gradually improving its accuracy.
Machine learning is an important component of the growing field of data science.

Deep learning is part of a broader family of machine learning methods based on artificial
neural networks with representation learning. Learning can be supervised, semi-supervised
or unsupervised. Deep learning architectures such as deep neural networks, deep belief
networks, recurrent neural networks and convolutional neural networks have been applied
to fields including computer vision, speech recognition, natural language processing,
machine translation, and bioinformatics.

Natural language processing is a subfield of linguistics, computer science, and artificial
intelligence concerned with the interactions between computers and human language, in
particular how to program computers to process and analyze large amounts of natural
language data. Challenges in natural language processing frequently involve speech
recognition, natural language understanding, and natural language generation.
"""


#### Medium chunks (200 Characters)

In [29]:

# Create retriever and add documents
retriever = SimpleRetriever()
retriever.add_documents(document2, chunk_size=200)

# Test searches
test_queries = [
    "What is Machine learning"
]

for query in test_queries:
    print(f"\n{'='*80}")
    print(f"Query: {query}")
    print(f"{'='*80}")
    results = retriever.search(query, top_k=3)
    for i, (chunk, score) in enumerate(results, 1):
        print(f"\nResult {i} (Score: {score:.3f}):")
        print(chunk.strip())

Embedding 1294 chunks ...
✅ Ready! 1294 chunks indexed

Query: What is Machine learning

Result 1 (Score: 0.140):
k

Result 2 (Score: 0.140):
k

Result 3 (Score: 0.140):
k


#### Large Characters (400 Characters)

In [25]:

# Create retriever and add documents
retriever = SimpleRetriever()
retriever.add_documents(document, chunk_size=400)

# Test searches
test_queries = [
    "What is Machine learning"
]

for query in test_queries:
    print(f"\n{'='*80}")
    print(f"Query: {query}")
    print(f"{'='*80}")
    results = retriever.search(query, top_k=3)
    for i, (chunk, score) in enumerate(results, 1):
        print(f"\nResult {i} (Score: {score:.3f}):")
        print(chunk.strip())

Embedding 1294 chunks ...
✅ Ready! 1294 chunks indexed

Query: What is Machine learning

Result 1 (Score: 0.140):
k

Result 2 (Score: 0.140):
k

Result 3 (Score: 0.140):
k
