In [1]:
from sentence_transformers import SentenceTransformer
import numpy as np

print("✅ Libraries imported successfully!")

  from .autonotebook import tqdm as notebook_tqdm


✅ Libraries imported successfully!


In [None]:
# Instantiate the compact MiniLM sentence encoder, announcing the load
# before and after so the reader can see when it finishes.
print("Loading embedding model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("✅ Model loaded!")

# Report the length of the vectors this model emits.
embedding_dim = model.get_sentence_embedding_dimension()
print(f"Model produces {embedding_dim} dimensional embeddings")

In [None]:
# Six short example sentences covering a few unrelated topics (a dog, a puppy,
# a cat, Python, machine learning). They are the input that gets embedded in
# the following cell.
sentences = [
    "The dog is playing in the park",
    "A puppy is running outside",
    "The cat is sleeping on the couch",
    "Python is a programming language",
    "Machine learning models need data",
    "I love coding in Python"
]

In [None]:

# Encode every sentence in one batch. Because a list is passed in, the result
# holds one embedding vector per sentence (one row each), which is why the
# shape printed below has two dimensions.
embedding = model.encode(sentences)

print(f"Original text: {sentences}")
print(f"Embedding shape: {embedding.shape}")
print(f"Embedding type: {type(embedding)}")
# Slice *inside* the first row. `embedding[:10]` would select the first ten
# rows -- i.e. every full vector for these six sentences -- rather than the
# first ten values the label promises.
print(f"\nFirst 10 values: {embedding[0][:10]}")

In [None]:
def cosine_similarity(vec1, vec2):
    """
    Calculate cosine similarity between two vectors.

    Args:
        vec1: First vector (1-D array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        A score between -1 and 1 (higher = more similar). If either vector
        has zero magnitude the angle is undefined, so 0.0 is returned
        instead of dividing by zero and producing NaN.
    """
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    denominator = norm1 * norm2
    # Guard the degenerate case: a zero-length vector would otherwise give
    # 0/0 -> NaN (plus a RuntimeWarning), which breaks any later ranking.
    if denominator == 0:
        return 0.0
    return dot_product / denominator

print("✅ Similarity function ready!")

In [None]:
# Create test sentences
sentences = [
     "The dog is playing in the park",
    "A puppy is running outside",
    "The cat is sleeping on the couch",
    "Python is a programming language",
    "Machine learning models need data",
    "I love coding in Python"
]

# Generate embeddings for all sentences
embeddings = model.encode(sentences)

# Compare first sentence to all others
print("Comparing to: 'The dog is playing in the park'\n")
for i, sentence in enumerate(sentences):
    similarity = cosine_similarity(embeddings[0], embeddings[i])
    print(f"Similarity to '{sentence}'")
    print(f"Score: {similarity:.3f}\n")

In [None]:
# Create test sentences
sentences = [
     "The dog is playing in the park",
    "A puppy is running outside",
    "The cat is sleeping on the couch",
    "Python is a programming language",
    "Machine learning models need data",
    "I love coding in Python"
]

# Generate embeddings for all sentences
embeddings = model.encode(sentences)

# Compare first sentence to all others
print("Comparing to: 'Python is a programming language'\n")
for i, sentence in enumerate(sentences):
    similarity = cosine_similarity(embeddings[3], embeddings[i])
    print(f"Similarity to '{sentence}'")
    print(f"Score: {similarity:.3f}\n")

Exercise 2

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

def cosine_similarity(vec1, vec2):
    """
    Calculate cosine similarity between two vectors.

    Args:
        vec1: First vector (1-D array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        A score between -1 and 1 (higher = more similar). If either vector
        has zero magnitude the angle is undefined, so 0.0 is returned
        instead of dividing by zero and producing NaN.
    """
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    denominator = norm1 * norm2
    # Guard the degenerate case: a zero-length vector would otherwise give
    # 0/0 -> NaN (plus a RuntimeWarning), which breaks any later ranking.
    if denominator == 0:
        return 0.0
    return dot_product / denominator

class SimpleRetriever:
    """
    Minimal semantic retriever over a single document.

    The document is cut into fixed-size character chunks, each chunk is
    embedded with a sentence-embedding model, and queries are answered by
    ranking the chunks by cosine similarity to the query embedding.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """
        Initialize retriever with embedding model.

        Args:
            model_name: Name handed to ``SentenceTransformer`` to select the
                embedding model.
        """
        self.model = SentenceTransformer(model_name)
        # Cleaned text chunks of the currently loaded document.
        self.chunks = []
        # Embeddings aligned index-for-index with ``self.chunks``;
        # None while no chunks are loaded.
        self.embeddings = None

    def add_document(self, document, chunk_size):
        """
        Chunk a single document and load its chunks and embeddings.

        The document is cut every ``chunk_size`` characters. Whitespace runs
        inside each chunk are collapsed to single spaces and chunks that end
        up empty are dropped. Any previously loaded document is REPLACED,
        not appended to.

        Args:
            document: Text to index.
            chunk_size: Number of characters per chunk; must be positive.

        Raises:
            ValueError: If ``chunk_size`` is zero or negative. The retriever's
                previous contents are left untouched in that case.
        """
        # Validate before touching state: a negative step would silently
        # produce no chunks, and a zero step would fail only after the old
        # chunks had already been discarded.
        if chunk_size <= 0:
            raise ValueError(
                f"chunk_size must be a positive integer, got {chunk_size!r}"
            )

        new_chunks = []
        for start in range(0, len(document), chunk_size):
            # split() with no arguments both trims and collapses whitespace.
            chunk = " ".join(document[start:start + chunk_size].split())
            if chunk:
                new_chunks.append(chunk)

        # Skip the model call when there is nothing to embed, and only commit
        # the new state once encoding has succeeded so chunks and embeddings
        # can never get out of step.
        new_embeddings = self.model.encode(new_chunks) if new_chunks else None
        self.chunks = new_chunks
        self.embeddings = new_embeddings

    def search(self, query, top_k=3):
        """
        Search for relevant chunks.

        Args:
            query: Query text to embed and compare against the chunks.
            top_k: Maximum number of results to return.

        Returns:
            A list of ``(chunk_text, similarity)`` tuples sorted from most to
            least similar, at most ``top_k`` long. Returns an empty list when
            no document has been loaded, the document produced no chunks, or
            ``top_k`` is not positive.
        """
        # Nothing to rank: avoid iterating over None and avoid the negative
        # slice that would otherwise return "all but the last" results.
        if top_k <= 0 or self.embeddings is None or len(self.chunks) == 0:
            return []

        query_embedding = self.model.encode(query)

        # Pair every chunk with its similarity to the query.
        scored = [
            (chunk, cosine_similarity(query_embedding, chunk_emb))
            for chunk, chunk_emb in zip(self.chunks, self.embeddings)
        ]

        # Highest similarity first, then keep the top k.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:top_k]

# Sample corpus for the chunk-size experiment: four short paragraphs on
# artificial intelligence, machine learning, deep learning, and natural
# language processing. Because of the triple-quote layout the text starts and
# ends with a newline, and paragraphs are separated by blank lines.
document = """
Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to
the natural intelligence displayed by humans and animals. Leading AI textbooks define
the field as the study of intelligent agents: any device that perceives its environment
and takes actions that maximize its chance of successfully achieving its goals.

Machine learning is a subset of artificial intelligence that focuses on the use of data
and algorithms to imitate the way that humans learn, gradually improving its accuracy.
Machine learning is an important component of the growing field of data science.

Deep learning is part of a broader family of machine learning methods based on artificial
neural networks with representation learning. Learning can be supervised, semi-supervised
or unsupervised. Deep learning architectures such as deep neural networks, deep belief
networks, recurrent neural networks and convolutional neural networks have been applied
to fields including computer vision, speech recognition, natural language processing,
machine translation, and bioinformatics.

Natural language processing is a subfield of linguistics, computer science, and artificial
intelligence concerned with the interactions between computers and human language, in
particular how to program computers to process and analyze large amounts of natural
language data. Challenges in natural language processing frequently involve speech
recognition, natural language understanding, and natural language generation.
"""

query = "What is machine learning?"
retriever = SimpleRetriever()
chunk_sizes = [100, 200, 400]
results_summary = {}

# Hand-written commentary for each chunk size tried below. These texts are
# fixed observations, not computed from the search results.
analysis_by_size = {
    100: "Chunks are too small, cutting sentences mid-thought. The top result captures the phrase 'Machine learning is' but lacks the full definition, leading to a less focused answer.",
    200: "This size captures complete or near-complete sentences containing the core definition of machine learning. The top result provides a clear, focused answer to the query.",
    400: "The chunk is larger and contains the definition, but it also includes less relevant information about data science. While complete, it is less focused than the medium chunk.",
}

# Horizontal rule printed above and below each section heading.
separator = '=' * 30

print("Chunk Size Comparison:\n")

for size in chunk_sizes:
    # Re-index the same document at this chunk size, then run the query.
    retriever.add_document(document, chunk_size=size)
    results = retriever.search(query, top_k=3)

    print(separator)
    print(f"Chunk Size: {size} characters")
    print(separator)
    print(f"- Number of chunks: {len(retriever.chunks)}")

    # Guard clause: nothing to report when the search came back empty.
    if not results:
        print("- No results found.\n")
        continue

    top_chunk, top_score = results[0]
    print(f"- Top result: \"{top_chunk}\"")
    print(f"- Score: {top_score:.3f}")

    # Sizes without a prepared commentary fall back to an empty string.
    analysis = analysis_by_size.get(size, "")
    print(f"- Analysis: {analysis}\n")

    results_summary[size] = {"top_chunk": top_chunk, "score": top_score}

# --- Final Comparison ---

print("Best chunk size for this use case: **200 characters**")
print("because it provided the most **focused** answer. The chunk size was sufficient to capture the complete definition of 'machine learning' in a single chunk without including too much extraneous information, as was the case with the larger chunk size. The smaller chunks were too fragmented to provide a complete answer.")