In [11]:
# load the conversation data from exampleConversation.txt
# each line is a segment from a single speaker (possibly containing multiple sentences)

import os
import re  # added to split segments into sentences

def load_conversation(file_path):
    """
    Load conversation data from a file and split each segment into sentences.
    
    Args:
        file_path (str): Path to the conversation file.
    
    Returns:
        list: List of individual sentences.
    """
    
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file {file_path} does not exist.")
    
    conversation = []
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if line:
                # Split the segment into sentences using punctuation as delimiters
                sentences = re.split(r'(?<=[.!?])\s+', line)
                conversation.extend(sentences)
    
    return conversation

fileName = "exampleConversation.txt"
conversation = load_conversation(fileName)

def chunk_sentences(sentences, chunk_size):
    """
    Splits a list of sentences into chunks with a two-sentence overlap.
    
    Args:
        sentences (list): List of sentences.
        chunk_size (int): Number of sentences per chunk (must be >= 3).
    
    Returns:
        list: A list of chunks, each a list of sentences.
    """
    if chunk_size < 3:
        raise ValueError("chunk_size must be at least 3")
    
    chunks = []
    # The step is chunk_size minus the two overlapping sentences
    step = chunk_size - 2
    total = len(sentences)
    
    for i in range(0, total, step):
        if i + chunk_size > total:
            break  # or optionally add the last chunk even if incomplete
        chunks.append(sentences[i : i + chunk_size])
    
    return chunks

# Example usage:
# This will create, for example, chunks like:
# [sentences[0], sentences[1], sentences[2]], then [sentences[1], sentences[2], sentences[3]], etc.
chunks = chunk_sentences(conversation, 3)
print(f"Total chunks (chunk size 3): {len(chunks)}")
for i, chunk in enumerate(chunks, start=1):
    print(f"Chunk {i}: {chunk}")

# Printing some statistics about the conversation
print(f"Number of sentences: {len(conversation)}")
# Number of words in the conversation
word_count = sum(len(sentence.split()) for sentence in conversation)
print(f"Total number of words: {word_count}")


Total chunks (chunk size 3): 133
Chunk 1: ['Microsoft co-founder and philanthropist Bill Gates  is the pioneering tech titan  who helped lead the computer revolution  and plans to donate the majority  of his multi-billion dollar fortune to global causes.', "Now he's sharing his journey  and how it began in his new memoir, Source Code.", 'Please welcome, for the very first time,  on The View, Bill Gates.']
Chunk 2: ["Now he's sharing his journey  and how it began in his new memoir, Source Code.", 'Please welcome, for the very first time,  on The View, Bill Gates.', 'So, first of all, welcome to The View.']
Chunk 3: ['Please welcome, for the very first time,  on The View, Bill Gates.', 'So, first of all, welcome to The View.', 'Wow.']
Chunk 4: ['So, first of all, welcome to The View.', 'Wow.', "But secondly, it's kind of great to have you  at such an important time for the country  because it seems like today's tech billionaires  are shifting rightward and seem to have more power  than e

In [12]:
import logging
from sentence_transformers import SentenceTransformer

# Set logging to INFO level so that you can see more details in console output
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

sentences = chunks  # your list of sentences
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Encode with the progress bar enabled
embeddings = model.encode(sentences, show_progress_bar=True, batch_size=32)


2025-04-08 18:44:57,720 - INFO - Use pytorch device_name: cuda:0
2025-04-08 18:44:57,720 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
2025-04-08 18:44:57,720 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

In [17]:
# Small code snippet to estimate the token count of the conversation text for OpenAI API
import tiktoken

# Choose the model name for which you want to count tokens; e.g., "gpt-4"
model_name = "gpt-4"
enc = tiktoken.encoding_for_model(model_name)

# Flatten the conversation list to a single string
conversation_text = " ".join(conversation)

num_tokens = len(enc.encode(conversation_text))
print(f"Estimated token count: {num_tokens}")


Estimated token count: 1770


In [13]:
import numpy as np
# Method to fancy print some statistics about the embeddings
def print_statistics(embeddings):
    """
    Print statistics about the embeddings.
    
    Args:
        embeddings (list): List of embeddings.
    """
    
    print(f"Number of sentences: {len(embeddings)}")
    print(f"Embedding size: {len(embeddings[0])}")
    print(f"First embedding: {embeddings[0][:5]}...")  # Print first 5 dimensions
    print(f"Size of file: {os.path.getsize('exampleConversation_embeddings.npy') / (1024 * 1024):.2f} MB")


# Save the embeddings to a file
embeddings_file = "exampleConversation_embeddings.npy"
np.save(embeddings_file, embeddings)
print(f"Embeddings saved to {embeddings_file}")

# Print statistics about the embeddings
print_statistics(embeddings)

Embeddings saved to exampleConversation_embeddings.npy
Number of sentences: 133
Embedding size: 768
First embedding: [-0.02305532  0.0953411   0.01701605  0.01112299 -0.02063525]...
Size of file: 0.39 MB


In [15]:
import numpy as np

# Function to compute cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

# Function to find the top k most similar segments given a query
def find_top_k(query, embeddings, sentences, k=2):
    # Compute the embedding for the query
    query_embedding = model.encode([query])[0]
    
    # Compute cosine similarities between query and each embedding
    similarities = np.array([cosine_similarity(query_embedding, emb) for emb in embeddings])
    
    # Get indices of the top k most similar embeddings (higher similarity first)
    top_k_indices = similarities.argsort()[-k:][::-1]
    
    # Return the corresponding text segments with their similarity scores
    return [(sentences[i], similarities[i]) for i in top_k_indices]

# Example usage
query = input("Enter a query: ")
results = find_top_k(query, embeddings, chunks, k=3)

print("\nTop relevant conversation segments:")
for i, (segment, score) in enumerate(results, start=1):
    print(f"{i}. (Score: {score:.3f}) {segment}")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Top relevant conversation segments:
1. (Score: 0.821) ["And it won't be the last pandemic.", 'The next one could be far more severe.', 'I mean, this one, it killed millions.']
2. (Score: 0.717) ['Yeah, the pandemic, sadly, was fairly predictable.', "And it won't be the last pandemic.", 'The next one could be far more severe.']
3. (Score: 0.708) ['I was reading, though, that you said the next pandemic, you are predicting within maybe 25 years.', 'Is that accurate?', "Oh, certainly we'll have one in the next 25 years."]
