# In [2]:
import os
import ollama
import json

# Function to split text into chunks
def split_into_chunks(text: str, max_length: int = 512) -> list:
    """Group the sentences of *text* into chunks of at most *max_length* words.

    Sentences are delimited by the literal separator ``'. '`` and their length
    is measured in whitespace-separated words. Consecutive sentences are
    packed greedily into a chunk until adding the next one would exceed
    *max_length*; sentences inside a chunk are re-joined with ``'. '``.

    Args:
        text: The document to split.
        max_length: Maximum number of words per chunk.

    Returns:
        A list of chunk strings in document order. Empty or whitespace-only
        input yields an empty list. A single sentence longer than
        *max_length* is not split further and is kept in one oversized chunk.
        The ``'. '`` separator that fell on a chunk boundary is not restored.
    """
    # Nothing to chunk: avoid returning a single empty/whitespace chunk.
    if not text.strip():
        return []

    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in text.split('. '):
        sentence_length = len(sentence.split())
        # Flush only when the buffer already holds words. Without this guard
        # an oversized first sentence would flush an empty buffer and emit
        # an empty-string chunk.
        if current_length > 0 and current_length + sentence_length > max_length:
            chunks.append('. '.join(current_chunk))
            current_chunk = [sentence]
            current_length = sentence_length
        else:
            current_chunk.append(sentence)
            current_length += sentence_length

    # Emit whatever is left in the buffer as the final chunk.
    if current_chunk:
        chunks.append('. '.join(current_chunk))

    return chunks

# Directory containing the text files
directory = '/home/vishnu/Desktop/wiki/process_chunks'

# List to store the embeddings: one {"text", "embedding"} dict per chunk
embeddings_data = []

# Iterate through each file in the directory. os.listdir() returns entries in
# arbitrary order, so sort them to make the output order reproducible.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".txt"):
        continue

    # Read the whole file with an explicit encoding (instead of the
    # locale-dependent default) and close it before the embedding calls.
    path = os.path.join(directory, filename)
    with open(path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Split document into chunks
    chunks = split_into_chunks(content)

    for chunk in chunks:
        # An empty chunk carries no text to embed; skip it.
        if not chunk.strip():
            continue
        # Generate embeddings for each chunk
        response = ollama.embeddings(model="all-minilm", prompt=chunk)
        embeddings_data.append({
            "text": chunk,
            "embedding": response["embedding"],
        })

# Save embeddings to a JSON file
with open('embeddings.json', 'w', encoding='utf-8') as f:
    json.dump(embeddings_data, f)

# print("Embeddings saved to embeddings.json")  # Uncomment to see confirmation
