In [None]:
import os
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
# Cap PyTorch at one intra-op CPU thread. The stated rationale is efficiency on
# small-scale workloads (less thread-pool overhead on tiny tensors).
# NOTE(review): this also applies to any large CPU-bound batch run later in the
# notebook — confirm that a single thread is still what is wanted there.
torch.set_num_threads(1)

# Load text files from a directory
def load_text_files_from_directory(directory):
    """Read every ``.txt`` file that sits directly inside ``directory``.

    Files are visited in sorted (plain lexicographic) filename order.
    ``os.listdir`` returns entries in arbitrary, filesystem-dependent order, so
    sorting is what makes the result reproducible and lets a caller line up the
    contents of two directories by position.

    Args:
        directory: Path of the directory to scan. Sub-directories are not
            descended into, and non-``.txt`` entries are ignored.

    Returns:
        list[str]: The UTF-8 decoded contents of each ``.txt`` file with
        leading/trailing whitespace stripped, ordered by filename.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    files_content = []
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith('.txt'):
            continue
        file_path = os.path.join(directory, file_name)
        # A directory may itself be named "*.txt"; only regular files are read.
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as file:
                files_content.append(file.read().strip())
    return files_content

# Token-aware chunking: imports and the GPT-2 tokenizer used to measure chunk sizes
import re
from transformers import AutoTokenizer

# GPT-2 tokenizer, created once at module level. split_text_for_gpt2_rag reads
# this global to measure chunk sizes in tokens.
# NOTE(review): from_pretrained("gpt2") presumably needs network access (or a
# warm local cache) the first time it runs — confirm for offline use.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

def split_text_for_gpt2_rag(text, max_chunk_tokens=512):
    """Split ``text`` into chunks of at most ``max_chunk_tokens`` tokens.

    The text is first cut into sentences (after ``.``, ``!`` or ``?`` followed
    by one or more spaces). Sentences are then packed greedily: a sentence is
    appended to the current chunk, joined by a single space, as long as the
    joined text still fits; otherwise the current chunk is closed and a new one
    is started.

    A single sentence that is longer than the limit on its own (common for text
    whose sentences end in newlines rather than spaces) is broken further at
    whitespace boundaries, keeping its internal whitespace intact, so that it
    no longer produces an oversized chunk. Only a whitespace-free run that
    exceeds the limit by itself is left unsplit.

    Token counts come from the module-level ``tokenizer`` via
    ``tokenizer.encode(..., add_special_tokens=False)``.

    Args:
        text: The text to split.
        max_chunk_tokens: Upper bound on the token count of each chunk.

    Returns:
        list[str]: The chunks in document order, each stripped of leading and
        trailing whitespace. Empty input yields an empty list.
    """
    def token_len(piece):
        # Number of tokens the shared tokenizer produces for ``piece``.
        return len(tokenizer.encode(piece, add_special_tokens=False))

    def split_long_sentence(sentence):
        # Greedily pack whitespace-delimited fragments of an oversized sentence.
        # Each fragment carries its own leading whitespace, so concatenating
        # fragments reproduces the original text exactly.
        pieces = []
        buffer = ""
        for fragment in re.findall(r'\s*\S+', sentence):
            if buffer and token_len(buffer + fragment) > max_chunk_tokens:
                pieces.append(buffer.strip())
                buffer = fragment.lstrip()
            else:
                buffer += fragment
        if buffer:
            pieces.append(buffer.strip())
        return pieces

    chunks = []
    current_chunk = ""

    for sentence in re.split(r'(?<=[.!?]) +', text):
        candidate = current_chunk + " " + sentence if current_chunk else sentence
        if token_len(candidate) <= max_chunk_tokens:
            current_chunk = candidate
            continue

        # When the current chunk is empty, ``candidate`` was the bare sentence,
        # so the failed check above already proved it does not fit on its own.
        fits_alone = bool(current_chunk) and token_len(sentence) <= max_chunk_tokens

        if current_chunk:
            chunks.append(current_chunk.strip())

        if fits_alone:
            current_chunk = sentence  # start new chunk
        else:
            pieces = split_long_sentence(sentence)
            # All but the last piece are complete chunks; the last one stays
            # open so following sentences can still be packed onto it.
            chunks.extend(pieces[:-1])
            current_chunk = pieces[-1] if pieces else ""

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

def combine_problems_and_editorials(problem_statements_path, editorials_path, max_chunk_tokens=512):
    """Pair problem statements with editorials and return tagged text chunks.

    The ``.txt`` files of both directories are loaded with
    ``load_text_files_from_directory`` and paired **by position**: the i-th
    problem goes with the i-th editorial. Each pair is rendered as
    ``"Problem: ...\\n\\nEditorial: ..."``, split with
    ``split_text_for_gpt2_rag``, and every chunk is prefixed with a
    ``[Doc <n>, Chunk <m>]`` header (both numbers 1-based).

    If the two directories contain a different number of files, the surplus
    files cannot be paired. They are still skipped, as before, but a warning is
    now emitted so the mismatch does not go unnoticed.

    Args:
        problem_statements_path: Directory holding the problem statement files.
        editorials_path: Directory holding the editorial files.
        max_chunk_tokens: Token limit forwarded to ``split_text_for_gpt2_rag``.

    Returns:
        list[str]: All tagged chunks, ordered by document and then by chunk.
    """
    import warnings  # local import keeps this block self-contained

    problems = load_text_files_from_directory(problem_statements_path)
    editorials = load_text_files_from_directory(editorials_path)

    if len(problems) != len(editorials):
        paired = min(len(problems), len(editorials))
        warnings.warn(
            f"Found {len(problems)} problem statement(s) but {len(editorials)} "
            f"editorial(s); only the first {paired} pair(s) are combined and "
            "the remaining files are ignored.",
            stacklevel=2,
        )

    combined_chunks = []

    for doc_number, (problem, editorial) in enumerate(zip(problems, editorials), start=1):
        full_text = f"Problem: {problem}\n\nEditorial: {editorial}"
        chunks = split_text_for_gpt2_rag(full_text, max_chunk_tokens=max_chunk_tokens)

        # Tag each chunk with its source document and position within it.
        combined_chunks.extend(
            f"[Doc {doc_number}, Chunk {chunk_number}]\n{chunk}"
            for chunk_number, chunk in enumerate(chunks, start=1)
        )

    return combined_chunks


In [None]:
# Google Drive folders (Colab mount point) that hold the raw corpus.
PROBLEM_STATEMENTS_PATH = r"/content/drive/MyDrive/Problem_statement"
EDITORIALS_PATH = r"/content/drive/MyDrive/Editorials"

# Pair each problem statement with its editorial and cut the result into
# tagged chunks, using the default token limit.
documents = combine_problems_and_editorials(
    problem_statements_path=PROBLEM_STATEMENTS_PATH,
    editorials_path=EDITORIALS_PATH,
)

In [None]:
import sys
import os

# Put the CB4CP Drive folder on the import path so modules stored there can be
# imported. The membership check keeps the cell idempotent: re-running it no
# longer appends a duplicate entry each time.
CB4CP_DIR = os.path.abspath('/content/drive/MyDrive/CB4CP')
if CB4CP_DIR not in sys.path:
    sys.path.append(CB4CP_DIR)

In [None]:
# Project-local components. These imports were commented out, which left
# CodeBERTEmbedder, VectorStore, RAGRetriever and CPChatbot undefined on a
# fresh kernel (NameError on Restart & Run All).
# NOTE(review): the modules are presumably the ones in the CB4CP folder that an
# earlier cell appends to sys.path, so this cell must run after that one. If a
# class is instead redefined inline in the notebook, drop its import here.
from embeddings import CodeBERTEmbedder
from vectorstore import VectorStore
from retriever import RAGRetriever
from chatbot import CPChatbot

# System prompt passed to the chatbot (string value unchanged, only wrapped).
system_message = (
    "You are a smart chatbot which solves competitive programming problems from codeforces. "
    "You have been trained on question sets from codeforces along with their editorial. "
    "You have to provide model solutions and help me to figure out the solutions of the problems. "
    "Do not hallucinate.\n"
)

# Wire the pipeline: the retriever is given the embedder and the vector store,
# and the chatbot is given the retriever plus the system prompt.
embedder = CodeBERTEmbedder()
vectorstore = VectorStore()
retriever = RAGRetriever(embedder=embedder, vector_store=vectorstore)
chatbot = CPChatbot(retriever, system_message=system_message)



In [None]:
# Hand all tagged chunks to the vector store in a single call.
# NOTE(review): presumably add_batch embeds and indexes each chunk — confirm
# against the VectorStore implementation.
vectorstore.add_batch(documents)
# Saving the store is currently disabled.
# NOTE(review): re-enable if the index should survive a kernel restart.
# vectorstore.save()

In [None]:
# Read one question interactively. input() blocks until the user answers, and
# interrupting the cell while it waits raises KeyboardInterrupt.
user_query=input("Enter query :")
# Forward the question to the chatbot. As the last expression of the cell,
# whatever chat() returns is what the notebook displays.
chatbot.chat(user_query)


KeyboardInterrupt: Interrupted by user

After running the cell above to redefine the `VectorStore` class, please re-run the following cells to use the updated class:

- Cell `KqXLDJIplWlu` (Initializes embedder, vectorstore, and retriever)
- Cell `7VnU76-MgIbL` (Initializes chatbot)
- Cell `5Jg4MQnhc-KS` (Adds documents to the vectorstore)
- Cell `L9zedsm-dZDd` (Runs the chat query)

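Re-running these cells in the order listed should resolve the `ValueError`, because objects created before the redefinition still use the old `VectorStore` class until they are rebuilt. If a new error appears, please share the full traceback.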