# Custom Chatbot - Jupyter Notebook Structure

### This notebook demonstrates a structured approach to building a custom chatbot using OpenAI's embeddings and completion models, designed for interactive execution within a Jupyter environment.

### Setup and Imports

### First, we'll import all necessary libraries. It's good practice to put all imports at the top of your notebook.

In [1]:
import pandas as pd
import tiktoken
import openai
from openai.embeddings_utils import get_embedding, distances_from_embeddings
import numpy as np
import os

ModuleNotFoundError: No module named 'openai.embeddings_utils'

## Chatbot Class Definition
### We define the `Chatbot` class, encapsulating all the core logic for data loading, embedding generation, context retrieval, and answer generation. 
### This makes the code modular and reusable.

In [None]:
class Chatbot:
    """
    A retrieval-augmented chatbot built on OpenAI embeddings and completions.

    Workflow: ``load_and_process_data`` embeds every text row of a CSV file,
    ``get_relevant_context`` ranks those rows against a question and packs the
    closest ones into a token budget, and ``generate_answer`` sends the
    resulting prompt to a completion model. ``ask`` chains the last two steps.

    NOTE(review): the class talks to the module-level ``openai.Embedding`` /
    ``openai.Completion`` objects and sets ``openai.api_base``; this appears to
    be the legacy (pre-1.0) ``openai`` interface -- confirm the installed version.
    """

    # Separator placed between the context snippets inside the prompt.
    CONTEXT_SEPARATOR = "\n\n###\n\n"
    # Texts sent per embedding request; the endpoint limits how many inputs a
    # single call may carry, so large files are embedded in several requests.
    EMBEDDING_BATCH_SIZE = 100

    def __init__(self, api_base: str, api_key: str, embedding_model: str = "text-embedding-ada-002"):
        """
        Initializes the Chatbot with OpenAI API credentials and models.

        The credentials are written to the module-level ``openai`` settings, so
        they apply to every OpenAI call made in the process, not only to this
        instance.

        Args:
            api_base (str): The base URL for the OpenAI API.
            api_key (str): The API key for OpenAI.
            embedding_model (str): The name of the embedding model to use.
        """
        openai.api_base = api_base
        openai.api_key = api_key
        self.embedding_model = embedding_model
        self.tokenizer = tiktoken.get_encoding("cl100k_base")
        self.data_df = None # DataFrame to store text and embeddings
        self.prompt_template = """
Answer the question based on the context below, and if the question can't be answered, say "I don't know"

Context:

{}

---

Question: {}
Answer:
"""
        # Upper bound, in tokens, for template + question + selected context.
        self.max_token_count = 2000

    def load_and_process_data(self, file_path: str, text_column: str, num_rows: int = None) -> bool:
        """
        Loads data from a CSV file, extracts text, and generates embeddings.

        Missing and blank entries of the text column are skipped and the
        remaining values are converted to ``str`` before being embedded.
        ``self.data_df`` (columns ``text`` and ``embeddings``) is replaced only
        after every embedding request has succeeded, so a failed load never
        leaves a half-built knowledge base behind.

        Args:
            file_path (str): The path to the CSV file.
            text_column (str): The name of the column containing the text reviews.
            num_rows (int, optional): Number of rows to read from the CSV. Defaults to None (all rows).

        Returns:
            bool: True if data was loaded and processed successfully, False otherwise.
        """
        try:
            if not os.path.exists(file_path):
                print(f"Error: File not found at {file_path}")
                return False

            df = pd.read_csv(file_path, nrows=num_rows)
            if text_column not in df.columns:
                print(f"Error: Text column '{text_column}' not found in the DataFrame.")
                return False

            # NaN or empty inputs cannot be embedded or tokenized; drop them up front.
            cleaned = df[text_column].dropna().astype(str)
            texts = cleaned[cleaned.str.strip() != ""].tolist()
            if not texts:
                print(f"Error: Text column '{text_column}' contains no usable text.")
                return False

            print(f"Generating embeddings for {len(texts)} texts...")
            embeddings = []
            for start in range(0, len(texts), self.EMBEDDING_BATCH_SIZE):
                batch = texts[start:start + self.EMBEDDING_BATCH_SIZE]
                response = openai.Embedding.create(
                    input=batch,
                    model=self.embedding_model
                )
                embeddings.extend(item["embedding"] for item in response["data"])

            # The constructor raises if the API returned fewer embeddings than
            # texts, which the handler below reports as a failed load.
            self.data_df = pd.DataFrame({"text": texts, "embeddings": embeddings})
            print("Embeddings generated successfully.")
            return True
        except Exception as e:
            print(f"Error loading or processing data: {e}")
            return False

    @staticmethod
    def _cosine_distances(query_embedding, embeddings) -> list:
        """
        Returns the cosine distance (1 - cosine similarity) between one query
        vector and each row of ``embeddings``, as a list of floats.

        A zero-length vector yields NaN for its entry.
        """
        query = np.asarray(query_embedding, dtype=float)
        matrix = np.asarray(embeddings, dtype=float)
        norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
        with np.errstate(divide="ignore", invalid="ignore"):
            similarities = (matrix @ query) / norms
        return (1.0 - similarities).tolist()

    def _select_texts_within_budget(self, ranked_texts, question: str) -> list:
        """
        Returns the longest prefix of ``ranked_texts`` that fits the token budget.

        The budget (``self.max_token_count``) is charged for the prompt
        template, the question, every selected text and the separators that
        will be inserted between the texts.
        """
        encode = self.tokenizer.encode
        used_tokens = len(encode(self.prompt_template)) + len(encode(question))
        separator_tokens = len(encode(self.CONTEXT_SEPARATOR))

        selected = []
        for text in ranked_texts:
            # A separator is only inserted between snippets, never before the first.
            cost = len(encode(text)) + (separator_tokens if selected else 0)
            if used_tokens + cost > self.max_token_count:
                # Stop at the first snippet that does not fit, so the context
                # is always an unbroken prefix of the ranking.
                break
            selected.append(text)
            used_tokens += cost
        return selected

    def get_relevant_context(self, question: str) -> str:
        """
        Finds the most relevant context from the loaded data based on the question's embedding.

        The question is embedded with ``self.embedding_model``, every stored
        text is ranked by cosine distance to it (the distances are also written
        to the ``distances`` column of ``self.data_df``), and the closest texts
        are packed into the token budget.

        Args:
            question (str): The user's question.

        Returns:
            str: The selected texts joined by ``CONTEXT_SEPARATOR``
            ("\\n\\n###\\n\\n"), or an empty string when no data is loaded,
            nothing fits the budget, or an error occurs.
        """
        if self.data_df is None or self.data_df.empty:
            print("Error: No data loaded. Please call load_and_process_data first.")
            return ""

        try:
            # Newlines are flattened to spaces before embedding, matching the
            # preprocessing of openai's `get_embedding` helper.
            response = openai.Embedding.create(
                input=[question.replace("\n", " ")],
                model=self.embedding_model
            )
            question_embedding = response["data"][0]["embedding"]

            self.data_df["distances"] = self._cosine_distances(
                question_embedding, self.data_df["embeddings"].tolist()
            )
            sorted_df = self.data_df.sort_values(by=["distances"], ascending=True)

            context_texts = self._select_texts_within_budget(sorted_df["text"].values, question)
            return self.CONTEXT_SEPARATOR.join(context_texts)
        except Exception as e:
            print(f"Error getting relevant context: {e}")
            return ""

    def generate_answer(self, question: str, context: str, model: str = "gpt-3.5-turbo-instruct",
                        max_tokens: int = 150) -> str:
        """
        Generates an answer to the question using the OpenAI completion model and provided context.

        Args:
            question (str): The user's question.
            context (str): The relevant context retrieved for the question.
            model (str): The OpenAI completion model to use.
            max_tokens (int): Maximum number of tokens the model may generate.
                Passed explicitly because the API's own default is very small
                and cuts answers off mid-sentence.

        Returns:
            str: The generated answer, or a fixed fallback message when the
            context is empty or the request fails.
        """
        if not context:
            return "I don't have enough information to answer that question."

        full_prompt = self.prompt_template.format(context, question)
        try:
            response = openai.Completion.create(model=model, prompt=full_prompt, max_tokens=max_tokens)
            return response["choices"][0]["text"].strip()
        except Exception as e:
            print(f"Error generating answer: {e}")
            return "I encountered an error while trying to answer your question."

    def ask(self, question: str) -> str:
        """
        High-level method to ask a question to the chatbot.

        Prints the question, retrieves context, generates the answer and
        prints it. The fallback used when no context is available is printed
        as well, so every call produces visible output.

        Args:
            question (str): The user's question.

        Returns:
            str: The chatbot's answer.
        """
        print(f"\nUser Question: {question}")
        context = self.get_relevant_context(question)
        if context:
            answer = self.generate_answer(question, context)
        else:
            answer = "I couldn't find any relevant information in my knowledge base."
        print(f"Chatbot Answer: {answer}")
        return answer

## Configuration and Initialization
### Here, we define the API endpoint, the API key, and the data file settings. It's crucial to replace the placeholder value of `API_KEY` (`"voc-###################"`) with your actual key.

In [None]:
# Configuration
API_BASE = "https://openai.vocareum.com/v1"
# Prefer a key supplied through the OPENAI_API_KEY environment variable so the
# secret never has to be saved inside the notebook. The literal below is only a
# placeholder fallback: replace it with your actual key if you do not use the
# environment variable.
API_KEY = os.environ.get("OPENAI_API_KEY", "voc-###################")
DATA_FILE = "data/books_rating.csv"  # CSV file holding the reviews
TEXT_COLUMN = "review/text"  # Column of DATA_FILE that contains the review text
NUM_ROWS_TO_LOAD = 100 # For demonstration, load only 100 rows

# Initialize the chatbot (uses the default embedding model of Chatbot)
chatbot = Chatbot(api_base=API_BASE, api_key=API_KEY)

### Load and Process Data

### This cell will load your data and generate embeddings. This step might take some time depending on the size of your dataset.

In [None]:
# Build the knowledge base, then report whether the chatbot is usable.
data_ready = chatbot.load_and_process_data(DATA_FILE, TEXT_COLUMN, NUM_ROWS_TO_LOAD)
status_message = (
    "\nChatbot is ready to answer questions!"
    if data_ready
    else "Chatbot could not be initialized due to data loading issues. Please check the file path and column name."
)
print(status_message)

## Ask Questions
### Now you can interact with the chatbot by asking questions. Each chatbot.ask() call will retrieve relevant context and generate an answer.

In [None]:
# Example questions, asked in the order listed.
example_questions = [
    "Was the review positive or negative?",
    "How many reviews are regarding Dr. Seuss?",
    "What is the general sentiment about the books?",
    "Tell me about the most common themes in the reviews.",
    # This might lead to "I don't know" if context doesn't contain ratings
    "What is the average rating of the books?",
]

print("\n--- Asking Questions ---")
*leading_questions, final_question = example_questions
for example_question in leading_questions:
    chatbot.ask(example_question)
# Kept as a bare trailing expression so the cell ends with the call itself,
# exactly as before (a notebook echoes the value of a cell's last expression).
chatbot.ask(final_question)