In [None]:
!pip install faiss-cpu

In [None]:
import pandas as pd
import openai
from transformers import AutoTokenizer, AutoModel
import faiss
import numpy as np
import os
import torch
from google.colab import userdata

### Setup

In [None]:
# Name of the LLM backend for this run; it is also used as the prefix of the
# result columns written by process_questions.
llm_choice = "gpt"

# Read the OpenAI key from the Colab secret store once and reuse it for both the
# module-level setting and the explicit client object.
openai.api_key = userdata.get("OPENAI_API_KEY")
client = openai.Client(api_key=openai.api_key)

# Set environment variable to prevent runtime issues
# (presumably the duplicate-OpenMP-runtime abort -- confirm).
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# Tokenizer and encoder used to embed both the knowledge base and the queries.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)

# Run the encoder on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
print(f"Using device: {device}")

### Load knowledge data

In [None]:
# Load knowledge database
# Read the knowledge base and keep only rows that have both an ID and a Content
# value; the text is lower-cased before it is embedded and retrieved.
rag_db = pd.read_csv("./dataset/kd.csv")
rag_data = (
    rag_db[['ID', 'Content']]
    .dropna()
    .assign(Content=lambda frame: frame['Content'].str.lower())
)

# Parallel lists: position i of `documents` belongs to position i of `doc_ids`.
doc_ids = rag_data['ID'].tolist()
documents = rag_data['Content'].tolist()

### Embed knowledge text

In [None]:


def embed(documents, batch_size=20):
    """Generate embeddings in batches to prevent memory overflow.

    Args:
        documents (list of str): Texts to embed.
        batch_size (int): Number of texts tokenized and encoded per forward pass.

    Returns:
        numpy.ndarray: One row per input text, in input order, holding the
        last-layer hidden state of the first token of each sequence. An empty
        input yields an array of shape ``(0, hidden_size)`` instead of raising.
    """
    total = len(documents)
    if total == 0:
        # np.vstack([]) raises on an empty list; return a well-formed empty matrix.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    all_embeddings = []
    for i in range(0, total, batch_size):
        print(f"embedding {i} / {total}")
        batch = documents[i:i + batch_size]
        # Pad only to the longest text in this batch; cut anything past 512 tokens.
        inputs = tokenizer(batch, padding="longest", truncation=True, max_length=512, return_tensors="pt")
        # Move every input tensor to the device the model lives on.
        inputs = {key: value.to(device) for key, value in inputs.items()}
        # Inference only: no gradients are needed, which saves memory.
        with torch.no_grad():
            outputs = model(**inputs)
            # NOTE(review): this keeps the first-token vector. sentence-transformers
            # checkpoints are commonly used with mean pooling instead -- confirm the
            # intent; switching requires regenerating any cached embeddings.
            embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        all_embeddings.append(embeddings)
    return np.vstack(all_embeddings)

# Define the path for saving/loading embeddings
embeddings_path = './dataset/doc_embeddings.npy'

doc_embeddings = None
if os.path.exists(embeddings_path):
    print("Loading embeddings from file...")
    # Memory-map the file so it is not read into RAM up front.
    cached_embeddings = np.load(embeddings_path, mmap_mode='r')
    # The cache is only usable if it has exactly one row per current document;
    # otherwise FAISS row numbers would point at the wrong (or missing) entries
    # of `documents` / `doc_ids`.
    if cached_embeddings.ndim == 2 and cached_embeddings.shape[0] == len(documents):
        doc_embeddings = cached_embeddings
    else:
        print("Cached embeddings do not match the knowledge base; regenerating...")
    del cached_embeddings

if doc_embeddings is None:
    print("Generating embeddings in batches...")
    doc_embeddings = embed(documents)
    np.save(embeddings_path, doc_embeddings)  # Save embeddings to disk

# FAISS expects C-contiguous float32 input; this is a no-op when already satisfied.
doc_embeddings = np.ascontiguousarray(doc_embeddings, dtype=np.float32)

# Build the FAISS index using L2 distance
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(doc_embeddings)
print("FAISS indexing complete!")

### Batch Inference

In [None]:
# Retrieve the nearest knowledge-base documents for a batch of queries
def retrieve_documents(queries, k=3):
    """
    Retrieves the nearest knowledge-base documents for a batch of queries.

    Args:
        queries (list of str): A list of query strings.
        k (int): The number of documents to retrieve per query.

    Returns:
        list of dict: A list where each element corresponds to a query and contains
                      the keys 'ids' and 'contents' for the retrieved documents.
                      A query may get fewer than k documents when the index holds
                      fewer than k entries. An empty `queries` returns [].
    """
    if len(queries) == 0:
        return []

    # Queries are normalized like the documents (lower-cased) and embedded in one call.
    normalized_queries = [q.strip().lower() for q in queries]
    query_embeddings = np.ascontiguousarray(embed(normalized_queries), dtype=np.float32)

    # Perform batch search in the index
    _, indices = index.search(query_embeddings, k)

    results = []
    for query_indices in indices:
        # FAISS pads missing neighbours with -1; without this filter a -1 would
        # silently select the LAST document through Python's negative indexing.
        hits = [int(i) for i in query_indices if i >= 0]
        results.append({
            "ids": [doc_ids[i] for i in hits],
            "contents": [documents[i] for i in hits],
        })
    return results

# Crude length cap: returns at most the first `max_chars` characters of `text`.
def truncate_text(text, max_chars=4096):
    if len(text) <= max_chars:
        return text
    return text[:max_chars]


def generate_responses(queries, contexts, model_name="gpt-3.5-turbo", max_tokens=4090, temperature=0):
    """
    Generates one chat-completion response per (query, context) pair.

    Requests are sent one at a time through the module-level `client`. A request
    that fails is reported on stdout and recorded as the placeholder string
    "Error occurred." so the output stays aligned with the input.

    Args:
        queries (list of str): A list of query strings.
        contexts (list of str): A list of context strings corresponding to each query.
        model_name (str): Chat model to call.
        max_tokens (int): Upper bound on generated tokens per response.
        temperature (float): Sampling temperature passed to the API.

    Returns:
        list: A list of generated responses, one per query, in input order.

    Raises:
        ValueError: If `queries` and `contexts` differ in length.
    """
    if len(queries) != len(contexts):
        # zip() would silently drop the unmatched tail and return too few responses.
        raise ValueError(
            f"queries and contexts must have the same length "
            f"(got {len(queries)} and {len(contexts)})"
        )

    system_prompt = (
        "You are recognized as a Kubernetes and NGINX ingress expert. Before providing an answer, validate the provided context for "
        "errors, deprecated features, or potential conflicts. Always adhere to the latest Kubernetes and NGINX standards. "
        "Identify and clearly explain any assumptions made based on the context, and provide necessary corrections or enhancements."
    )

    responses = []
    for query, context in zip(queries, contexts):
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": (
                f"Given the following detailed context and choose what you think fit information for question:\n{context}\nCan you provide a validated and comprehensive response to this query:\n{query}\n"
                "Your response should:\n"
                "1. Include YAML configurations with accurate and effective annotations tailored to address the query.\n"
                "2. Explain the rationale behind each configuration and validate them against the provided context and current best practices.\n"
                "3. Highlight and discuss any potential issues or critical assumptions that could affect the implementation.\n"
                "4. Offer detailed debugging steps and troubleshooting advice to verify and refine the solution."
            )},
        ]
        try:
            # Chat Completions call in the openai>=1.0 client style.
            response = client.chat.completions.create(
                model=model_name,
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature,
            )
            responses.append(response.choices[0].message.content)
        except Exception as e:
            # Deliberate best-effort: one failed request must not abort the batch.
            print(f"Error: {e}")
            responses.append("Error occurred.")

    return responses


def process_questions(file_path, batch_size=40):
    """
    Answers every still-unanswered question in a CSV file and writes the results back.

    The CSV must have a "Question Body" column. Three columns prefixed with
    `llm_choice` are created if missing: the joined retrieved contexts, the
    generated response and the retrieved document IDs. A row counts as
    unanswered when its response cell is empty/NaN or holds the
    "Error occurred." placeholder, so re-running the function resumes where a
    previous run stopped and retries failed requests.

    Args:
        file_path (str): Path of the CSV to read and overwrite.
        batch_size (int): Number of questions retrieved and answered per batch.

    Returns:
        None. The file at `file_path` is rewritten after every batch and once
        more at the end.
    """
    # Load CSV file into a Pandas DataFrame
    df = pd.read_csv(file_path, encoding="utf-8")

    context_col_name = f"{llm_choice}_Top_3_Contexts"
    response_col_name = f"{llm_choice}_Generated_Response"
    context_id_col_name = f"{llm_choice}_Context_IDs"

    for col in (context_col_name, response_col_name, context_id_col_name):
        if col not in df.columns:
            df[col] = ""  # Initialize with empty strings
        # An all-empty column comes back from read_csv as float NaN; object
        # dtype lets strings be assigned into it later.
        df[col] = df[col].astype(object)

    # read_csv turns empty cells into NaN, so comparing against "" alone would
    # miss every unanswered row of a previously saved file.
    existing = df[response_col_name]
    unanswered_mask = (
        existing.isna()
        | (existing.astype(str).str.strip() == "")
        | (existing == "Error occurred.")  # failed requests are retried
    )
    unanswered_df = df[unanswered_mask]

    # Process questions in batches
    for start in range(0, len(unanswered_df), batch_size):
        batch = unanswered_df.iloc[start : start + batch_size]

        # Missing question bodies become empty strings instead of float NaN.
        queries = batch["Question Body"].fillna("").astype(str).tolist()

        # Retrieve context in batch (each result is a dict with 'ids' and 'contents')
        results = retrieve_documents(queries, k=3)
        # Cap each document with truncate_text (character-based) and join them.
        contexts = [
            " ".join(truncate_text(doc) for doc in result["contents"])
            for result in results
        ]
        # Create context_id strings (concatenated document IDs)
        context_ids = [", ".join(str(doc_id) for doc_id in result["ids"]) for result in results]

        # Generate responses in batch
        responses = generate_responses(queries, contexts)

        # Update the DataFrame with responses
        df.loc[batch.index, response_col_name] = responses
        df.loc[batch.index, context_col_name] = contexts
        df.loc[batch.index, context_id_col_name] = context_ids

        # Checkpoint after every batch so an interrupted run keeps its progress.
        df.to_csv(file_path, index=False, encoding="utf-8")
        print(f"Processed {start + len(batch)} / {len(unanswered_df)} questions")

    # Save the updated CSV file with responses (also covers the no-work case)
    df.to_csv(file_path, index=False, encoding="utf-8")
    print(f"All questions processed and saved back to {file_path}")

# Run batch processing
# NOTE: process_questions reads this CSV and writes its results back to the same path.
file_path = "./dataset/test.csv"  # Replace with your CSV file path
process_questions(file_path)
