In [None]:
!pip install faiss-cpu

In [26]:
import pandas as pd
import openai
from transformers import AutoTokenizer, AutoModel
import faiss
import numpy as np
import os
from google.colab import userdata

### Setup

In [None]:
# Tag that prefixes the per-LLM output columns written by process_questions
llm_choice = "gpt"

# Read the key from the Colab secret store once and reuse it for both the
# module-level setting and the explicit client object.
_api_key = userdata.get("OPENAI_API_KEY")
openai.api_key = _api_key
client = openai.Client(api_key=_api_key)

# Set environment variable to prevent runtime issues
# NOTE(review): KMP_DUPLICATE_LIB_OK is an OpenMP runtime switch; presumably it
# is here to tolerate faiss and torch each loading their own OpenMP runtime.
# It is set after the imports above, so confirm that is early enough.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# Tokenizer and encoder are loaded from the same checkpoint and used by embed()
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)

### Load knowledge data

In [6]:
# Read the knowledge base CSV into a DataFrame; later cells use its
# 'ID' and 'Concept' columns.
rag_db = pd.read_csv(filepath_or_buffer="./dataset/kd.csv")

### Embed knowledge text

In [None]:
# Keep only the identifier and text columns, drop rows missing either one,
# and lowercase the text (queries are lowercased the same way at search time).
rag_data = (
    rag_db[['ID', 'Concept']]
    .dropna()
    .assign(Concept=lambda frame: frame['Concept'].str.lower())
)

# Plain list of the knowledge texts; positions in this list are what the
# FAISS search results are later used to index into.
documents = rag_data['Concept'].tolist()
def embed(documents, batch_size=64):
    """
    Embed texts with the module-level `tokenizer` and `model`.

    Each text is represented by the hidden state of its first token
    (`last_hidden_state[:, 0, :]`). Inference runs under `torch.no_grad()` and
    in mini-batches, so memory use does not grow with the size of the corpus.

    NOTE(review): the sentence-transformers model card for this checkpoint
    appears to recommend attention-masked mean pooling rather than the first
    token. The pooling is left unchanged here because any embeddings already
    cached on disk were produced with it; confirm which is intended.

    Args:
        documents (list of str | str): Texts to embed. A single string is
            treated as a one-element list.
        batch_size (int): Maximum number of texts per forward pass.

    Returns:
        numpy.ndarray: Array of shape (number of texts, hidden size). For
        empty input the shape is (0, model.config.hidden_size).

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    import torch  # local import: torch is not imported at the top of the notebook

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Slicing a bare string would split it into characters, so wrap it first.
    if isinstance(documents, str):
        documents = [documents]
    documents = list(documents)

    # The tokenizer cannot process an empty batch; return an empty matrix instead.
    if not documents:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    chunks = []
    # No gradients are needed for retrieval; skipping autograd avoids keeping
    # every intermediate activation alive.
    with torch.no_grad():
        for start in range(0, len(documents), batch_size):
            batch = documents[start:start + batch_size]
            inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
            outputs = model(**inputs)
            chunks.append(outputs.last_hidden_state[:, 0, :].detach().cpu().numpy())

    return np.concatenate(chunks, axis=0)

EMBEDDINGS_CACHE_PATH = './dataset/doc_embeddings.npy'

# Reuse cached embeddings only when they still line up with `documents`.
# A cache built from a different knowledge file would make the FAISS row ids
# point at the wrong (or non-existent) entries of `documents`.
# Only the row count is checked: edits that keep the number of rows unchanged
# are not detected, so delete the cache file after changing the knowledge data.
doc_embeddings = None
if os.path.exists(EMBEDDINGS_CACHE_PATH):
    print("Loading embeddings from file...")
    doc_embeddings = np.load(EMBEDDINGS_CACHE_PATH)
    if doc_embeddings.ndim != 2 or doc_embeddings.shape[0] != len(documents):
        print("Cached embeddings do not match the current documents; regenerating...")
        doc_embeddings = None

if doc_embeddings is None:
    print("Generating embeddings...")
    doc_embeddings = embed(documents)
    np.save(EMBEDDINGS_CACHE_PATH, doc_embeddings)

# FAISS expects a C-contiguous float32 matrix
doc_embeddings = np.ascontiguousarray(doc_embeddings, dtype=np.float32)

# Exact (brute-force) L2 index; row i of the index corresponds to documents[i]
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(doc_embeddings)

### Batch Inference

In [None]:
# Retrieve and validate documents
def retrieve_documents(queries, k=3):
    """
    Retrieve the nearest knowledge documents for a batch of queries.

    Queries are stripped and lowercased, embedded in a single `embed` call,
    and searched against the module-level FAISS `index`. The returned row ids
    are mapped back to texts through the module-level `documents` list.

    Args:
        queries (list of str): A list of query strings.
        k (int): The maximum number of documents to retrieve per query.

    Returns:
        list of list: One list of document texts per query, in the order
        returned by the index. A list holds fewer than `k` items when the
        index contains fewer than `k` vectors. An empty `queries` yields `[]`.
    """
    queries = list(queries)
    if not queries:
        # Nothing to embed or search; avoids passing an empty batch downstream.
        return []

    # Embed all queries at once to avoid redundant computations
    query_embeddings = embed([q.strip().lower() for q in queries])

    # FAISS expects a C-contiguous float32 matrix
    query_matrix = np.ascontiguousarray(query_embeddings, dtype=np.float32)
    _distances, indices = index.search(query_matrix, k)

    # FAISS pads the result with -1 when it finds fewer than k neighbours.
    # Indexing a Python list with -1 would silently return the LAST document,
    # so those placeholder ids are dropped.
    return [
        [documents[i] for i in query_indices if i >= 0]
        for query_indices in indices
    ]

def generate_responses(queries, contexts, model_name="gpt-3.5-turbo", max_tokens=4090, temperature=0):
    """
    Generate one chat-completion answer per (query, context) pair.

    Requests are sent one at a time through the module-level `client`. A
    request that raises is printed and recorded as the string
    "Error occurred.", so the returned list always has one entry per query.

    Args:
        queries (list of str): A list of query strings.
        contexts (list of str): Context strings, one per query, same order.
        model_name (str): Chat model passed to the API.
        max_tokens (int): Completion token limit passed to the API.
            NOTE(review): prompt plus completion must fit the model's context
            window, so long contexts may be rejected at this value; confirm.
        temperature (float): Sampling temperature passed to the API.

    Returns:
        list: Generated responses, aligned with `queries`.

    Raises:
        ValueError: If `queries` and `contexts` differ in length. Pairing them
            with `zip` would otherwise drop the unmatched tail silently.
    """
    queries = list(queries)
    contexts = list(contexts)
    if len(queries) != len(contexts):
        raise ValueError(
            f"queries and contexts must have the same length "
            f"(got {len(queries)} and {len(contexts)})"
        )

    system_prompt = (
        "You are recognized as a Kubernetes and NGINX ingress expert. Before providing an answer, validate the provided context for "
        "errors, deprecated features, or potential conflicts. Always adhere to the latest Kubernetes and NGINX standards. "
        "Identify and clearly explain any assumptions made based on the context, and provide necessary corrections or enhancements."
    )

    responses = []
    for query, context in zip(queries, contexts):
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": (
                f"Given the following detailed context and choose what you think fit information for question:\n{context}\nCan you provide a validated and comprehensive response to this query:\n{query}\n"
                "Your response should:\n"
                "1. Include YAML configurations with accurate and effective annotations tailored to address the query.\n"
                "2. Explain the rationale behind each configuration and validate them against the provided context and current best practices.\n"
                "3. Highlight and discuss any potential issues or critical assumptions that could affect the implementation.\n"
                "4. Offer detailed debugging steps and troubleshooting advice to verify and refine the solution."
            )},
        ]
        try:
            # Client-object call style required by openai>=1.0.0
            response = client.chat.completions.create(
                model=model_name,
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature,
            )
            responses.append(response.choices[0].message.content)
        except Exception as e:
            # Deliberate best-effort: one failed request must not abort the
            # batch, so log it and keep the output aligned with the input.
            print(f"Error: {e}")
            responses.append("Error occurred.")

    return responses

# Load CSV, process questions, and update responses
def process_questions(file_path, batch_size=10):
    """
    Answer every still-unanswered question in a CSV file and write it back.

    The file must contain a "Question Body" column. Two columns named after
    the module-level `llm_choice` ("<llm>_Top_3_Contexts" and
    "<llm>_Generated_Response") are created if missing. Rows whose response
    is empty, or is the "Error occurred." placeholder left by a failed
    request, are processed in batches through `retrieve_documents` and
    `generate_responses`. The file is rewritten after every batch so an
    interrupted run can be resumed without losing completed work.

    Args:
        file_path (str): Path of the CSV file to read and overwrite.
        batch_size (int): Number of questions handled per batch.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Load CSV file into a Pandas DataFrame
    df = pd.read_csv(file_path, encoding="utf-8")

    context_col_name = f"{llm_choice}_Top_3_Contexts"
    response_col_name = f"{llm_choice}_Generated_Response"

    for col_name in (context_col_name, response_col_name):
        if col_name not in df.columns:
            df[col_name] = ""  # Initialize with empty strings
        # read_csv turns the empty cells written by a previous run into NaN
        # (and an entirely empty column into float64). Normalise back to
        # object dtype with "" so the comparison below finds them and string
        # results can be assigned into the column.
        df[col_name] = df[col_name].astype(object).fillna("")

    def _save():
        # Write to a sibling file and swap it in, so an interruption during
        # the write cannot leave a truncated CSV behind.
        tmp_path = f"{file_path}.tmp"
        df.to_csv(tmp_path, index=False, encoding="utf-8")
        os.replace(tmp_path, file_path)

    # Unanswered = never processed, or a previous attempt failed
    error_placeholder = "Error occurred."
    unanswered_mask = df[response_col_name].isin(["", error_placeholder])
    unanswered_df = df[unanswered_mask]

    # Process questions in batches
    for start in range(0, len(unanswered_df), batch_size):
        batch = unanswered_df.iloc[start : start + batch_size]

        queries = batch["Question Body"].tolist()

        # Retrieve context in batch
        contexts = [" ".join(docs) for docs in retrieve_documents(queries, k=3)]

        # Generate responses in batch
        responses = generate_responses(queries, contexts)

        # Update the DataFrame with responses
        df.loc[batch.index, response_col_name] = responses
        df.loc[batch.index, context_col_name] = contexts

        # Checkpoint after every batch; responses are expensive to regenerate
        _save()
        print(f"Processed {start + len(batch)} / {len(unanswered_df)} questions")

    # Save the updated CSV file with responses (also covers the no-work case)
    _save()
    print(f"All questions processed and saved back to {file_path}")

# Entry point: answer the questions stored in the CSV below, ten per batch.
# The file is updated in place.
file_path = "./dataset/test.csv"  # Replace with your CSV file path
process_questions(file_path=file_path, batch_size=10)
