## Prompt

In [1]:
from typing import List
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from groq import Groq
from pyngrok import ngrok
import json
import uvicorn
import nest_asyncio

In [2]:
# Define request and response models
# Request body accepted by both MCQ-generation endpoints.
# NOTE(review): an identical PromptRequest is declared again in cell In [14];
# the later definition rebinds this name.
class PromptRequest(BaseModel):
    num_mcq: int  # how many questions to generate
    num_distractor: int  # how many distractors to request per question
    subject: str  # passed as `topic` by the prompt-only endpoint
    level: str  # difficulty label interpolated into the prompt
    trans: str  # transcript text the questions are based on
    outcomes_ids: List[str]  # ids resolved to descriptions by get_description_by_id

# One generated multiple-choice question.
# NOTE(review): documented with `#` comments rather than a class docstring —
# this model's JSON schema is embedded in the LLM system prompt via
# Response.model_json_schema(), and a docstring would presumably show up there
# as a `description`; confirm before adding one.
class MCQResponse(BaseModel):
    num_mcq: int  # presumably the question's number within the set — TODO confirm
    question: str  # question text
    answer: str  # the answer for the question
    distractors: List[str]  # the distractor options for the question

# Top-level payload: the list of generated questions. It is used as the
# `response_model` of both endpoints, its JSON schema is embedded in the LLM
# system prompt, and the LLM reply is parsed into it with model_validate_json.
class Response(BaseModel):
    mcqs_response: List[MCQResponse]

# Initialize FastAPI app
# NOTE(review): cell In [14] calls FastAPI() again and rebinds `app`; the
# @app.post decorators and uvicorn.run(app, ...) all come after that rebinding.
app = FastAPI()

In [4]:
def generate_mcqs_from_transcript(transcript: str, topic: str, num_mcqs: int, num_distractors: int, difficulty: str, descriptions: str) -> Response:
    """Generate MCQs from a transcript with a single Groq chat-completion call.

    The system message embeds the JSON schema of ``Response`` and the request
    asks for a JSON object, so the reply is validated straight into a
    ``Response``.

    Relies on a module-level ``groq`` client that is not created in the
    visible cells (presumably a ``Groq(...)`` instance — TODO confirm).

    Args:
        transcript: Text the questions must be based on.
        topic: Accepted for interface compatibility; not used in the prompt.
        num_mcqs: Number of questions requested.
        num_distractors: Number of distractors requested per question.
        difficulty: Difficulty label quoted in the prompt.
        descriptions: Outcome descriptions appended to the prompt.

    Returns:
        The ``Response`` parsed from the first choice's message content.
    """
    system_prompt = (
        "You are an MCQ generator that outputs multiple-choice questions in JSON.\n"
        f"The JSON object must use the schema: {json.dumps(Response.model_json_schema(), indent=2)}"
    )
    # Adjacent string literals are concatenated verbatim, so the space after
    # "with" is required; without it the model was sent e.g. "with3 distractors".
    user_prompt = (
        f"Generate {num_mcqs} multiple-choice questions with "
        f"{num_distractors} distractors each and difficulty level '{difficulty}', based on the following transcript:\n\n"
        f"{transcript} and apply these outcomes {descriptions}"
    )

    chat_completion = groq.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        model="llama3-70b-8192",
        temperature=0,
        stream=False,
        response_format={"type": "json_object"},
    )
    return Response.model_validate_json(chat_completion.choices[0].message.content)

In [5]:
def get_description_by_id(file_path, ids):
    """Return the descriptions of the selected outcomes as one numbered string.

    The JSON file is expected to map each category name to a list of items,
    every item carrying an ``"id"`` and a ``"description"`` key.

    Args:
        file_path: Path of the JSON file to read.
        ids: Container of ids to select (membership is tested with ``in``).

    Returns:
        The matching descriptions formatted as ``<n>- "<description>"``,
        numbered from 1 in file order (not in the order of ``ids``) and joined
        with single spaces. An empty string when nothing matches.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a scanned item lacks ``"id"`` (or a matching item lacks
            ``"description"``).
    """
    # Decode as UTF-8 explicitly: relying on the platform's locale encoding
    # can garble or reject non-ASCII descriptions.
    with open(file_path, 'r', encoding='utf-8') as file:
        data_dict = json.load(file)

    # Walk every category in file order and keep the descriptions whose id
    # was requested.
    selected = (
        item['description']
        for items in data_dict.values()
        for item in items
        if item['id'] in ids
    )

    # Join all descriptions into a single string separated by spaces
    return ' '.join(
        f"{number}- \"{description}\""
        for number, description in enumerate(selected, start=1)
    )

## RAG

In [6]:
from fastapi import FastAPI
from pydantic import BaseModel
from typing import Optional
from pyngrok import ngrok
from typing import List, Dict, Union 
import pandas as pd
import torch
from groq import Groq
import torch.nn as nn
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModelForCausalLM
# from attention_sinks import AutoModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, BitsAndBytesConfig
import numpy as np
import pandas as pd
import time
import json
import requests
import re
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
from langchain.docstore.document import Document as LangchainDocument
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.language_models.llms import LLM
from langchain_core.vectorstores import VectorStore
from ragatouille import RAGPretrainedModel

import datasets


# Never truncate column contents when a DataFrame is displayed; this will be
# helpful when visualizing retriever outputs.
pd.set_option("display.max_colwidth", None)



In [7]:
import torch

# Check the number of GPUs
num_gpus = torch.cuda.device_count()
print(f"Number of GPUs available: {num_gpus}")

# Free the cached memory and reset the peak-memory statistics of every GPU.
# The reset is done per device inside the loop: a single call after the loop
# would only affect the current device, and it would need an initialised CUDA
# context even on a host where device_count() is 0.
# reset_peak_memory_stats() replaces the deprecated
# reset_max_memory_allocated() / reset_max_memory_cached() pair.
for i in range(num_gpus):
    torch.cuda.set_device(i)
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats(i)
    print(f"Emptied cache for GPU: {i}")

if num_gpus:
    print("Reset max memory allocated and cached for all GPUs")


Number of GPUs available: 8
Emptied cache for GPU: 0
Emptied cache for GPU: 1
Emptied cache for GPU: 2
Emptied cache for GPU: 3
Emptied cache for GPU: 4
Emptied cache for GPU: 5
Emptied cache for GPU: 6
Emptied cache for GPU: 7
Reset max memory allocated and cached for all GPUs




In [8]:
# Read the knowledge base from the cleaned CSV file.
# NOTE(review): `source_column` presumably selects the column recorded as each
# document's source metadata — confirm against the CSVLoader documentation.
loader = CSVLoader(
    file_path="cleaned_dataset.csv",
    source_column="Transcript",
)
RAW_KNOWLEDGE_BASE = loader.load()

In [9]:


# Hierarchical separator list tailored for splitting Markdown documents,
# ordered from the coarsest boundary to the finest. The list is taken from
# LangChain's MarkdownTextSplitter class.
# NOTE(review): several entries are written in regex syntax; whether they are
# applied as regexes depends on how the splitter is configured — TODO confirm.
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",  # newline, one to six '#', then a space
    "```\n",  # triple backtick followed by a newline
    "\n\\*\\*\\*+\n",  # a line of three or more '*'
    "\n---+\n",  # a line of three or more '-'
    "\n___+\n",  # a line of three or more '_'
    "\n\n",  # blank line
    "\n",  # single newline
    " ",  # single space
    "",  # empty separator, listed last
]

# Character-based splitter over the Markdown separator hierarchy.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # the maximum number of characters in a chunk: we selected this value arbitrarily
    chunk_overlap=100,  # the number of characters to overlap between chunks
    add_start_index=True,  # If `True`, includes chunk's start index in metadata
    strip_whitespace=True,  # If `True`, strips whitespace from the start and end of every document
    separators=MARKDOWN_SEPARATORS,
    # Several separators ("\n#{1,6} ", "\n---+\n", ...) are regex patterns.
    # Without this flag the splitter escapes them and looks for the literal
    # text, so those boundaries would never match.
    is_separator_regex=True,
)

# Split each source document separately and collect the chunks in order.
# NOTE(review): `docs_processed` is reassigned in the next cell from
# split_documents(...), so this character-based result is replaced there.
docs_processed = []
for doc in RAW_KNOWLEDGE_BASE:
    docs_processed.extend(text_splitter.split_documents([doc]))

In [10]:

# Model id used for the chunk tokenizer (split_documents) and for the
# embedding model that feeds the vector index.
EMBEDDING_MODEL_NAME: str = "thenlper/gte-large"


def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
) -> List[LangchainDocument]:
    """
    Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.

    Args:
        chunk_size: Maximum chunk length, measured with the tokenizer below.
            The overlap between chunks is a tenth of this value.
        knowledge_base: Documents to split; each one is split on its own.
        tokenizer_name: Tokenizer used to measure chunk length. ``None`` falls
            back to ``EMBEDDING_MODEL_NAME``.

    Returns:
        The chunks in input order, keeping only the first chunk for any given
        ``page_content``.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        # The hint allows None, which from_pretrained cannot load: use the default.
        AutoTokenizer.from_pretrained(tokenizer_name or EMBEDDING_MODEL_NAME),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
        # Several separators are regex patterns; without this flag they are
        # escaped and matched as literal text, so they would never fire.
        is_separator_regex=True,
    )

    # Split and de-duplicate in one pass, preserving first-seen order.
    seen_texts = set()
    docs_processed_unique = []
    for doc in knowledge_base:
        for chunk in text_splitter.split_documents([doc]):
            if chunk.page_content not in seen_texts:
                seen_texts.add(chunk.page_content)
                docs_processed_unique.append(chunk)

    return docs_processed_unique


# Re-chunk the whole knowledge base by token count (this replaces the
# `docs_processed` built earlier). We choose a chunk size adapted to our model.
docs_processed = split_documents(
    chunk_size=512,
    knowledge_base=RAW_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)

# Let's visualize the chunk sizes we would have in tokens from a common model
from transformers import AutoTokenizer

# Token count of every chunk, measured with the embedding model's tokenizer;
# tqdm shows progress over the chunks.
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
lengths = [
    len(tokenizer.encode(chunk.page_content))
    for chunk in tqdm(docs_processed)
]
# Histogram of the lengths, left disabled (`plt` is not imported in the
# visible cells):
#fig = pd.Series(lengths).hist()
#plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
#plt.show()

  0%|          | 0/90 [00:00<?, ?it/s]

In [11]:

# Embedding model for the chunks: runs on CPU with multi-process encoding
# enabled, and returns normalized vectors.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(normalize_embeddings=True),  # set True for cosine similarity
)

# FAISS index over all processed chunks, using the cosine distance strategy.
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    documents=docs_processed,
    embedding=embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)



In [12]:
from ragatouille import RAGPretrainedModel

# Pretrained "colbert-ir/colbertv2.0" model loaded through RAGatouille; the RAG
# endpoint passes it to answer_with_rag as `reranker`, where its rerank()
# method reorders the retrieved chunks.
RERANKER = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

In [13]:
from transformers import Pipeline


def answer_with_rag(
    final_prompt: str,
    transcript: str,
    knowledge_index: FAISS,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 90,
    num_docs_final: int = 30,
    include_context: bool = False,
) -> Tuple[Response, List[str]]:
    """Retrieve passages related to the transcript, then ask the LLM for MCQs.

    Relies on a module-level ``groq`` client that is not created in the
    visible cells (presumably a ``Groq(...)`` instance — TODO confirm).

    Args:
        final_prompt: User message sent to the model.
        transcript: Query text for the similarity search and the reranker.
        knowledge_index: Vector index to search.
        reranker: Optional reranker; when given, the retrieved passages are
            reordered with ``reranker.rerank`` and cut to ``num_docs_final``.
        num_retrieved_docs: Number of passages fetched from the index.
        num_docs_final: Maximum number of passages kept.
        include_context: When True, the kept passages are appended to the
            user message. Defaults to False, which matches the previous
            behaviour: the passages were retrieved and returned but never
            sent to the model.
            NOTE(review): with the default ``num_docs_final`` the appended
            text may be too large for the model's context window — lower
            ``num_docs_final`` before enabling this. TODO confirm the limit.

    Returns:
        A tuple of the parsed ``Response`` and the texts of the kept passages.
    """
    # Gather documents with retriever
    print("=> Retrieving documents...")
    retrieved = knowledge_index.similarity_search(
        query=transcript, k=num_retrieved_docs
    )
    relevant_docs = [doc.page_content for doc in retrieved]  # keep only the text

    # Optionally rerank results
    if reranker:
        print("=> Reranking documents...")
        reranked = reranker.rerank(transcript, relevant_docs, k=num_docs_final)
        relevant_docs = [doc["content"] for doc in reranked]

    relevant_docs = relevant_docs[:num_docs_final]

    # Build the user message; the retrieved passages are only attached on request.
    user_prompt = f"{final_prompt}"
    if include_context:
        context = "\nExtracted documents:\n" + "\n".join(
            f"Document {i}:::\n{doc}" for i, doc in enumerate(relevant_docs)
        )
        user_prompt = f"{user_prompt}\n{context}"

    # Generate the answer
    print("=> Generating answer...")

    chat_completion = groq.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "You are an MCQ generator that outputs multiple-choice questions in JSON.\n"
                           f"The JSON object must use the schema: {json.dumps(Response.model_json_schema(), indent=2)}",
            },
            {
                "role": "user",
                "content": user_prompt,
            },
        ],
        model="llama3-70b-8192",
        temperature=0,
        stream=False,
        response_format={"type": "json_object"},
    )

    answer = Response.model_validate_json(chat_completion.choices[0].message.content)

    return answer, relevant_docs

## FastAPI

In [14]:
# Define request and response models
# Request body accepted by both MCQ-generation endpoints below.
# NOTE(review): this repeats the PromptRequest already declared in cell In [2];
# running this cell rebinds the name to this definition.
class PromptRequest(BaseModel):
    num_mcq: int  # how many questions to generate
    num_distractor: int  # how many distractors to request per question
    subject: str  # passed as `topic` by the prompt-only endpoint
    level: str  # difficulty label interpolated into the prompt
    trans: str  # transcript text the questions are based on
    outcomes_ids: List[str]  # ids resolved to descriptions by get_description_by_id

# One generated multiple-choice question.
# NOTE(review): repeats the MCQResponse declared in cell In [2]. Documented
# with `#` comments rather than a class docstring — this model's JSON schema is
# embedded in the LLM system prompt via Response.model_json_schema(), and a
# docstring would presumably show up there as a `description`; confirm before
# adding one.
class MCQResponse(BaseModel):
    num_mcq: int  # presumably the question's number within the set — TODO confirm
    question: str  # question text
    answer: str  # the answer for the question
    distractors: List[str]  # the distractor options for the question

# Top-level payload: the list of generated questions. Used as the
# `response_model` of both endpoints below; the LLM reply is parsed into it
# with model_validate_json.
# NOTE(review): repeats the Response declared in cell In [2]; functions defined
# earlier look the name up when called, so they use whichever definition is
# bound at that time.
class Response(BaseModel):
    mcqs_response: List[MCQResponse]

# Initialize FastAPI app
# NOTE(review): cell In [2] already created an `app`; this rebinds the name to
# a new instance. The endpoints below are registered on, and uvicorn serves,
# this instance.
app = FastAPI()

In [15]:
# Define FastAPI endpoint

@app.post("/generate-response-rag/", response_model=Response)
async def generate_response_rag(prompt_request: PromptRequest):
    """Generate MCQs for a transcript through the retrieval pipeline.

    Resolves the requested outcome ids to descriptions from ``outcomes.json``,
    builds the generation prompt and delegates to ``answer_with_rag`` with the
    module-level reranker and vector index.

    Raises:
        HTTPException: Status 500 with the error text when any step fails,
            matching the behaviour of the prompt-only endpoint.
    """
    file_path = 'outcomes.json'
    transcript = f"{prompt_request.trans}"

    try:
        descriptions = get_description_by_id(file_path, prompt_request.outcomes_ids)

        # Same text as the original triple-quoted prompt, including its
        # leading newline/indent and trailing newline/indent, so the message
        # sent to the model is unchanged.
        final_prompt = (
            "\n"
            f"    Generate {prompt_request.num_mcq} multiple-choice questions with "
            f"{prompt_request.num_distractor} distractors each and difficulty level "
            f"'{prompt_request.level}', based on the following transcript:\n\n "
            f"{transcript} and apply these outcomes {descriptions}\n"
            "    "
        )

        # Only the parsed answer is returned; the retrieved passages are not
        # part of the response model.
        answer, _ = answer_with_rag(
            final_prompt=final_prompt,
            transcript=transcript,
            reranker=RERANKER,
            knowledge_index=KNOWLEDGE_VECTOR_DATABASE,
        )
        return answer
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error generating MCQs: {str(e)}")


In [16]:
@app.post("/generate-response-prompt/", response_model=Response)
async def generate_response_prompt(request: PromptRequest):
    """Generate MCQs for a transcript with the prompt-only pipeline.

    Looks up the requested outcome descriptions in ``outcomes.json`` and hands
    everything to ``generate_mcqs_from_transcript``.

    Raises:
        HTTPException: Status 500 carrying the error text when any step fails.
    """
    try:
        outcome_text = get_description_by_id('outcomes.json', request.outcomes_ids)
        return generate_mcqs_from_transcript(
            transcript=request.trans,
            topic=request.subject,
            num_mcqs=request.num_mcq,
            num_distractors=request.num_distractor,
            difficulty=request.level,
            descriptions=outcome_text,
        )
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Error generating MCQs: {str(exc)}")



In [None]:
# Patch asyncio before starting the server (presumably so uvicorn.run() can be
# called while the notebook's own event loop is running — TODO confirm).
nest_asyncio.apply()

# Connect ngrok to get tunnel
public_url = ngrok.connect(addr=8887)
print("ngrok tunnel:", public_url)

# Serve the app on the port the tunnel points at; this call blocks the cell.
uvicorn.run(
    app,
    host="0.0.0.0",
    port=8887,
)

t=2024-06-10T14:32:54+0300 lvl=warn msg="ngrok config file found at both XDG and legacy locations, using XDG location" xdg_path=/home/ahmed-khaled-st/.config/ngrok/ngrok.yml legacy_path=/home/ahmed-khaled-st/.ngrok2/ngrok.yml
t=2024-06-10T14:32:54+0300 lvl=warn msg="can't bind default web address, trying alternatives" obj=web addr=127.0.0.1:4040
INFO:     Started server process [2663100]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8887 (Press CTRL+C to quit)


ngrok tunnel: NgrokTunnel: "https://fe67-45-240-51-111.ngrok-free.app" -> "http://localhost:8887"
=> Retrieving documents...




=> Reranking documents...


100%|█████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 13.39it/s]


=> Generating answer...
INFO:     45.240.51.111:0 - "POST /generate-response-rag/ HTTP/1.1" 200 OK
INFO:     45.240.51.111:0 - "POST /generate-response-prompt/ HTTP/1.1" 200 OK
