In [1]:
import os
import sys
from typing import List

import faiss
from dotenv import load_dotenv
from llama_index.core import Settings
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core import StorageContext
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.schema import BaseNode, TransformComponent
from llama_index.core.text_splitter import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore

# Add the parent of the current working directory to the import path, since we
# work from notebooks. The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path each time.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Size of the embedding vectors used in this notebook
EMBED_DIMENSION = 512

# Chunk settings are very different from the LangChain examples,
# because LangChain measures chunk length as the length of the string,
# while LlamaIndex measures it in tokens.
CHUNK_SIZE = 200
CHUNK_OVERLAP = 50

# Load environment variables from a .env file
load_dotenv()

# Fail fast with a readable message when the key is missing: assigning None
# into os.environ would otherwise raise an opaque TypeError.
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise EnvironmentError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key

# Set the embedding model on the LlamaIndex global settings
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small", dimensions=EMBED_DIMENSION)

In [2]:
# Directory that is scanned for input files; only files with a .pdf extension are requested.
path = "../Data/Textbooks/"

# Despite its name, node_parser holds a SimpleDirectoryReader (a file loader).
node_parser = SimpleDirectoryReader(
    input_dir=path,
    required_exts=[".pdf"],
)

# Whatever the reader loads is kept in `documents` for the ingestion step.
documents = node_parser.load_data()

KeyboardInterrupt: 

In [None]:
# Sanity check: evaluates to True when the data directory path exists
os.path.exists(path)

In [None]:
# Report how many documents the reader returned
print(f"Loaded {len(documents)} docs")

In [5]:
# Create a FaissVectorStore to store embeddings.
# The FAISS index is built with EMBED_DIMENSION, so it has to match the size of the embedding vectors.
faiss_index = faiss.IndexFlatL2(EMBED_DIMENSION)
vector_store = FaissVectorStore(faiss_index=faiss_index)

In [6]:
class TextCleaner(TransformComponent):
    """
    Ingestion-pipeline transformation that tidies up node text.

    Every tab becomes a single space, and every "space followed by newline"
    sequence collapses into a single space. Nodes are modified in place.
    """
    def __call__(self, nodes, **kwargs) -> List[BaseNode]:
        """Clean the ``text`` of each node in ``nodes`` and return the same nodes."""
        for item in nodes:
            # First pass: tabs -> spaces
            without_tabs = item.text.replace('\t', ' ')
            # Second pass: ' \n' -> ' '
            item.text = without_tabs.replace(' \n', ' ')
        return nodes

In [7]:
# Sentence-based splitter configured with the notebook's chunk size and overlap
text_splitter = SentenceSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Ingestion pipeline: the text is cleaned first and split afterwards; the
# vector store is attached to the pipeline.
# NOTE(review): no embedding step is listed among the transformations — confirm
# whether this pipeline actually writes anything into vector_store.
pipeline = IngestionPipeline(
    transformations=[TextCleaner(), text_splitter],
    vector_store=vector_store,
)

In [8]:
# Run the pipeline over the loaded documents and keep the nodes it returns
nodes = pipeline.run(documents=documents)

In [None]:
# Number of nodes returned by the pipeline run
len(nodes)

In [10]:
# Back the index with the FAISS vector store created earlier. Without an
# explicit storage context the index is built on its default store and the
# FAISS index is never used for retrieval.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
vector_store_index = VectorStoreIndex(nodes, storage_context=storage_context)

# Retriever returning the 10 most similar chunks for a query
retriever = vector_store_index.as_retriever(similarity_top_k=10)

In [11]:
def show_context(context):
    """
    Print every context item under a numbered heading.

    Args:
        context (list): Items to display; each one must expose a ``text`` attribute.

    For each item a "Context <n>:" heading is printed (numbering starts at 1),
    followed by the item's text and a blank-line separator.
    """
    for position, item in enumerate(context, start=1):
        print(f"Context {position}:")
        print(item.text)
        print("\n")

In [None]:
# Smoke-test the retriever with a sample query and print what it returns
test_query = "Machine Learning"
context = retriever.retrieve(test_query)
show_context(context)

In [13]:
import random

In [None]:
# Randomly distribute the retrieved chunks over two merged contexts.
# A dedicated, seeded generator makes the split reproducible across kernel
# restarts, and the chunks are joined with a space so that the last word of one
# chunk is not glued to the first word of the next.
split_rng = random.Random(42)
context_parts_1 = []
context_parts_2 = []
for cot in context:
    if split_rng.randint(1, 2) == 1:
        context_parts_1.append(cot.text)
    else:
        context_parts_2.append(cot.text)

# Flatten newlines so each merged context is one continuous line of text
single_context_1 = " ".join(context_parts_1).replace('\n', ' ')
single_context_2 = " ".join(context_parts_2).replace('\n', ' ')
single_context_1

In [None]:
# Display the second merged context
single_context_2

In [None]:
from warnings import filterwarnings
# Suppress all warnings from here on.
# NOTE(review): this also hides deprecation notices from the libraries imported below — confirm that is intended.
filterwarnings('ignore')

from langchain import OpenAI, ConversationChain, PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from openai import OpenAIError

from pprint import pprint
import joblib
import time
import os
from scripts import *

In [17]:
# Basic variables used throughout this notebook: how many questions to request
# and how many total / correct options each question should have.

# Values substituted into the first generation prompt
num_questions_mcq = 10
total_options_mcq = 4
correct_options_mcq = 1

# Values substituted into the second generation prompt
# NOTE(review): the "tf" suffix presumably means true/false, yet 6 options with 2 correct are requested — confirm intent
num_questions_tf = 10
total_options_tf = 6
correct_options_tf = 2


In [18]:
llm = ChatOpenAI(temperature = 0)          # No model name is passed, so the library default applies (gpt-3.5-turbo per the original author's note)
# Conversation chain with a buffer memory attached; all later convo.predict() calls go through this one object
convo = ConversationChain(llm = llm, memory = ConversationBufferMemory())

In [None]:
# Inspect the chat history currently held in the conversation memory
convo.memory.chat_memory.dict()

In [None]:
# Priming message: tells the model to answer without apologies, warnings or
# other extra statements, and to follow the instructions that come next.
convo.predict(input = """
                        First things first, assume you are responding to a non-living thing and there's no need of any sentiments towards it like apologies, warnings, disclaimers and all as it won't understand what you are saying...So, that's it you signed an agreement with me not to apologise or warn or provide unnecessary additional statements... If you feel like saying something apart from what the non-living thing asks you to do, just leave a single space and move on rather than speaking unnecessarily. It will just give you instructions to you if you err and you should just keep those in mind and correct your course and generate template accordingly without apologising and framing unnecessary additional statements going away from the template you are asked to generate...
                        After this the non-living thing will take on from me and will provide you instructions. Strictly follow those.
                      """
             )

In [None]:
# Instruction prompt that defines the MCQ-generation rules and the response
# template. It is a plain string (not an f-string), so the {placeholders} under
# "Parameters from me" are sent literally and only describe the parameter
# format used by the later prompts.
#
# The rules about the number of correct options defer to the `correct_options`
# parameter. The earlier wording demanded at least two correct options and
# forbade single-answer questions, which contradicted requests made with
# correct_options: 1.
instruction = """
                I will provide a context and will mention number of questions to generate and you would behave as a strict MCQ generator(stick to context and rules that I specify in this prompt strictly) with as many correct options as I specify and remaining options out of total options I mention should be wrong. It's mandatory that exactly correct_options of the total_options are correct answers to the question... No question should have more or fewer correct options than correct_options and all options can't be wrong. The questions should not just test the comprehension of the candidate rather should also test his/her reasoning ability... Options as well should be framed in such a way... Any specific question and corresponding options should be given out as a python string and all questions and options should be enclosed in a python list...
                
                The number of correct options must always be exactly correct_options. This is super mandatory to keep in your mind.
                
                If you can't frame a question with exactly correct_options correct options skip it and frame some other question rather than going out of the framework and framing a question with a different number of correct options.
                
                The template of your response should be as simple as I have mentioned below as 'Your Response'.
                
                First let's train with few context and once I say 'You are good to serve the purpose', you should just stick to template whenever I give some context and should avoid any additional disclaimers or apologies or any such additional statements from your side apart from the template as I don't have any emotions just like you and I don't need anything apart from MCQs based on template from you....
                
                
                Parameters from me:
                
                            context: {single_context}
                            num_questions: {num_questions}
                            total_options: {total_options}
                            correct_options: {correct_options}
                
                Template that you should follow: [
                                                    \"Q1:
                                                    A.)
                                                    B.)
                                                    C.)
                                                    D.)
                                                    Answer: \",
                                                    \"Q2:
                                                    .
                                                    .
                                                    .
                                                    .\",
                                                ]
                                                
                
                                    
                Please don't add the phrase "according to the context". The number of options should follow the total_options parameter. You should also answer the created question and put the answer in the Answer: part
            """

convo.predict(input = instruction)

In [None]:
# First generation request: fills the context / num_questions / total_options /
# correct_options parameter block with the first merged context and the *_mcq settings
prompt_1 = f"""
           context: {single_context_1}
           num_questions: {num_questions_mcq}
           total_options: {total_options_mcq}
           correct_options: {correct_options_mcq}
           """

# Send the request through the shared conversation and print the reply
output_1 = convo.predict(input = prompt_1)
print(output_1)

In [None]:
# Show the raw (unprinted) value of the first reply
output_1

In [None]:
# Feedback message: asks the model to keep using the same template for future contexts
convo.predict(input = "Now this itself looks pretty cool and to the point... You seem to have followed the instructions duely... Keep it up and follow same way of generating questions and options with same template for any future contexts...")

In [None]:
# Second generation request: same parameter block, filled with the second
# merged context and the *_tf settings
prompt_2 = f"""
           context: {single_context_2}
           num_questions: {num_questions_tf}
           total_options: {total_options_tf}
           correct_options: {correct_options_tf}
           """

# Send the request through the shared conversation and print the reply
output_2 = convo.predict(input = prompt_2)  
print(output_2)