# MODEL DEVELOPMENT

In [1]:
# Importing Required Libraries

# Fundamental Libraries
import os
from dotenv import load_dotenv
from PyPDF2 import PdfReader

# Langchain Libraries
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import google.generativeai as genai
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate

# Miscellaneous
import textwrap
from IPython.display import Markdown
import streamlit as st

  from .autonotebook import tqdm as notebook_tqdm


## Establishing Connection With Model

In [2]:
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)

In [3]:
# A Helper Function - Converts the input text into a Markdown blockquote format.
def to_markdown(text):
    return Markdown(textwrap.indent(text, '> ', predicate=lambda _:True))

In [4]:
model = genai.GenerativeModel("gemini-1.5-flash")
response = model.generate_content("Explain how AI works")

to_markdown(response.text)

> Artificial intelligence (AI) is a broad field encompassing many techniques, but at its core, it's about creating systems that can perform tasks that typically require human intelligence. These tasks include learning, reasoning, problem-solving, perception, and natural language understanding.  There's no single "how it works," but rather a collection of approaches, broadly categorized as follows:
> 
> **1. Machine Learning (ML):** This is the most prevalent approach currently.  Instead of explicitly programming a computer to perform a task, ML involves feeding it vast amounts of data and letting it learn patterns and relationships from that data.  The system adjusts its internal parameters (weights and biases) to improve its performance over time.  Key concepts include:
> 
> * **Supervised Learning:** The algorithm is trained on a labeled dataset – meaning the data is already tagged with the correct answers.  Examples include image classification (images labeled with the objects they contain) and spam detection (emails labeled as spam or not spam).
> * **Unsupervised Learning:** The algorithm is trained on an unlabeled dataset and tries to find patterns and structures within the data itself.  Examples include clustering (grouping similar data points together) and dimensionality reduction (reducing the number of variables while retaining important information).
> * **Reinforcement Learning:** The algorithm learns through trial and error by interacting with an environment. It receives rewards for good actions and penalties for bad actions, learning to optimize its behavior to maximize rewards.  Examples include game playing (AlphaGo) and robotics.
> 
> **2. Deep Learning (DL):** A subfield of ML that uses artificial neural networks with multiple layers (hence "deep").  These networks can learn highly complex patterns from data, making them particularly effective for tasks like image recognition, natural language processing, and speech recognition.  The "deep" refers to the many layers of interconnected nodes that process information in a hierarchical fashion, extracting increasingly abstract features from the raw data.
> 
> **3. Expert Systems:** These systems mimic the decision-making ability of a human expert in a specific domain.  They use a knowledge base of rules and facts to infer conclusions and provide recommendations. While less common now than ML/DL, they are still used in niche applications.
> 
> **4. Natural Language Processing (NLP):** Focuses on enabling computers to understand, interpret, and generate human language.  This includes tasks like machine translation, text summarization, sentiment analysis, and chatbots.  Often relies heavily on deep learning techniques.
> 
> **5. Computer Vision:**  Enables computers to "see" and interpret images and videos.  This includes object detection, image segmentation, and facial recognition.  Deep learning has revolutionized this field.
> 
> 
> **In essence, most AI systems work by:**
> 
> 1. **Data Acquisition:** Gathering large amounts of relevant data.
> 2. **Data Preprocessing:** Cleaning and preparing the data for the chosen algorithm.
> 3. **Model Selection:** Choosing an appropriate algorithm (e.g., neural network, decision tree).
> 4. **Training:** Feeding the data to the algorithm to learn patterns.
> 5. **Evaluation:** Testing the model's performance on unseen data.
> 6. **Deployment:** Using the trained model to make predictions or decisions on new data.
> 
> 
> It's important to note that AI is still under development, and many current systems are narrow or weak AI – meaning they are designed for specific tasks and lack the general intelligence of humans.  The pursuit of artificial general intelligence (AGI), a system with human-level intelligence across various domains, remains a major goal of the field.


## Data Preprocessing

In [5]:
# 1. Extracting text from PDFs
text = ""
file_path = '../dataset/Speech & Language Processing.pdf'
with open(file_path, 'rb') as pdf_file:
    pdf_reader = PdfReader(pdf_file)
    for page in pdf_reader.pages:
        text += page.extract_text()

In [6]:
# 2. Converting extracted data into data chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
text_chunks = text_splitter.split_text(text)

In [7]:
print(len(text_chunks))
text_chunks[500:505]

2040


['Language is an inherently temporal phenomenon. Spoken language is a sequence of\nacoustic events over time, and we comprehend and produce both spoken and written\nlanguage as a sequential input stream. The temporal nature of language is reﬂected\nin the metaphors we use; we talk of the ﬂow of conversations ,news feeds , and twitter\nstreams , all of which emphasize that language is a sequence that unfolds in time.\nThis temporal nature is reﬂected in some language processing algorithms. For\nexample, the Viterbi algorithm we introduced for HMM part-of-speech tagging pro-\nceeds through the input a word at a time, carrying forward information gleaned along\nthe way. But other machine learning approaches, like those we’ve studied for senti-\nment analysis or other text classiﬁcation tasks don’t have this temporal nature – they\nassume simultaneous access to all aspects of their input.\nThe feedforward networks of Chapter 7 also assumed simultaneous access, al-',
 'The feedforward netwo

In [8]:
# 3. Embedding text chunks into vector format
embeddings = GoogleGenerativeAIEmbeddings(model='models/embedding-001')
vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
vector_store.save_local('faiss_index')

In [9]:
# 4. Building the conversation chain
prompt_template = """
Answer the question as detailed as possible from the provided context and make sure to provide all the details.
If the answer is not available in the provided context just say, "Answer Not Available In Given Context", DO NOT provide the wrong answer.
Context : \n{context}?\n
Question : \n{question}\n

Answer : 
"""
model = ChatGoogleGenerativeAI(model='gemini-1.5-flash', temperature=0.3)
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain = load_qa_chain(model, chain_type='stuff', prompt=prompt)

stuff: https://python.langchain.com/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/docs/how_to/#qa-with-rag
  chain = load_qa_chain(model, chain_type='stuff', prompt=prompt)


## Output Generation

In [10]:
# 5. Processing User Input
user_question = "What is Large Language Model"

embeddings = GoogleGenerativeAIEmbeddings(model='models/embedding-001')
new_db = FAISS.load_local('faiss_index', embeddings, allow_dangerous_deserialization=True)
docs = new_db.similarity_search(user_question)

response = chain(
    {'input_documents': docs, 'question': user_question},
    return_only_outputs=True
)

print("Response:", response['output_text'])

  response = chain(


Response: Based on the provided text, a Large Language Model (LLM) is a type of pretrained language model that learns knowledge about language and the world from vast amounts of text data (hundreds of billions of words, generally scraped from the web).  These models are built using transformers and are particularly powerful because they can be used to address many Natural Language Processing (NLP) tasks by framing them as word prediction problems.  The ability to incorporate the entirety of the earlier context and generated outputs at each step is key to their power.  LLMs exhibit remarkable performance on various NLP tasks, especially those involving text generation, such as summarization, machine translation, question answering, and chatbots.  The training data used for LLMs requires careful filtering for quality and balancing across domains, and transparency regarding this data is increasingly important due to fair use concerns and government regulations.



In [11]:
docs

[Document(metadata={}, page_content='sight of large language modeling is that many practical NLP tasks can be cast as\nword prediction , and that a powerful-enough language model can solve them with\na high degree of accuracy. For example, we can cast sentiment analysis as language\nmodeling by giving a language model a context like:\nThe sentiment of the sentence ``I like Jackie Chan" is:\nand comparing the following conditional probability of the words “positive” and the10.1 • L ARGE LANGUAGE MODELS WITH TRANSFORMERS 205\nPreﬁx TextCompletion Text\nEncoderTransformerBlocksSoftmax\nlongall\nandthanksforallthe\nthe…UUUnencoder layerLanguage ModelingHeadlogits\nSo\nEi+\nEi+\nEi+\nEi+\nEi+\nEi+\nEi+…\nFigure 10.1 Left-to-right (also called autoregressive) text completion with transformer-based large language\nmodels. As each token is generated, it gets added onto the context as a preﬁx for generating the next token.\nword “negative” to see which is higher:\nP(positivejThe sentiment of th

# RAG TESTING

In [12]:
# Importing Essential Libraries

import pandas as pd
from giskard.rag import KnowledgeBase, generate_testset
from giskard.rag import evaluate
import giskard

In [13]:
# Setting Up required configurations
giskard.llm.set_llm_model("gemini/gemini-1.5-flash")
giskard.llm.set_embedding_model("gemini/text-embedding-004")
os.environ["GEMINI_API_KEY"] = GOOGLE_API_KEY

In [14]:
df = pd.DataFrame([chunk for chunk in text_chunks], columns=['text'])

# Let's reduce the testing size for faster calculation
df = df[100:250]
knowledge_base = KnowledgeBase(df)

df.head()

Unnamed: 0,text
100,1 and substitutions are not allowed. (This is ...
101,we saw it. We can do this by using dynamic pro...
102,n t e n t i o ni n t e n t i o n\ne t e n t i ...
103,D[i;j]as the edit distance between X[1::i]andY...
104,We mentioned above two versions of Levenshtein...


In [15]:
testset = generate_testset(
    knowledge_base,
    num_questions = 5,
    agent_description = "A chatbot that answers questions based on the content of a given PDF."
)

2024-12-09 03:09:46,128 pid:15454 MainThread giskard.rag  INFO     Finding topics in the knowledge base.


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


2024-12-09 03:09:59,618 pid:15454 MainThread giskard.rag  INFO     Found 3 topics in the knowledge base.


Generating questions: 100%|██████████| 5/5 [00:11<00:00,  2.37s/it]


In [16]:
testset.save("../dataset/test_set.jsonl")

testset_df = testset.to_pandas()
for index, row in enumerate(testset_df.head(1).iterrows()):
    print(f"Question {index + 1}: {row[1]['question']}")
    print(f"Reference answer: {row[1]['reference_answer']}")
    print("Reference context:")
    print(row[1]['reference_context'])
    print("******************", end="\n\n")

Question 1: What is the solution for unknown words that did not appear in the training documents?
Reference answer: The solution for such unknown words is to ignore them—remove them from the test unknown word document and not include any probability for them at all.
Reference context:
Document 211: ulary at all because they did not occur in any training document in any class? The
solution for such unknown words is to ignore them—remove them from the test unknown word
document and not include any probability for them at all.
Finally, some systems choose to completely ignore another class of words: stop
words , very frequent words like theanda. This can be done by sorting the vocabu- stop words
lary by frequency in the training set, and deﬁning the top 10–100 vocabulary entries
as stop words, or alternatively by using one of the many predeﬁned stop word lists
available online. Then each instance of these stop words is simply removed from
both training and test documents as if it had neve

In [17]:
def get_conversational_chain():
    prompt_template = """
    Answer the question as detailed as possible from the provided context and make sure to provide all the details.
    If the answer is not available in the provided context just say, "Answer Not Available In Given Context", DO NOT provide the wrong answer.
    Context : \n{context}?\n
    Question : \n{question}\n

    Answer : 
    """
    model = ChatGoogleGenerativeAI(model='gemini-1.5-pro', temperature=0.3)
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    chain = load_qa_chain(model, chain_type='stuff', prompt=prompt)
    return chain

In [18]:
def answer_fn(question, history=None):
    # If history is not needed, we can ignore it. However, if you want to implement conversation memory, this can be added.
    
    embeddings = GoogleGenerativeAIEmbeddings(model='models/embedding-001')
    new_db = FAISS.load_local('faiss_index', embeddings, allow_dangerous_deserialization=True)

    docs = new_db.similarity_search(question)
    chain = get_conversational_chain()
    
    response = chain(
        {'input_documents': docs, 'question': question},
        return_only_outputs=True
    )
    
    answer = response['output_text']
    
    return answer

In [19]:
# Generating RAG evaluation report
report = evaluate(answer_fn, testset=testset, knowledge_base=knowledge_base)

Asking questions to the agent:  80%|████████  | 4/5 [00:12<00:02,  2.90s/it]



Asking questions to the agent: 100%|██████████| 5/5 [00:17<00:00,  3.52s/it]
CorrectnessMetric evaluation: 100%|██████████| 5/5 [00:03<00:00,  1.32it/s]


In [20]:
display(report)