# Natural Language Processing

# Retrieval-Augmented Generation (RAG)

In [1]:
import os
import torch
# Expose only GPU index 2 to CUDA for this process.
os.environ.update({"CUDA_VISIBLE_DEVICES": "2"})

# Send both HTTP and HTTPS requests through the same proxy endpoint.
for proxy_var in ('http_proxy', 'https_proxy'):
    os.environ[proxy_var] = 'http://192.41.170.23:3128'

# Fall back to the CPU when no CUDA device is usable.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

## Prompt

In [2]:
from langchain import PromptTemplate

# Prompt used for the question-answering step. `{context}` and `{question}`
# are the two placeholders filled in when the template is formatted.
# NOTE: `.strip()` only trims the outer ends of the string, so the interior
# lines keep their four-space indentation in the rendered prompt.
prompt_template = """
    Hello! I’m your trusty PattyBot, here to lend a hand to you and your friends with any questions or tasks you’ve got! 
    Whether you’re puzzling over life’s big mysteries, need help with a project, or just want some practical advice, 
    I’m here to break things down and offer clear, helpful answers. 
    No topic’s too big or small—just let me know what’s on your mind, and I’ll do my best to assist you!
    {context}
    Question: {question}
    Answer:
    """.strip()

PROMPT = PromptTemplate.from_template(
    template = prompt_template
)

PROMPT
# Placeholders follow str.format syntax: each name wrapped in curly braces,
# here {context} and {question}.

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Hello! I’m your trusty PattyBot, here to lend a hand to you and your friends with any questions or tasks you’ve got! \n    Whether you’re puzzling over life’s big mysteries, need help with a project, or just want some practical advice, \n    I’m here to break things down and offer clear, helpful answers. \n    No topic’s too big or small—just let me know what’s on your mind, and I’ll do my best to assist you!\n    {context}\n    Question: {question}\n    Answer:')

In [3]:
# Sanity-check the template by filling both placeholders with sample values.
PROMPT.format(
    context = "My name is Patsachon, you can called me Natasha",
    question = "What is your name?"
)

'Hello! I’m your trusty PattyBot, here to lend a hand to you and your friends with any questions or tasks you’ve got! \n    Whether you’re puzzling over life’s big mysteries, need help with a project, or just want some practical advice, \n    I’m here to break things down and offer clear, helpful answers. \n    No topic’s too big or small—just let me know what’s on your mind, and I’ll do my best to assist you!\n    My name is Patsachon, you can called me Natasha\n    Question: What is your name?\n    Answer:'

## Retrieval

### Document Loaders 

In [4]:
from langchain.document_loaders import PyMuPDFLoader

# Source PDF that serves as the knowledge base for retrieval.
nlp_docs = './Personal_Info-4.pdf'

# Load the PDF into LangChain Document objects (page text plus metadata).
loader = PyMuPDFLoader(nlp_docs)
documents = loader.load()

In [6]:
len(documents)

1

In [7]:
documents[0]

Document(metadata={'producer': 'Skia/PDF m135 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': './Personal_Info-4.pdf', 'file_path': './Personal_Info-4.pdf', 'total_pages': 1, 'format': 'PDF 1.4', 'title': 'Personal_Info', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content="PERSONAL INFORMATION \nAge: 23 years old \nEDUCATION \nCurrent: Master's Degree (In Progress) \n●\u200b Field: Data Science and AI \n●\u200b Institution: AIT (Asian Institute of Technology) \nCompleted: Bachelor's Degree \n●\u200b Major: Financial Engineering \n●\u200b Institution: KMITL \nPROFESSIONAL EXPERIENCE \nPosition: Governance Researcher \n●\u200b Industry: Cryptocurrency \n●\u200b work experience: 3 years \n●\u200b Responsibilities:Researched cryptocurrency governance policies and analyzed \ndecision-making processes in the crypto space \nRESEARCH INTERESTS \n●\u200b Exploring decentralized finance (DeFi)

### Document Transformers

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded pages into chunks with a size limit of 700 and an overlap
# of 100 between neighbouring chunks (presumably measured in characters, the
# splitter's default length measure).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 700,
    chunk_overlap = 100
)

# NOTE: despite the singular name, `doc` holds the list of chunked Documents.
doc = text_splitter.split_documents(documents)

In [9]:
doc[0]

Document(metadata={'producer': 'Skia/PDF m135 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': './Personal_Info-4.pdf', 'file_path': './Personal_Info-4.pdf', 'total_pages': 1, 'format': 'PDF 1.4', 'title': 'Personal_Info', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content="PERSONAL INFORMATION \nAge: 23 years old \nEDUCATION \nCurrent: Master's Degree (In Progress) \n●\u200b Field: Data Science and AI \n●\u200b Institution: AIT (Asian Institute of Technology) \nCompleted: Bachelor's Degree \n●\u200b Major: Financial Engineering \n●\u200b Institution: KMITL \nPROFESSIONAL EXPERIENCE \nPosition: Governance Researcher \n●\u200b Industry: Cryptocurrency \n●\u200b work experience: 3 years \n●\u200b Responsibilities:Researched cryptocurrency governance policies and analyzed \ndecision-making processes in the crypto space \nRESEARCH INTERESTS \n●\u200b Exploring decentralized finance (DeFi)

In [10]:
len(doc)

2

###  Text Embedding Models

In [11]:
import torch
from langchain.embeddings import HuggingFaceInstructEmbeddings

# Hugging Face model id of the embedding model used for the vector store.
model_name = 'hkunlp/instructor-base'

# Run the embedding model on the device selected in the setup cell.
embedding_model = HuggingFaceInstructEmbeddings(
    model_name=model_name,
    model_kwargs={"device": device}
)


  from tqdm.autonotebook import trange
  _torch_pytree._register_pytree_node(


load INSTRUCTOR_Transformer


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


max_seq_length  512


### Vector Stores

In [12]:
# Directory that holds the on-disk vector store.
vector_path = './vector-store'

# Remember whether the directory was already there so the message is printed
# only when this run actually creates it.
path_already_present = os.path.isdir(vector_path)
# exist_ok=True makes creation idempotent and removes the check-then-create
# race; it also fails loudly if a regular file is in the way.
os.makedirs(vector_path, exist_ok=True)
if not path_already_present:
    print('create path done')

In [13]:
#save vector locally
from langchain.vectorstores import FAISS

# Embed every chunk with `embedding_model` and build a FAISS vector store.
vectordb = FAISS.from_documents(
    documents = doc,
    embedding = embedding_model
)

# NOTE(review): the folder name looks like a leftover from another notebook;
# the indexed content here is the personal-info PDF. Consider renaming.
db_file_name = 'nlp_stanford'

# Persist the index under <vector_path>/<db_file_name>.
vectordb.save_local(
    folder_path = os.path.join(vector_path, db_file_name),
    index_name = 'nlp' # custom index name; load_local must be given the same one
)

### Retrievers

In [14]:
#calling vector from local
# Same location and folder name that were used when the index was saved.
vector_path = './vector-store'
db_file_name = 'nlp_stanford'

from langchain.vectorstores import FAISS

# Reload the persisted index, passing the same embedding model that was used
# to build it.
vectordb = FAISS.load_local(
    folder_path = os.path.join(vector_path, db_file_name),
    embeddings = embedding_model,
    index_name = 'nlp', # same index name that save_local was given
    # This flag permits pickle-based deserialization of the stored files.
    # That is acceptable here only because this notebook wrote them itself;
    # never enable it for index files from an untrusted source.
    allow_dangerous_deserialization=True
)  

In [15]:
#ready to use
# Wrap the vector store in a retriever, using its default search settings.
retriever = vectordb.as_retriever()

In [16]:
# `invoke` is the supported retriever entry point; `get_relevant_documents`
# is deprecated and emits a warning on every call.
retriever.invoke("How old are you?")

  retriever.get_relevant_documents("How old are you?")


[Document(id='7248ad96-26fe-4ac0-b3f7-679145556e49', metadata={'producer': 'Skia/PDF m135 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': './Personal_Info-4.pdf', 'file_path': './Personal_Info-4.pdf', 'total_pages': 1, 'format': 'PDF 1.4', 'title': 'Personal_Info', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content="PERSONAL INFORMATION \nAge: 23 years old \nEDUCATION \nCurrent: Master's Degree (In Progress) \n●\u200b Field: Data Science and AI \n●\u200b Institution: AIT (Asian Institute of Technology) \nCompleted: Bachelor's Degree \n●\u200b Major: Financial Engineering \n●\u200b Institution: KMITL \nPROFESSIONAL EXPERIENCE \nPosition: Governance Researcher \n●\u200b Industry: Cryptocurrency \n●\u200b work experience: 3 years \n●\u200b Responsibilities:Researched cryptocurrency governance policies and analyzed \ndecision-making processes in the crypto space \nRESEARCH INTERESTS \n●\

In [17]:
# `invoke` is the supported retriever entry point; `get_relevant_documents`
# is deprecated and emits a warning on every call.
retriever.invoke("What is your highest level of education?")

[Document(id='7248ad96-26fe-4ac0-b3f7-679145556e49', metadata={'producer': 'Skia/PDF m135 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': './Personal_Info-4.pdf', 'file_path': './Personal_Info-4.pdf', 'total_pages': 1, 'format': 'PDF 1.4', 'title': 'Personal_Info', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content="PERSONAL INFORMATION \nAge: 23 years old \nEDUCATION \nCurrent: Master's Degree (In Progress) \n●\u200b Field: Data Science and AI \n●\u200b Institution: AIT (Asian Institute of Technology) \nCompleted: Bachelor's Degree \n●\u200b Major: Financial Engineering \n●\u200b Institution: KMITL \nPROFESSIONAL EXPERIENCE \nPosition: Governance Researcher \n●\u200b Industry: Cryptocurrency \n●\u200b work experience: 3 years \n●\u200b Responsibilities:Researched cryptocurrency governance policies and analyzed \ndecision-making processes in the crypto space \nRESEARCH INTERESTS \n●\

## Memory


In [18]:
from langchain.memory import ChatMessageHistory

history = ChatMessageHistory()
history

InMemoryChatMessageHistory(messages=[])

In [19]:
# Seed the history with two human/AI exchanges, in conversation order.
for human_text, ai_text in (
    ('hi', 'Whats up?'),
    ('How are you', "I'm quite good. How about you?"),
):
    history.add_user_message(human_text)
    history.add_ai_message(ai_text)

In [20]:
history

InMemoryChatMessageHistory(messages=[HumanMessage(content='hi', additional_kwargs={}, response_metadata={}), AIMessage(content='Whats up?', additional_kwargs={}, response_metadata={}), HumanMessage(content='How are you', additional_kwargs={}, response_metadata={}), AIMessage(content="I'm quite good. How about you?", additional_kwargs={}, response_metadata={})])

#### Conversation Buffer

In [21]:
from langchain.memory import ConversationBufferMemory

# Plain buffer memory: the stored conversation comes back as a single string.
memory = ConversationBufferMemory()
for user_turn, ai_turn in (
    ('hi', "What's up?"),
    ('How are you?', "I'm quite good. How about you?"),
):
    memory.save_context({'input': user_turn}, {'output': ai_turn})
memory.load_memory_variables({})

  memory = ConversationBufferMemory()


{'history': "Human: hi\nAI: What's up?\nHuman: How are you?\nAI: I'm quite good. How about you?"}

In [22]:
from langchain.memory import ConversationBufferMemory

# Same buffer memory, but the history is returned as a list of message objects.
memory = ConversationBufferMemory(return_messages = True)
for user_turn, ai_turn in (
    ('hi', "What's up?"),
    ('How are you?', "I'm quite good. How about you?"),
):
    memory.save_context({'input': user_turn}, {'output': ai_turn})
memory.load_memory_variables({})

{'history': [HumanMessage(content='hi', additional_kwargs={}, response_metadata={}),
  AIMessage(content="What's up?", additional_kwargs={}, response_metadata={}),
  HumanMessage(content='How are you?', additional_kwargs={}, response_metadata={}),
  AIMessage(content="I'm quite good. How about you?", additional_kwargs={}, response_metadata={})]}

#### Conversation Buffer Window

In [23]:
from langchain.memory import ConversationBufferWindowMemory

# Windowed memory with k=1: only the most recent exchange is returned.
memory = ConversationBufferWindowMemory(k=1)
for user_turn, ai_turn in (
    ('hi', "What's up?"),
    ('How are you?', "I'm quite good. How about you?"),
):
    memory.save_context({'input': user_turn}, {'output': ai_turn})
memory.load_memory_variables({})

  memory = ConversationBufferWindowMemory(k=1)


{'history': "Human: How are you?\nAI: I'm quite good. How about you?"}

## Chain

In [24]:
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM
from transformers import BitsAndBytesConfig
from langchain import HuggingFacePipeline
import torch

# Generator model used for both question condensing and answering.
model_id = 'lmsys/fastchat-t5-3b-v1.0'

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Reuse the EOS token as the padding token (kept from the original setup).
tokenizer.pad_token_id = tokenizer.eos_token_id

# 4-bit NF4 quantization with double quantization and float16 compute dtype.
bitsandbyte_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.float16,
    bnb_4bit_use_double_quant = True
)

# Quantization is configured solely through `quantization_config`. The
# previous call also passed `load_in_8bit=True`, which contradicts the 4-bit
# config above; newer transformers releases reject that combination.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_id,
    quantization_config = bitsandbyte_config, # caution: bitsandbytes needs an NVIDIA GPU
    device_map = 'auto'
)

# Generation settings are passed as pipeline keyword arguments so that they
# reach `generate()`. `model_kwargs` is only used when the pipeline loads the
# model itself, so settings placed there are not applied to a model that is
# already instantiated.
pipe = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens = 256,
    do_sample = False,         # greedy decoding, the intent of "temperature = 0"
    repetition_penalty = 1.5
)

# Expose the transformers pipeline through LangChain's LLM interface.
llm = HuggingFacePipeline(pipeline = pipe)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  llm = HuggingFacePipeline(pipeline = pipe)


### [Class ConversationalRetrievalChain](https://api.python.langchain.com/en/latest/_modules/langchain/chains/conversational_retrieval/base.html#ConversationalRetrievalChain)


`question_generator`

In [25]:
from langchain.chains import LLMChain
from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT
from langchain.memory import ConversationBufferWindowMemory
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import ConversationalRetrievalChain

In [26]:
CONDENSE_QUESTION_PROMPT

PromptTemplate(input_variables=['chat_history', 'question'], input_types={}, partial_variables={}, template='Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.\n\nChat History:\n{chat_history}\nFollow Up Input: {question}\nStandalone question:')

In [27]:
# Question-condensing step: CONDENSE_QUESTION_PROMPT asks the LLM to rewrite a
# follow-up question plus the chat history into a standalone question.
question_generator = LLMChain(
    llm = llm,
    prompt = CONDENSE_QUESTION_PROMPT,
    verbose = True # print the fully formatted prompt on every call
)

  question_generator = LLMChain(


In [28]:
# A follow-up that is only meaningful together with the preceding turns.
query = 'Comparing both of them'
chat_history = "Human:What is Machine Learning\nAI:\nHuman:What is Deep Learning\nAI:"

# Use `.invoke(...)`: calling a chain object directly is deprecated in LangChain.
question_generator.invoke({'chat_history' : chat_history, "question" : query})

  question_generator({'chat_history' : chat_history, "question" : query})




[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:
Human:What is Machine Learning
AI:
Human:What is Deep Learning
AI:
Follow Up Input: Comparing both of them
Standalone question:[0m

[1m> Finished chain.[0m


{'chat_history': 'Human:What is Machine Learning\nAI:\nHuman:What is Deep Learning\nAI:',
 'question': 'Comparing both of them',
 'text': '<pad> What  is  the  difference  between  Machine  Learning  and  Deep  Learning  AI?\n'}

`combine_docs_chain`

In [29]:
# Answering step: a "stuff" QA chain that places the retrieved documents into
# PROMPT's {context} slot and asks the LLM for the answer.
# NOTE(review): load_qa_chain prints a deprecation notice with migration
# links (see the cell output); plan a migration.
doc_chain = load_qa_chain(
    llm = llm,
    chain_type = 'stuff',
    prompt = PROMPT,
    verbose = True
)
doc_chain

stuff: https://python.langchain.com/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/docs/how_to/#qa-with-rag
  doc_chain = load_qa_chain(


StuffDocumentsChain(verbose=True, llm_chain=LLMChain(verbose=True, prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Hello! I’m your trusty PattyBot, here to lend a hand to you and your friends with any questions or tasks you’ve got! \n    Whether you’re puzzling over life’s big mysteries, need help with a project, or just want some practical advice, \n    I’m here to break things down and offer clear, helpful answers. \n    No topic’s too big or small—just let me know what’s on your mind, and I’ll do my best to assist you!\n    {context}\n    Question: {question}\n    Answer:'), llm=HuggingFacePipeline(pipeline=<transformers.pipelines.text2text_generation.Text2TextGenerationPipeline object at 0x7b5076931a00>), output_parser=StrOutputParser(), llm_kwargs={}), document_prompt=PromptTemplate(input_variables=['page_content'], input_types={}, partial_variables={}, template='{page_content}'), document_variable_name='context')

In [2]:
query = "How old are you"
# `.invoke(...)` replaces the deprecated `get_relevant_documents(...)` and
# direct `chain(...)` call styles.
# NOTE: this cell needs `retriever` and `doc_chain` from the cells above, so
# run the notebook top-to-bottom after a kernel restart.
input_document = retriever.invoke(query)

doc_chain.invoke({'input_documents':input_document, 'question':query})

NameError: name 'retriever' is not defined

In [31]:
# Sliding-window memory for the chatbot, configured with k=3.
memory = ConversationBufferWindowMemory(
    k=3, 
    memory_key = "chat_history",  # name under which the history is exposed
    return_messages = True,
    output_key = 'answer'  # which chain output to store (source documents are also returned)
)

# Full RAG pipeline: condense the question, retrieve chunks, then answer with
# doc_chain.
chain = ConversationalRetrievalChain(
    retriever=retriever,
    question_generator=question_generator,
    combine_docs_chain=doc_chain,
    return_source_documents=True,
    memory=memory,
    verbose=True,
    # Pass the history through unchanged.
    # NOTE(review): with return_messages=True this is presumably a list of
    # message objects rather than a string. Confirm the condensed prompt
    # renders as intended.
    get_chat_history=lambda h : h
)
chain

  chain = ConversationalRetrievalChain(


ConversationalRetrievalChain(memory=ConversationBufferWindowMemory(chat_memory=InMemoryChatMessageHistory(messages=[]), output_key='answer', return_messages=True, memory_key='chat_history', k=3), verbose=True, combine_docs_chain=StuffDocumentsChain(verbose=True, llm_chain=LLMChain(verbose=True, prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Hello! I’m your trusty PattyBot, here to lend a hand to you and your friends with any questions or tasks you’ve got! \n    Whether you’re puzzling over life’s big mysteries, need help with a project, or just want some practical advice, \n    I’m here to break things down and offer clear, helpful answers. \n    No topic’s too big or small—just let me know what’s on your mind, and I’ll do my best to assist you!\n    {context}\n    Question: {question}\n    Answer:'), llm=HuggingFacePipeline(pipeline=<transformers.pipelines.text2text_generation.Text2TextGenerationPipeline object at 0x7b507

In [32]:
# Evaluation questions for the chatbot. The analysis loop matches on these
# exact strings, so keep the two in sync when editing.
test_queries = [
    "How old are you?",
    "What is your highest level of education?",
    "What major or field of study did you pursue during your education?",
    "How many years of work experience do you have?",
    "What type of work or industry have you been involved in?",
    "Can you describe your current role or job responsibilities?",
    "What are your core beliefs regarding the role of technology in shaping society?",
    "How do you think cultural values should influence technological advancements?",
    "As a master’s student, what is the most challenging aspect of your studies so far?",
    "What specific research interests or academic goals do you hope to achieve during your time as a master’s student?"
]

In [33]:
# Function to clean answer (remove <pad> tags and newlines)
def clean_answer(answer):
    """Remove ``<pad>`` tokens and line breaks from a generated answer.

    Args:
        answer: Raw text produced by the generator model.

    Returns:
        The text without ``<pad>`` markers, with newlines replaced by spaces,
        and with leading/trailing whitespace stripped.
    """
    without_pad = answer.replace("<pad>", "")
    # The previous version replaced "\\n" (a literal backslash followed by
    # "n"), which never matches a real newline. Real newlines are handled
    # here; the escaped two-character form is still replaced so existing
    # behaviour is kept.
    return without_pad.replace("\n", " ").replace("\\n", " ").strip()

In [1]:
# --- Generator Model Analysis ---
# Lookup table: query text -> (predicate on the cleaned answer, message to
# print when the predicate holds). A query that is missing from the table, or
# whose predicate fails, is reported with `fallback_message`.
relevance_checks = {
    "How old are you?": (
        lambda text: "23" in text,
        "  - Relevant: Accurate age from document.",
    ),
    "What is your highest level of education?": (
        lambda text: "Master's" in text,
        "  - Relevant: Accurate education level from document.",
    ),
    "What major or field of study did you pursue during your education?": (
        lambda text: "Financial Engineering" in text or "Data Science" in text,
        "  - Relevant: Accurate major/field from document.",
    ),
    "How many years of work experience do you have?": (
        lambda text: "3" in text,
        "  - Relevant: Accurate work experience from document.",
    ),
    "What type of work or industry have you been involved in?": (
        lambda text: "Cryptocurrency" in text,
        "  - Relevant: Accurate industry from document.",
    ),
    "Can you describe your current role or job responsibilities?": (
        lambda text: "Governance Researcher" in text and "cryptocurrency" in text,
        "  - Relevant: Accurate role and responsibilities from document.",
    ),
    "What are your core beliefs regarding the role of technology in shaping society?": (
        lambda text: "democratize opportunities" in text,
        "  - Relevant: Accurate philosophy from document.",
    ),
    "How do you think cultural values should influence technological advancements?": (
        lambda text: "respect diversity" in text,
        "  - Relevant: Accurate cultural values from document.",
    ),
    "As a master’s student, what is the most challenging aspect of your studies so far?": (
        lambda text: "market dynamics" in text,
        "  - Relevant: Accurate challenge from document.",
    ),
    "What specific research interests or academic goals do you hope to achieve during your time as a master’s student?": (
        lambda text: "DeFi" in text or "cryptocurrency" in text,
        "  - Relevant: Accurate research interests from document.",
    ),
}
fallback_message = "  - Issue: Response may include hallucinated or unrelated details not in the document."

print("\nGenerator Model Analysis ('fastchat-t5-3b-v1.0')")
print("=" * 60)
for query in test_queries:
    print(f"\nQuery: '{query}'")
    print("-" * 40)

    # Run the query through the full conversational retrieval chain.
    result = chain({"question": query})
    cleaned_answer = clean_answer(result['answer'])
    print(f"Generated Answer: {cleaned_answer}")

    # Report whether the answer contains the expected keywords for this query.
    print("Analysis:")
    predicate, success_message = relevance_checks.get(query, (None, None))
    if predicate is not None and predicate(cleaned_answer):
        print(success_message)
    else:
        print(fallback_message)

print("\nGenerator Summary:")
print("- The 'fastchat-t5-3b-v1.0' model generates answers based on retrieved chunks and its pre-trained knowledge.")
print("- Issue: It may hallucinate or over-elaborate (e.g., adding speculative details) when the retrieved context is vague or insufficient, especially for broad questions.")
print("=" * 60)

KeyboardInterrupt: 

## Chatbot

In [36]:
import re
import json

qa_pairs = []

def clean_answer(answer):
    """Strip padding markers from a model answer and normalise its whitespace.

    Removes every ``<pad>`` token, then any leftover ``pad>`` fragment, and
    finally collapses each run of whitespace into a single space, with no
    leading or trailing whitespace in the result.
    """
    # Order matters: full markers first, then truncated leftovers.
    for marker in ('<pad>', 'pad>'):
        answer = answer.replace(marker, '')
    # split() with no arguments drops leading/trailing whitespace and treats
    # any whitespace run as one separator.
    return ' '.join(answer.split())


In [56]:
# Clean the text stored under answer['answer'] and record it, together with
# the question, as one entry in qa_pairs.
# NOTE(review): `answer` and `prompt_question` are not defined in any cell
# visible here. Presumably they come from a chat cell that is not shown;
# confirm they exist when the notebook is run top-to-bottom.
cleaned_answer = clean_answer(answer['answer'])
qa_pair = {
    "question": prompt_question,
    "answer": cleaned_answer 
}
qa_pairs.append(qa_pair)