In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
pip install transformers PyPDF2 langchain pypdf




In [11]:
pip install langchain pypdf tiktoken openai faiss-cpu



### Long Question Answer Generation

In [5]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import QAGenerationChain
from langchain.text_splitter import TokenTextSplitter
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma, FAISS
from langchain.chains.summarize import load_summarize_chain
from langchain.chains import RetrievalQA
import os
import pandas as pd

# os.environ["OPENAI_API_KEY"] = "sk-"git

# Set file path
file_path = '/content/drive/MyDrive/Intellify/Big Mac Index.pdf'

# Load data from PDF
loader = PyPDFLoader(file_path)
data = loader.load()

question_gen = ''

for page in data:
    question_gen += page.page_content

splitter_ques_gen = TokenTextSplitter(
    model_name = 'gpt-3.5-turbo',
    chunk_size = 10000,
    chunk_overlap = 200
)

chunks_ques_gen = splitter_ques_gen.split_text(question_gen)

document_ques_gen = [Document(page_content=t) for t in chunks_ques_gen]

splitter_ans_gen = TokenTextSplitter(
    model_name = 'gpt-3.5-turbo',
    chunk_size = 1000,
    chunk_overlap = 100
)


document_answer_gen = splitter_ans_gen.split_documents(
    document_ques_gen
)

llm_ques_gen_pipeline = ChatOpenAI(
    temperature = 0.1,
    model = "gpt-3.5-turbo"
)

prompt_template = """
You are an expert at creating questions based on coding materials and documentation.
Your goal is to prepare a coder or programmer for their exam and coding tests.
You do this by asking questions about the text below:

------------
{text}
------------

Create questions that will prepare the coders or programmers for their tests.
Make sure not to lose any important information.

QUESTIONS:
"""

PROMPT_QUESTIONS = PromptTemplate(template=prompt_template, input_variables=["text"])

refine_template = ("""
You are an expert at creating practice questions based on coding material and documentation.
Your goal is to help a coder or programmer prepare for a coding test.
We have received some practice questions to a certain extent: {existing_answer}.
We have the option to refine the existing questions or add new ones.
(only if necessary) with some more context below.
------------
{text}
------------

Given the new context, refine the original questions in English.
If the context is not helpful, please provide the original questions.
QUESTIONS:
"""
)

REFINE_PROMPT_QUESTIONS = PromptTemplate(
    input_variables=["existing_answer", "text"],
    template=refine_template,
)
#question generation
ques_gen_chain = load_summarize_chain(llm = llm_ques_gen_pipeline,
                                          chain_type = "refine",
                                          verbose = True,
                                          question_prompt=PROMPT_QUESTIONS,
                                          refine_prompt=REFINE_PROMPT_QUESTIONS)

ques = ques_gen_chain.run(document_ques_gen)
print(ques)


embeddings = OpenAIEmbeddings()

vector_store = FAISS.from_documents(document_answer_gen, embeddings)
#generate answer
llm_answer_gen = ChatOpenAI(temperature=0.1, model="gpt-3.5-turbo")

ques_list = ques.split("\n")

ques_list

answer_generation_chain = RetrievalQA.from_chain_type(llm=llm_answer_gen,
                                               chain_type="stuff",
                                               retriever=vector_store.as_retriever())




qa_pairs = []

# Answer each question and save to a file
for question in ques_list:
    print("Question: ", question)
    answer = answer_generation_chain.run(question)
    print("Answer: ", answer)
    print("--------------------------------------------------\n\n")
    # Append question-answer pair to the list
    qa_pairs.append({'Question': question, 'Answer': answer})

# Create a DataFrame from the list of question-answer pairs
df = pd.DataFrame(qa_pairs)

# Display the DataFrame
df



[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
You are an expert at creating questions based on coding materials and documentation.
Your goal is to prepare a coder or programmer for their exam and coding tests.
You do this by asking questions about the text below:

------------
Big
Mac
Index
The
Big
Mac
Index
is
a
price
index
published
since
1986
by
The
Economist
as
an
informal
way
of
measuring
the
purchasing
power
parity
(PPP)
between
two
currencies
and
providing
a
test
of
the
extent
to
which
market
exchange
rates
result
in
goods
costing
the
same
in
different
countries.
It
"seeks
to
make
exchange-rate
theory
a
bit
more
digestible."
The
index
compares
the
relative
price
worldwide
to
purchase
the
Big
Mac,
a
hamburger
sold
at
McDonald's
restaurants.
Overview
The
Big
Mac
index
was
introduced
in
The
Economist
in
September
1986
by
Pam
Woodall
as
a
semi-humorous
illustration
of
PPP
and
has
been
publishe

Unnamed: 0,Question,Answer
0,1. What is the purpose of the Big Mac Index?,The purpose of the Big Mac Index is to calcula...
1,2. Who introduced the Big Mac Index in The Eco...,The Big Mac Index was introduced in The Econom...
2,3. How is the implied exchange rate calculated...,The implied exchange rate using the Big Mac In...
3,4. How can the Big Mac Index be used to analyz...,The Big Mac Index can be used to analyze a cur...
4,5. What are some limitations of the Big Mac In...,Some limitations of the Big Mac Index methodol...
5,6. What is the significance of the Big Mac Pay...,The Big Mac Pay Gap Index introduced by Trusai...
6,7. How did the government of Argentina manipul...,"The government of Argentina, specifically Secr..."
7,8. How do the nutritional values and ingredien...,The nutritional values and ingredients of Big ...
8,9. What factors can influence the price of a B...,"According to the text, factors that can influe..."
9,10. How does the presence of McDonald's franch...,The presence of McDonald's franchises impacts ...


In [6]:
excel_file_path = "/content/drive/MyDrive/Intellify/question-answers.xlsx"

# Convert DataFrame to Excel and save
df.to_excel(excel_file_path, index=False)

print("DataFrame successfully saved to Excel file:", excel_file_path)


DataFrame successfully saved to Excel file: /content/drive/MyDrive/Intellify/question-answers.xlsx


In [None]:
pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


### One Word Answer

In [12]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import QAGenerationChain
from langchain.text_splitter import TokenTextSplitter
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma, FAISS
from langchain.chains.summarize import load_summarize_chain
from langchain.chains import RetrievalQA
import os
import pandas as pd

os.environ["OPENAI_API_KEY"] = "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"

# Set file path
file_path = '/content/drive/MyDrive/Intellify/Big Mac Index.pdf'

# Load data from PDF
loader = PyPDFLoader(file_path)
data = loader.load()

question_gen = ''

for page in data:
    question_gen += page.page_content

splitter_ques_gen = TokenTextSplitter(
    model_name = 'gpt-3.5-turbo',
    chunk_size = 1000,
    chunk_overlap = 200
)

chunks_ques_gen = splitter_ques_gen.split_text(question_gen)

document_ques_gen = [Document(page_content=t) for t in chunks_ques_gen]

splitter_ans_gen = TokenTextSplitter(
    model_name = 'gpt-3.5-turbo',
    chunk_size = 1000,
    chunk_overlap = 100
)


document_answer_gen = splitter_ans_gen.split_documents(
    document_ques_gen
)

llm_ques_gen_pipeline = ChatOpenAI(
    temperature = 0.1,
    model = "gpt-3.5-turbo"
)

prompt_template = ("""
You are creating questions to test knowledge on the content below:

------------
{text}
------------

Generate questions with one-word answers that assess understanding of the material.

QUESTIONS:
"""
)
PROMPT_QUESTIONS = PromptTemplate(template=prompt_template, input_variables=["text"])

refine_template = ("""
You are refining questions based on the provided context:

------------
{text}
------------

Refine the questions to elicit one-word answers.

QUESTIONS:
"""
)
REFINE_PROMPT_QUESTIONS = PromptTemplate(
    input_variables=["existing_answer", "text"],
    template=refine_template,
)
#question generation
ques_gen_chain = load_summarize_chain(llm = llm_ques_gen_pipeline,
                                          chain_type = "refine",
                                          verbose = True,
                                          question_prompt=PROMPT_QUESTIONS,
                                          refine_prompt=REFINE_PROMPT_QUESTIONS)

ques = ques_gen_chain.run(document_ques_gen)
print(ques)


embeddings = OpenAIEmbeddings()

vector_store = FAISS.from_documents(document_answer_gen, embeddings)
#generate answer
llm_answer_gen = ChatOpenAI(temperature=0.1, model="gpt-3.5-turbo")

ques_list = ques.split("\n")

ques_list

answer_generation_chain = RetrievalQA.from_chain_type(llm=llm_answer_gen,
                                               chain_type="stuff",
                                               retriever=vector_store.as_retriever())




qa_pairs = []

# Answer each question and save to a file
for question in ques_list:
    print("Question: ", question)
    answer = answer_generation_chain.run(question)
    print("Answer: ", answer)
    print("--------------------------------------------------\n\n")
    # Append question-answer pair to the list
    qa_pairs.append({'Question': question, 'Answer': answer})

# Create a DataFrame from the list of question-answer pairs
df = pd.DataFrame(qa_pairs)

# Display the DataFrame
df



[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
You are creating questions to test knowledge on the content below:

------------
Big
Mac
Index
The
Big
Mac
Index
is
a
price
index
published
since
1986
by
The
Economist
as
an
informal
way
of
measuring
the
purchasing
power
parity
(PPP)
between
two
currencies
and
providing
a
test
of
the
extent
to
which
market
exchange
rates
result
in
goods
costing
the
same
in
different
countries.
It
"seeks
to
make
exchange-rate
theory
a
bit
more
digestible."
The
index
compares
the
relative
price
worldwide
to
purchase
the
Big
Mac,
a
hamburger
sold
at
McDonald's
restaurants.
Overview
The
Big
Mac
index
was
introduced
in
The
Economist
in
September
1986
by
Pam
Woodall
as
a
semi-humorous
illustration
of
PPP
and
has
been
published
by
that
paper
annually
since
then.
Although
the
Big
Mac
Index
was
not
intended
to
be
a
legitimate
tool
for
exchange
rate
evaluation,
it
is
now
global

Unnamed: 0,Question,Answer
0,1. Which country had the most expensive Big Ma...,Switzerland had the most expensive Big Mac in ...
1,2. Which country had the cheapest Big Mac in J...,"In July 2023, Taiwan had the cheapest Big Mac ..."
2,3. In which city was the average working time ...,"In July 2015, the city with the fastest averag..."
3,4. In which city was the average working time ...,"In July 2015, the city where the average worki..."


In [13]:
# Define the file path for the Excel file
excel_file_path = "/content/drive/MyDrive/Intellify/one_word_answer.xlsx"

# Convert DataFrame to Excel and save
df.to_excel(excel_file_path, index=False)

print("DataFrame successfully saved to Excel file:", excel_file_path)


DataFrame successfully saved to Excel file: /content/drive/MyDrive/Intellify/one_word_answer.xlsx


### True/False

In [15]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import QAGenerationChain
from langchain.text_splitter import TokenTextSplitter
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma, FAISS
from langchain.chains.summarize import load_summarize_chain
from langchain.chains import RetrievalQA
import os
import pandas as pd

os.environ["OPENAI_API_KEY"] = "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"

# Set file path
file_path = '/content/drive/MyDrive/Intellify/Big Mac Index.pdf'

# Load data from PDF
loader = PyPDFLoader(file_path)
data = loader.load()

question_gen = ''

for page in data:
    question_gen += page.page_content

splitter_ques_gen = TokenTextSplitter(
    model_name = 'gpt-3.5-turbo',
    chunk_size = 100,
    chunk_overlap = 20
)

chunks_ques_gen = splitter_ques_gen.split_text(question_gen)

document_ques_gen = [Document(page_content=t) for t in chunks_ques_gen]

splitter_ans_gen = TokenTextSplitter(
    model_name = 'gpt-3.5-turbo',
    chunk_size = 100,
    chunk_overlap = 10
)


document_answer_gen = splitter_ans_gen.split_documents(
    document_ques_gen
)

llm_ques_gen_pipeline = ChatOpenAI(
    temperature = 0.1,
    model = "gpt-3.5-turbo"
)

prompt_template = ("""
You are creating True/False questions to test knowledge on the content below:

------------
{text}
------------

State whether the following statements are True or False:

1. True: [Statement 1]
2. True: [Statement 2]
3. True: [Statement 3]
...
N. True: [Statement N]

1. False: [Statement 1]
2. False: [Statement 2]
3. False: [Statement 3]
...
N. False: [Statement N]

Respond with "True" or "False" for each statement.

"""
)
PROMPT_QUESTIONS = PromptTemplate(template=prompt_template, input_variables=["text"])

refine_template = ("""
You are refining True/False questions based on the provided context:

------------
{text}
------------

Refine the statements to elicit True or False responses.

STATEMENTS:

1. [Statement 1]: [True/False]
2. [Statement 2]: [True/False]
3. [Statement 3]: [True/False]
...
N. [Statement N]: [True/False]

"""
)
REFINE_PROMPT_QUESTIONS = PromptTemplate(
    input_variables=["existing_answer", "text"],
    template=refine_template,
)
#question generation
ques_gen_chain = load_summarize_chain(llm = llm_ques_gen_pipeline,
                                          chain_type = "refine",
                                          verbose = True,
                                          question_prompt=PROMPT_QUESTIONS,
                                          refine_prompt=REFINE_PROMPT_QUESTIONS)

ques = ques_gen_chain.run(document_ques_gen)
print(ques)


embeddings = OpenAIEmbeddings()

vector_store = FAISS.from_documents(document_answer_gen, embeddings)
#generate answer
llm_answer_gen = ChatOpenAI(temperature=0.1, model="gpt-3.5-turbo")

ques_list = ques.split("\n")

ques_list

answer_generation_chain = RetrievalQA.from_chain_type(llm=llm_answer_gen,
                                               chain_type="stuff",
                                               retriever=vector_store.as_retriever())




qa_pairs = []

# Answer each question and save to a file
for question in ques_list:
    print("Question: ", question)
    answer = answer_generation_chain.run(question)
    print("Answer: ", answer)
    print("--------------------------------------------------\n\n")
    # Append question-answer pair to the list
    qa_pairs.append({'Question': question, 'Answer': answer})

# Create a DataFrame from the list of question-answer pairs
df = pd.DataFrame(qa_pairs)

# Display the DataFrame
df



[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
You are creating True/False questions to test knowledge on the content below:

------------
Big
Mac
Index
The
Big
Mac
Index
is
a
price
index
published
since
1986
by
The
Economist
as
an
informal
way
of
measuring
the
purchasing
power
parity
(PPP)
between
two
currencies
and
providing
a
test
of
the
extent
to
which
market
exchange
rates
result
in
------------

State whether the following statements are True or False:

1. True: [Statement 1]
2. True: [Statement 2]
3. True: [Statement 3]
...
N. True: [Statement N]

1. False: [Statement 1]
2. False: [Statement 2]
3. False: [Statement 3]
...
N. False: [Statement N]

Respond with "True" or "False" for each statement.

[0m

[1m> Finished chain.[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
You are refining True/False questions based on the provided context:

------------

Unnamed: 0,Question,Answer
0,1. Mexico City has an average commute time of ...,True.
1,2. Jakarta's average commute time is less than...,True. Jakarta's average commute time is 66.7 m...
2,3. Cairo's average commute time is 62.5 minute...,False. The average working time required to bu...
3,4. Kyiv has a shorter average commute time tha...,True. Kyiv has an average commute time of 54.7...


In [18]:
# Define the file path for the Excel file
excel_file_path = "/content/drive/MyDrive/Intellify/True-False_question_answer.xlsx"

# Convert DataFrame to Excel and save
df.to_excel(excel_file_path, index=False)

print("DataFrame successfully saved to Excel file:", excel_file_path)


DataFrame successfully saved to Excel file: /content/drive/MyDrive/Intellify/True-False_question_answer.xlsx


### MCQ Question Answer Generation

In [27]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import QAGenerationChain
from langchain.text_splitter import TokenTextSplitter
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma, FAISS
from langchain.chains.summarize import load_summarize_chain
from langchain.chains import RetrievalQA
import os
import pandas as pd

os.environ["OPENAI_API_KEY"] = "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"

# Set file path
file_path = '/content/drive/MyDrive/Intellify/Big Mac Index.pdf'

# Load data from PDF
loader = PyPDFLoader(file_path)
data = loader.load()

question_gen = ''

for page in data:
    question_gen += page.page_content

splitter_ques_gen = TokenTextSplitter(
    model_name = 'gpt-3.5-turbo',
    chunk_size = 100,
    chunk_overlap = 20
)

chunks_ques_gen = splitter_ques_gen.split_text(question_gen)

document_ques_gen = [Document(page_content=t) for t in chunks_ques_gen]

splitter_ans_gen = TokenTextSplitter(
    model_name = 'gpt-3.5-turbo',
    chunk_size = 100,
    chunk_overlap = 10
)


document_answer_gen = splitter_ans_gen.split_documents(
    document_ques_gen
)

llm_ques_gen_pipeline = ChatOpenAI(
    temperature = 0.1,
    model = "gpt-3.5-turbo"
)

prompt_template = ("""
You are creating Multiple Choice Questions (MCQs) to test knowledge on the content below:

------------
{text}
------------

Choose the correct answer for each question:

1. [Question 1]
   a) Option A
   b) Option B
   c) Option C
   d) Option D

2. [Question 2]
   a) Option A
   b) Option B
   c) Option C
   d) Option D

...
N. [Question N]
   a) Option A
   b) Option B
   c) Option C
   d) Option D

Provide the correct answer (e.g., "1b", "2c", etc.) along with the corresponding letter for each question.

"""
)
PROMPT_QUESTIONS = PromptTemplate(template=prompt_template, input_variables=["text"])

refine_template = ("""
You are refining Multiple Choice Questions (MCQs) based on the provided context:

------------
{text}
------------

Refine the questions and answer choices, ensuring that one option is correct for each question.

QUESTIONS:

1. [Question 1]
   a) Option A
   b) Option B
   c) Option C
   d) Option D

   Correct Answer: [Correct Option]

2. [Question 2]
   a) Option A
   b) Option B
   c) Option C
   d) Option D

   Correct Answer: [Correct Option]

...
N. [Question N]
   a) Option A
   b) Option B
   c) Option C
   d) Option D

   Correct Answer: [Correct Option]

"""

)
REFINE_PROMPT_QUESTIONS = PromptTemplate(
    input_variables=["existing_answer", "text"],
    template=refine_template,
)
#question generation
ques_gen_chain = load_summarize_chain(llm = llm_ques_gen_pipeline,
                                          chain_type = "refine",
                                          verbose = True,
                                          question_prompt=PROMPT_QUESTIONS,
                                          refine_prompt=REFINE_PROMPT_QUESTIONS)

ques = ques_gen_chain.run(document_ques_gen)
print(ques)


embeddings = OpenAIEmbeddings()

vector_store = FAISS.from_documents(document_answer_gen, embeddings)
#generate answer
llm_answer_gen = ChatOpenAI(temperature=0.1, model="gpt-3.5-turbo")

ques_list = ques.split("\n")

ques_list

answer_generation_chain = RetrievalQA.from_chain_type(llm=llm_answer_gen,
                                               chain_type="stuff",
                                               retriever=vector_store.as_retriever())




qa_pairs = []

# Answer each question and save to a file
for question in ques_list:
    print("Question and Answer: ", question)
    answer = answer_generation_chain.run(question)
    # Append question-answer pair to the list
    qa_pairs.append({'Question and Answer': question, 'Answer': answer})

# Create a DataFrame from the list of question-answer pairs
df = pd.DataFrame(qa_pairs)

# Display the DataFrame
df

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
goods
costing
the
same
in
different
countries.
It
"seeks
to
make
exchange-rate
theory
a
bit
more
digestible."
The
index
compares
the
relative
price
worldwide
to
purchase
the
Big
Mac,
a
hamburger
sold
at
McDonald's
restaurants
------------

Refine the questions and answer choices, ensuring that one option is correct for each question.

QUESTIONS:

1. [Question 1]
   a) Option A
   b) Option B
   c) Option C
   d) Option D

   Correct Answer: [Correct Option]

2. [Question 2]
   a) Option A
   b) Option B
   c) Option C
   d) Option D

   Correct Answer: [Correct Option]

...
N. [Question N]
   a) Option A
   b) Option B
   c) Option C
   d) Option D

   Correct Answer: [Correct Option]

[0m

[1m> Finished chain.[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
You are refining Multiple Choice Questions (MCQs) based on the provided context:

------------
the
Big
Mac,
a
hamburger
sold
a

Unnamed: 0,Question,Answer
0,1. What is the average commute time in Mexico ...,The average commute time in Mexico City is 78....
1,a) 78.4 min,"In Mexico City, it took an average of 78.4 min..."
2,b) 66.7 min,The average working time required to buy one B...
3,c) 62.5 min,"I'm sorry, but your question seems to be incom..."
4,d) 54.7 min,The average working time required to buy one B...
5,,"I'm sorry, but it seems like your message is i..."
6,Correct Answer: a) 78.4 min,The average working time required to buy one B...
7,,"I'm sorry, but I don't have enough context to ..."
8,2. Which city has the shortest average commute...,The city with the shortest average commute tim...
9,a) Mexico City,The average time it takes to earn a Big Mac in...


In [28]:
# Define the file path for the Excel file
excel_file_path = "/content/drive/MyDrive/Intellify/MCQ_question_answer.xlsx"

# Convert DataFrame to Excel and save
df.to_excel(excel_file_path, index=False)

print("DataFrame successfully saved to Excel file:", excel_file_path)


DataFrame successfully saved to Excel file: /content/drive/MyDrive/Intellify/MCQ_question_answer.xlsx
