In [1]:
!pip install -qU langchain tiktoken einops langchain-openai wikipedia instructor

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.0/817.0 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m67.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m69.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.8/250.8 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.3/62.3 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.7/226.7 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m5.6 MB/s

In [2]:
import os
import json

from typing import List, Dict
from pprint import pprint

import torch
import transformers
import instructor

import numpy as np
from transformers import pipeline, AutoTokenizer
from pydantic import BaseModel as PydanticBaseModel, validator, root_validator
from transformers import StoppingCriteria, StoppingCriteriaList
from torch import cuda, bfloat16
from langchain import PromptTemplate
from langchain.llms import HuggingFacePipeline
from langchain.schema import (
    HumanMessage,
    SystemMessage,
)
from langchain_community.document_loaders import WikipediaLoader
from langchain_community.chat_models.huggingface import ChatHuggingFace
from langchain_community.llms import HuggingFaceHub
from langchain.output_parsers import PydanticOutputParser, StructuredOutputParser
from langchain.schema.output_parser import StrOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.utils.function_calling import convert_to_openai_tool
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import LLMChain
from langchain.schema.runnable import RunnablePassthrough


# SECURITY: an access token used to be hard-coded on this line. Treat that token
# as leaked (revoke it in the Hugging Face account settings) and never commit
# credentials to a notebook. The token is now taken from the environment, with an
# interactive prompt as fallback, and stored the same way as before.
from getpass import getpass
from huggingface_hub.hf_api import HfFolder

_hf_token = (
    os.environ.get('HUGGINGFACEHUB_API_TOKEN')
    or os.environ.get('HF_TOKEN')
    or getpass('Hugging Face token: ')
)
# Export it as well so later cells that read HUGGINGFACEHUB_API_TOKEN find it.
os.environ['HUGGINGFACEHUB_API_TOKEN'] = _hf_token
HfFolder.save_token(_hf_token)

In [4]:
# LangSmith tracing configuration plus the API credentials used by the notebook.
# Secrets are never written into the notebook: a value already present in the
# environment is kept, otherwise the user is prompted (input is not echoed).
from getpass import getpass

os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
for _secret_name in ('LANGCHAIN_API_KEY', 'HUGGINGFACEHUB_API_TOKEN'):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f'{_secret_name}: ')

In [5]:
# Hugging Face Hub repo used for generation; the tokenizer is taken from the same repo.
LLM_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
LLM_TOKENIZER = LLM_MODEL

# LOAD WIKI DATA

In [6]:
# Load Document: fetch the Wikipedia page(s) returned for the query "Aristophanes".
loader = WikipediaLoader(query= "Aristophanes")

# NOTE(review): presumably intended to turn off TLS certificate verification.
# It is not visible from this notebook whether WikipediaLoader reads
# `requests_kwargs` at all -- confirm, and drop the line if it has no effect
# (disabling verification is also a security risk if it does).
loader.requests_kwargs = {"verify": False}

# `document` is the list of loaded pages; only the first one is previewed here.
document = loader.load()
pprint(document[0].page_content[:200])



  lis = BeautifulSoup(html).find_all('li')


('Aristophanes (; Ancient Greek: Ἀριστοφάνης, pronounced [aristopʰánɛːs]; '
 'c.\u2009446 – c.\u2009386 BC), son of Philippus and Zenodora, of the deme '
 'Kydathenaion (Latin: Cydathenaeum), was a comic playwright or co')


In [7]:
# Split the loaded pages into 500-character chunks that overlap by 100 characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=500,
)
splits = text_splitter.split_documents(document)

# SETUP GENERATOR AGENTS

## IF WE USE A MODEL LOCALLY

In [None]:
# Use the current CUDA device when one is available, otherwise run on the CPU.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# Load the causal LM weights in bfloat16.
# NOTE(review): trust_remote_code=True allows code shipped in the Hub repo to be
# executed locally -- confirm it is actually required for LLM_MODEL.
model = transformers.AutoModelForCausalLM.from_pretrained(LLM_MODEL, trust_remote_code=True, torch_dtype=bfloat16)
# Inference only: switch to evaluation mode, then move the model to the device.
model.eval()
model.to(device)

# Load the tokenizer from the dedicated LLM_TOKENIZER setting (previously the
# model id was reused here and the tokenizer constant was never read).
tokenizer = transformers.AutoTokenizer.from_pretrained(LLM_TOKENIZER)

# mistral is trained to add "</s>" at the end of generations; these ids are what
# the stopping criterion below looks for.
stop_token_ids = tokenizer.convert_tokens_to_ids(["</s>"])

# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
  """Stopping criterion that fires once the newest token is a stop token.

  Reads the module-level ``stop_token_ids`` and inspects only the first
  sequence of the batch (``input_ids[0]``).
  """

  def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
    newest_token = input_ids[0][-1]
    # any() short-circuits on the first matching id and yields False when none match.
    return any(newest_token == stop_id for stop_id in stop_token_ids)

# Wrap the custom criterion in the list type that generation expects.
stopping_criteria = StoppingCriteriaList([StopOnTokens()])

generate_text = transformers.pipeline(
    # --- what to run and where ---
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    device=device,
    # --- output shape ---
    return_full_text=True,  # langchain expects the prompt to be included in the text
    max_new_tokens=512,  # upper bound on the number of generated tokens
    stopping_criteria=stopping_criteria,  # stop at the end-of-sequence token instead of rambling
    # --- sampling ---
    do_sample=True,
    temperature=0.1,  # low temperature: little randomness in the output
    top_p=0.15,  # sample only from the top tokens whose probabilities add up to 15%
    top_k=0,  # zero, so token selection relies on top_p
    repetition_penalty=1.1,  # without this the output begins repeating
)

# Pass-through prompt: the whole instruction is supplied by the caller.
prompt = PromptTemplate(template="{instruction}", input_variables=["instruction"])
# Expose the local pipeline through LangChain's LLM and chat-model interfaces.
llm = HuggingFacePipeline(pipeline=generate_text)
chat_model = ChatHuggingFace(llm=llm)

## IF WE USE A MODEL FROM THE HUB

In [8]:
# Hosted alternative to the local pipeline: running this cell rebinds `llm` and
# `chat_model`, replacing the local versions if that cell was executed.
hub_generation_kwargs = {
    "max_new_tokens": 768,
    "top_p": 0.15,
    "temperature": 0.1,
    "repetition_penalty": 1.1,
}
llm = HuggingFaceHub(
    task="text-generation",
    repo_id=LLM_MODEL,
    model_kwargs=hub_generation_kwargs,
)

# Chat wrapper around the hosted endpoint.
chat_model = ChatHuggingFace(llm=llm)

  warn_deprecated(
  warn_deprecated(
                    repo_id was transferred to model_kwargs.
                    Please confirm that repo_id is what you intended.
                    task was transferred to model_kwargs.
                    Please confirm that task is what you intended.
                    huggingfacehub_api_token was transferred to model_kwargs.
                    Please confirm that huggingfacehub_api_token is what you intended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

## RAG CHAIN FOR CONTEXT-QUESTIONS GENERATION

In [9]:
# Output schema shown to the LLM. The class docstring and the Field descriptions
# end up in the generated JSON schema / format instructions (see the printed
# output below), so they are prompt text: change them only to change the prompt.
# NOTE: the name QA is rebound by later cells, so the notebook must be run in order.
class QA(BaseModel):
    """Generate 4 questions based on the given context about Aristophanes."""

    question1: str = Field(description="Question 1")
    question2: str = Field(description="Question 2")
    question3: str = Field(description="Question 3")
    question4: str = Field(description="Question 4")

# Inspection only: print the OpenAI-tool style JSON schema derived from QA.
pprint(json.dumps(convert_to_openai_tool(QA), indent=2))

('{\n'
 '  "type": "function",\n'
 '  "function": {\n'
 '    "name": "QA",\n'
 '    "description": "Generate 4 questions based on the given context about '
 'Aristophanes.",\n'
 '    "parameters": {\n'
 '      "type": "object",\n'
 '      "properties": {\n'
 '        "question1": {\n'
 '          "description": "Question 1",\n'
 '          "type": "string"\n'
 '        },\n'
 '        "question2": {\n'
 '          "description": "Question 2",\n'
 '          "type": "string"\n'
 '        },\n'
 '        "question3": {\n'
 '          "description": "Question 3",\n'
 '          "type": "string"\n'
 '        },\n'
 '        "question4": {\n'
 '          "description": "Question 4",\n'
 '          "type": "string"\n'
 '        }\n'
 '      },\n'
 '      "required": [\n'
 '        "question1",\n'
 '        "question2",\n'
 '        "question3",\n'
 '        "question4"\n'
 '      ]\n'
 '    }\n'
 '  }\n'
 '}')


In [10]:
# Bind the parser to the schema class QA as currently defined; QA is rebound to a
# different model in a later cell, so this cell has to run before that one.
pydantic_parser = PydanticOutputParser(pydantic_object=QA)
# Prompt fragment describing the expected JSON output; printed for inspection.
format_instructions = pydantic_parser.get_format_instructions()
pprint(format_instructions)

('The output should be formatted as a JSON instance that conforms to the JSON '
 'schema below.\n'
 '\n'
 'As an example, for the schema {"properties": {"foo": {"title": "Foo", '
 '"description": "a list of strings", "type": "array", "items": {"type": '
 '"string"}}}, "required": ["foo"]}\n'
 'the object {"foo": ["bar", "baz"]} is a well-formatted instance of the '
 'schema. The object {"properties": {"foo": ["bar", "baz"]}} is not '
 'well-formatted.\n'
 '\n'
 'Here is the output schema:\n'
 '```\n'
 '{"description": "Generate 4 questions based on the given context about '
 'Aristophanes.", "properties": {"question1": {"title": "Question1", '
 '"description": "Question 1", "type": "string"}, "question2": {"title": '
 '"Question2", "description": "Question 2", "type": "string"}, "question3": '
 '{"title": "Question3", "description": "Question 3", "type": "string"}, '
 '"question4": {"title": "Question4", "description": "Question 4", "type": '
 '"string"}}, "required": ["question1", "qu

In [11]:
# Prompt text for question generation. Each piece below is one line of the
# prompt; the blank lines between sections are spelled out as explicit "\n".
synth_template = (
    "You are an expert in ancient greek comedy and Aristophanes.\n"
    "\n{format_instructions}\n"
    "\nOnly provide the final JSON formatted information.\n"
    "\nContext: {context}\n"
)

# {format_instructions} is filled in once here; {context} is supplied per call.
synth_prompt = PromptTemplate(
    input_variables=["context"],
    partial_variables={"format_instructions": pydantic_parser.get_format_instructions()},
    template=synth_template,
)

In [12]:
def extract_json_str(res: str) -> str:
  """Return the contents of the first Markdown code fence in ``res``.

  Text without any fence is returned unchanged. Otherwise everything up to and
  including the first opening fence is dropped, an optional ``json`` language
  tag line is removed, and the text is cut at the closing fence.

  The closing fence is now stripped even when the block carries no ``json``
  tag, or when the fence is not preceded by a newline; previously such
  responses kept the trailing fence and could not be parsed downstream.

  Args:
    res: Raw model response.

  Returns:
    The fenced payload (it may start with a newline when the fence had no
    language tag), or ``res`` itself when it contains no fence.
  """
  fence = "```"
  if fence not in res:
    return res
  res = res.partition(fence)[2]
  # Drop the language tag line of a json-tagged fence.
  if "json\n" in res:
    res = res.partition("json\n")[2]
  # Prefer a closing fence on its own line so the payload keeps no trailing newline.
  if "\n" + fence in res:
    res = res.partition("\n" + fence)[0]
  elif fence in res:
    res = res.partition(fence)[0]
  return res

In [13]:
# chain: input text -> prompt -> chat model -> plain string -> code-fence stripping.
# The chain is invoked with a single string, which RunnablePassthrough forwards
# unchanged as the {context} prompt variable.
# It is assembled from the synth_prompt / extract_json_str objects that exist when
# this cell runs, so it must be executed after the cells that define them.
qa_generator = (
    {"context": RunnablePassthrough(verbose=True)}
    | synth_prompt
    | chat_model
    | StrOutputParser()
    | extract_json_str
)

In [14]:
class QA(PydanticBaseModel):
  """Generate questions based on the given context."""

  # Record kept for every chunk: the source text plus four generated questions.
  context: str
  question1: str
  question2: str
  question3: str
  question4: str

  @validator("*")
  def ensure_not_empty(cls, value):
      """Reject empty or whitespace-only values in any field.

      Raises:
        ValueError: if the value contains no non-whitespace characters.
      """
      if len(value.strip()) == 0:
        raise ValueError("Not a valid question")
      return value

  @validator("question1", "question2", "question3", "question4")
  def ensure_valid_question(cls, value):
      """Reject placeholder output such as "Question 1" or "Answer: ...".

      Applied to the question fields only. The prefix check used to run on
      ``context`` as well, which discarded legitimate source chunks that
      merely begin with the word "Question" or "Answer".

      Raises:
        ValueError: if the value starts with "question" or "answer"
          (case-insensitive).
      """
      if value.lower().startswith(("question", "answer")):
        raise ValueError("Not a valid question")
      return value

<ipython-input-14-07270187b2aa>:10: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.6/migration/
  @validator("*")


In [None]:
# Generate four questions for each sufficiently long chunk.
#   res            -> validated QA records
#   non_parsed_res -> (raw model output, exception) for every chunk that failed
import ast  # stdlib; only needed for the literal fallback below

res = []
non_parsed_res = []
for split_doc in splits[:100]:
  chunk_text = split_doc.page_content
  # Chunks of 100 characters or fewer are skipped.
  if len(chunk_text) <= 100:
    continue
  question_set = qa_generator.invoke(chunk_text)
  try:
    # SECURITY: the model output is untrusted text and used to be passed to
    # eval(), which executes arbitrary expressions. It is now parsed as data
    # only: strict JSON first, then a Python literal (e.g. single-quoted dicts).
    try:
      questions = json.loads(question_set)
    except ValueError:
      questions = ast.literal_eval(question_set)
    # The questions are read from the "description" slot of each property,
    # i.e. the response is expected to mirror the schema layout.
    properties = questions['properties']
    res.append(
        QA(context=chunk_text,
           question1=properties['question1']['description'],
           question2=properties['question2']['description'],
           question3=properties['question3']['description'],
           question4=properties['question4']['description'])
        )
  except Exception as e:
    # Best effort: keep the raw output and the error for later inspection.
    non_parsed_res.append((question_set, e))

In [16]:
# Number of chunks that produced a validated QA record.
len(res)

53

In [17]:
# Persist the validated records as a JSON array of plain dicts.
with open("./question_context.json", mode="w") as file:
  serialized_records = json.dumps([record.dict() for record in res])
  file.write(serialized_records)

## RAG CHAIN FOR QUESTION-ANSWER GENERATION

In [18]:
# Output schema for the question/answer prompt. This rebinds the name QA. The
# class docstring and Field descriptions are emitted into the JSON schema /
# format instructions (see the printed output below), so they are prompt text.
class QA(BaseModel):
    """Generate a question and its answer based on the given context about Aristophanes."""

    question: str = Field(description="Question")
    answer: str = Field(description="Answer")

# Inspection only: print the OpenAI-tool style JSON schema derived from QA.
pprint(json.dumps(convert_to_openai_tool(QA), indent=2))

('{\n'
 '  "type": "function",\n'
 '  "function": {\n'
 '    "name": "QA",\n'
 '    "description": "Generate a question and its answer based on the given '
 'context about Aristophanes.",\n'
 '    "parameters": {\n'
 '      "type": "object",\n'
 '      "properties": {\n'
 '        "question": {\n'
 '          "description": "Question",\n'
 '          "type": "string"\n'
 '        },\n'
 '        "answer": {\n'
 '          "description": "Answer",\n'
 '          "type": "string"\n'
 '        }\n'
 '      },\n'
 '      "required": [\n'
 '        "question",\n'
 '        "answer"\n'
 '      ]\n'
 '    }\n'
 '  }\n'
 '}')


In [19]:
# Rebuild the parser for the question/answer schema; it captures QA as currently
# bound, so this cell has to run before QA is redefined again further down.
pydantic_parser = PydanticOutputParser(pydantic_object=QA)
# Prompt fragment describing the expected JSON output; printed for inspection.
format_instructions = pydantic_parser.get_format_instructions()
pprint(format_instructions)

('The output should be formatted as a JSON instance that conforms to the JSON '
 'schema below.\n'
 '\n'
 'As an example, for the schema {"properties": {"foo": {"title": "Foo", '
 '"description": "a list of strings", "type": "array", "items": {"type": '
 '"string"}}}, "required": ["foo"]}\n'
 'the object {"foo": ["bar", "baz"]} is a well-formatted instance of the '
 'schema. The object {"properties": {"foo": ["bar", "baz"]}} is not '
 'well-formatted.\n'
 '\n'
 'Here is the output schema:\n'
 '```\n'
 '{"description": "Generate a question and its answer based on the given '
 'context about Aristophanes.", "properties": {"question": {"title": '
 '"Question", "description": "Question", "type": "string"}, "answer": '
 '{"title": "Answer", "description": "Answer", "type": "string"}}, "required": '
 '["question", "answer"]}\n'
 '```')


In [20]:
# Prompt text for question/answer generation. Each piece below is one line of the
# prompt; the blank lines between sections are spelled out as explicit "\n".
# The "Only provide the final JSON formatted information." line is deliberately
# left out of this prompt.
synth_template = (
    "You are an expert in ancient greek comedy and Aristophanes.\n"
    "Generate a question and its answer based on the given context.\n"
    "\n{format_instructions}\n"
    "\nContext: {context}\n"
)

# {format_instructions} is filled in once here; {context} is supplied per call.
synth_prompt = PromptTemplate(
    input_variables=["context"],
    partial_variables={"format_instructions": pydantic_parser.get_format_instructions()},
    template=synth_template,
)

In [21]:
def extract_json_str(res: str) -> str:
  """Return the text that follows the instruction terminator in ``res``.

  The marker followed by a space is tried first, then the bare marker; the cut
  is made at the first occurrence of whichever variant is found. A string that
  contains no marker is returned unchanged.
  """
  for marker in ("[/INST] ", "[/INST]"):
    _, found, remainder = res.partition(marker)
    if found:
      return remainder
  return res

In [22]:
# chain: input text -> prompt -> chat model -> plain string -> removal of everything
# up to the "[/INST]" marker.
# The chain is invoked with a single string, which RunnablePassthrough forwards
# unchanged as the {context} prompt variable.
# It is assembled from the synth_prompt / extract_json_str objects that exist when
# this cell runs, so it must be executed after the cells that redefine them.
qa_generator = (
    {"context": RunnablePassthrough(verbose=True)}
    | synth_prompt
    | chat_model
    | StrOutputParser()
    | extract_json_str
)

In [23]:
class QA(PydanticBaseModel):
  """Generate questions based on the given context."""

  # Record kept for every chunk: the source text plus one question/answer pair.
  context: str
  question: str
  answer: str

  @validator("*")
  def ensure_not_empty(cls, value):
      """Reject empty or whitespace-only values in any field.

      Raises:
        ValueError: if the value contains no non-whitespace characters.
      """
      if len(value.strip()) == 0:
        raise ValueError("Not a valid question - answer pair")
      return value

  @validator("question", "answer")
  def ensure_valid_question(cls, value):
      """Reject placeholder output such as "Question" or "Answer: ...".

      Applied to the generated fields only. The prefix check used to run on
      ``context`` as well, which discarded legitimate source chunks that
      merely begin with the word "Question" or "Answer".

      Raises:
        ValueError: if the value starts with "question" or "answer"
          (case-insensitive).
      """
      if value.lower().startswith(("question", "answer")):
        raise ValueError("Not a valid question - answer pair")
      return value

<ipython-input-23-2adc5365cd55>:8: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.6/migration/
  @validator("*")


In [24]:
# Generate one question/answer pair for each sufficiently long chunk.
#   res            -> validated QA records
#   non_parsed_res -> (model output, exception) for every chunk that failed
import ast  # stdlib; only needed for the literal fallback below

res = []
non_parsed_res = []
for split_doc in splits[:200]:
  chunk_text = split_doc.page_content
  # Chunks of 100 characters or fewer are skipped.
  if len(chunk_text) <= 100:
    continue
  question_set = qa_generator.invoke(chunk_text)
  try:
    # SECURITY: the model output is untrusted text and used to be passed to
    # eval(), which executes arbitrary expressions. It is now parsed as data
    # only: strict JSON first, then a Python literal (e.g. single-quoted dicts).
    try:
      question_set = json.loads(question_set)
    except ValueError:
      question_set = ast.literal_eval(question_set)
    # Attach the source chunk so the record is self-contained.
    question_set["context"] = chunk_text
    res.append(QA(**question_set))
  except Exception as e:
    # Best effort: keep the output and the error for later inspection.
    non_parsed_res.append((question_set, e))

In [25]:
# Number of chunks that produced a validated question/answer record.
len(res)

105

In [27]:
# Persist the validated question/answer records as a JSON array of plain dicts.
with open("./question_answer_pairs.json", mode="w") as file:
  serialized_records = json.dumps([record.dict() for record in res])
  file.write(serialized_records)