In [None]:
# -*- coding: utf-8 -*-
"""LLM_Stopping_Criteria.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1MbsNP41fMrseoag5vEVn0LnfOJAwpay6
"""

In [None]:
!pip install -Uqqq pip --progress-bar off
!pip install -qqq bitsandbytes==0.40.0 --progress-bar off
!pip install -qqq torch==2.0.1 --progress-bar off
!pip install -qqq transformers==4.30.0 --progress-bar off
!pip install -qqq accelerate==0.21.0 --progress-bar off
!pip install -qqq xformers==0.0.20 --progress-bar off
!pip install -qqq einops==0.6.1 --progress-bar off
!pip install -qqq langchain==0.0.233 --progress-bar off

In [None]:
import re
import warnings
from typing import List

import torch
from langchain import PromptTemplate
from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.llms import HuggingFacePipeline
from langchain.schema import BaseOutputParser
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
    pipeline,
)

warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
# Authenticate with the Hugging Face Hub. The token is entered interactively in
# the widget shown by notebook_login(); never paste an access token into the
# notebook source.
# NOTE(review): an access token was previously stored in a comment in this
# cell. Treat it as compromised and revoke it in the Hugging Face settings.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Checkpoint identifier handed to `from_pretrained` (also reused for the tokenizer).
MODEL_NAME = "falcon-7B-instruct-300steps-merged"

# Load the causal LM with 8-bit weights and automatic device placement, then
# put it in evaluation mode. `trust_remote_code=True` allows model code shipped
# with the checkpoint to be executed, so only use it with trusted checkpoints.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    load_in_8bit=True,
    trust_remote_code=True,
).eval()

In [None]:
# Load the tokenizer from the same checkpoint identifier as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Adjust the model's own generation config. This is the same object as
# `model.generation_config`, so the overrides below mutate it in place.
# NOTE(review): temperature=0 presumably aims at deterministic (greedy) output;
# confirm it is accepted by the pinned transformers version when sampling is off.
generation_config = model.generation_config
for _option, _value in {
    "temperature": 0,
    "num_return_sequences": 1,
    "max_new_tokens": 256,
    "use_cache": False,
    "repetition_penalty": 1.7,
    # The EOS token doubles as the padding token.
    "pad_token_id": tokenizer.eos_token_id,
    "eos_token_id": tokenizer.eos_token_id,
}.items():
    setattr(generation_config, _option, _value)

# Display the resulting configuration as the cell output.
generation_config

## Try the Model

In [None]:
# Minimal single-turn conversation prompt used to probe the raw model output.
prompt = """
The following is a friendly conversation between a human and an AI. The AI is
talkative and provides lots of specific details from its context.

Current conversation:

H: Who is Dwight K Schrute?
AI:
""".strip()

# Tokenize the prompt and move the ids to the device the model lives on.
input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to(model.device)

# Generate without autograd bookkeeping, using the config tuned above.
with torch.inference_mode():
    outputs = model.generate(
        generation_config=generation_config,
        input_ids=input_ids,
    )

# Decode the first (and only requested) sequence back to text and show it.
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

## Stop the LLM From Rambling

In [None]:
class StopGenerationCriteria(StoppingCriteria):
    """Stop generation once the sequence ends with one of the stop sequences.

    Args:
        tokens: Stop sequences, each given as a list of token strings that is
            mapped to ids with ``tokenizer.convert_tokens_to_ids``.
        tokenizer: Tokenizer used to convert the token strings to ids.
        device: Device on which the stop-id tensors are created.

    Note:
        Only the first sequence of the batch (``input_ids[0]``) is inspected.
    """

    def __init__(
        self, tokens: List[List[str]], tokenizer: AutoTokenizer, device: torch.device
    ):
        stop_token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]
        self.stop_token_ids = [
            torch.tensor(x, dtype=torch.long, device=device) for x in stop_token_ids
        ]

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
    ) -> bool:
        """Return True when the first sequence ends with any stop sequence."""
        generated = input_ids[0]
        for stop_ids in self.stop_token_ids:
            stop_len = len(stop_ids)
            # An empty stop sequence would slice the whole input (`[-0:]`) and
            # `.all()` over an empty comparison is True, stopping immediately.
            # A sequence shorter than the stop sequence cannot end with it, and
            # comparing the mismatched shapes would broadcast or raise.
            if stop_len == 0 or generated.shape[-1] < stop_len:
                continue
            # With `device_map="auto"` the ids may live on a different device
            # than the one given at construction; `.to` is a no-op otherwise.
            tail = generated[-stop_len:]
            if torch.eq(tail, stop_ids.to(tail.device)).all():
                return True
        return False

In [None]:
# Token sequences that, once generated, should end the model's turn.
stop_tokens = [["Human", ":"], ["AI", ":"]]

# Wrap the custom criterion in the list type expected by `generate`/`pipeline`.
stopping_criteria = StoppingCriteriaList()
stopping_criteria.append(
    StopGenerationCriteria(
        tokens=stop_tokens, tokenizer=tokenizer, device=model.device
    )
)

In [None]:
# Text-generation pipeline that applies the tuned generation config and the
# custom stopping criteria. `return_full_text=True` keeps the prompt in the
# returned text.
generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    generation_config=generation_config,
    stopping_criteria=stopping_criteria,
    return_full_text=True,
)

In [None]:
# Expose the Hugging Face pipeline through LangChain's LLM interface.
llm = HuggingFacePipeline(pipeline=generation_pipeline)

In [None]:
# Call the LangChain LLM directly with the raw prompt and print the result.
# Note: this expects `prompt` to still be the plain string defined earlier; a
# later cell rebinds `prompt` to a PromptTemplate, so run cells top to bottom.
res = llm(prompt)
print(res)

## Conversation Chain

In [None]:
# Build a ConversationChain with only the LLM supplied (library defaults for
# everything else) and print the prompt template it uses.
chain = ConversationChain(llm=llm)
print(chain.prompt.template)

## Custom Prompt

In [None]:
# Custom conversation prompt: `{history}` is filled from memory under the key
# "history", `{input}` with the current user message.
template = """
The following is a conversation between a human and an AI.

Current conversation:
{history}
Human: {input}
AI:""".strip()

# Note: this rebinds `prompt` (previously a plain string) to a PromptTemplate.
prompt = PromptTemplate(input_variables=["history", "input"], template=template)

# Windowed conversation memory configured with k=6.
# NOTE(review): `return_only_outputs` is not a known field of
# ConversationBufferWindowMemory and is presumably ignored — confirm and drop it.
memory = ConversationBufferWindowMemory(
    memory_key="history", k=6, return_only_outputs=True
)

# verbose=True makes the chain log the formatted prompt on each call.
chain = ConversationChain(llm=llm, memory=memory, prompt=prompt, verbose=True)

In [None]:
# Send one user message through the conversation chain and print the reply.
text = "how to create a molecule using rdkit"
res = chain.predict(input=text)
print(res)

## Cleaning Output

In [None]:
class CleanupOutputParser(BaseOutputParser):
    """Output parser that removes dialogue-turn markers from generated text."""

    def parse(self, text: str) -> str:
        """Delete every "\\nUser", "\\nHuman:" and "\\nAI:" marker, then strip.

        The markers are removed one pattern after another, in that order, and
        the result has leading and trailing whitespace stripped.
        """
        for marker_pattern in (r"\nUser", r"\nHuman:", r"\nAI:"):
            text = re.sub(marker_pattern, "", text)
        return text.strip()

    @property
    def _type(self) -> str:
        """Type identifier string reported by this parser."""
        return "output_parser"

In [None]:
# Start from a fresh conversation memory so earlier turns do not leak in.
memory = ConversationBufferWindowMemory(
    k=6, memory_key="history", return_only_outputs=True
)

# Same conversation chain as before, now routing the model output through
# CleanupOutputParser before it is returned.
chain = ConversationChain(
    prompt=prompt,
    llm=llm,
    memory=memory,
    output_parser=CleanupOutputParser(),
    verbose=True,
)

In [None]:
# Call the chain object directly; unlike `predict`, this returns a mapping of
# outputs (its keys and the "response" entry are inspected in the next cells).
text = """
how to create a molecule using rdkit
""".strip()
res = chain(text)

In [None]:
# Show which entries the chain call returned.
res.keys()

In [None]:
# Print the chain's reply stored under the "response" key.
print(res["response"])