In [33]:
import os
import asyncio
from tqdm import tqdm
from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline

import re
from langchain.chains import LLMChain
import torch


from typing import Any, Dict, List, Mapping, Optional

import requests

from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM
from langchain.llms.utils import enforce_stop_tokens
from langchain.pydantic_v1 import Extra, root_validator
from langchain.utils import get_from_dict_or_env

VALID_TASKS = ("text2text-generation", "text-generation", "summarization")


class TransformersBatchInference(LLM):

    endpoint_url: str = ""
    """Endpoint URL to use."""

    model_kwargs: Optional[dict] = None
    """Key word arguments to pass to the model."""

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        _model_kwargs = self.model_kwargs or {}
        return {
            **{"endpoint_url": self.endpoint_url},
            **{"model_kwargs": _model_kwargs},
        }

    @property
    def _llm_type(self) -> str:
        """Return type of llm."""
        return "huggingface_endpoint"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Call out to HuggingFace Hub's inference endpoint.

        Args:
            prompt: The prompt to pass into the model.
            stop: Optional list of stop words to use when generating.

        Returns:
            The string generated by the model.

        Example:
            .. code-block:: python

                response = hf("Tell me a joke.")
        """
        _model_kwargs = self.model_kwargs or {}

        # payload samples
        params = {**_model_kwargs, **kwargs}
        parameter_payload = {"inputs": prompt, "parameters": params}

        # HTTP headers for authorization
        headers = {
            "Content-Type": "application/json",
        }

        try:
            response = requests.post(
                self.endpoint_url, headers=headers, json=parameter_payload
            )
        except requests.exceptions.RequestException as e:  # This is the correct syntax
            raise ValueError(f"Error raised by inference endpoint: {e}")
       
        generated_text = response.json()
        if "error" in generated_text:
            raise ValueError(
                f"Error raised by inference API: {generated_text['error']}"
            )
        
        text = generated_text[0]["generated_text"]
        if stop is not None:
            # This is a bit hacky, but I can't figure out a better way to enforce
            # stop tokens when making calls to huggingface_hub.
            text = enforce_stop_tokens(text, stop)
        return text


llm = TransformersBatchInference(endpoint_url="http://localhost:30091/v1/generation")

In [28]:
examples = ["What is concious thinking?",
     "How do you know if you are concious?",
     "What is reality?", 
     "When will the world end?",
     "Why is the sky blue?",
     "When is the next world war?",
     "What is a black hole?",
     "What is a quark?",
     "What is a photon?",
     "What is a gluon?"
     "Is there a god?",
     "What is the meaning of life?",
     "What is the meaning of death?",
     "What is the meaning of conciousness?",
     "What is the meaning of reality?",
     "What is the meaning of existence?",
     "What is the meaning of the universe?",
     "What is the meaning of the multiverse?",
     "When does the universe end?",
     "What is the universe expanding into?"]

In [30]:
responses = []

for i in tqdm(examples, total=len(examples)):
    responses.append(await llm.agenerate(["What is the purpose of life?"], 
        max_length = 300, 
        top_p = 0.95, 
        top_k = 50, 
        do_sample = True, 
        num_return_sequences = 1, 
        temperature = 0.4, 
        repetition_penalty = 1.2))

100%|██████████| 19/19 [02:25<00:00,  7.65s/it]


In [32]:
print(responses)

generations=[[Generation(text='What is the purpose of life?\n- How can I be a better person and make a positive impact on others?\n- Is there any way to transcend my limitations or overcome suffering?\n\nThese questions have been asked by people throughout history, across cultures and religions. While different traditions may offer unique answers, they all share a common goal: to help us understand our place in the world, find meaning and purpose, and ultimately achieve some form of enlightenment or liberation from suffering.')]] llm_output=None run=[RunInfo(run_id=UUID('85a23698-4747-474c-9e04-98e42aa42f98'))]


In [34]:
calls = []

for i in tqdm(examples, total=len(examples)):
    calls.append(llm.agenerate(["What is the purpose of life?"], 
        max_length = 300, 
        top_p = 0.95, 
        top_k = 50, 
        do_sample = True, 
        num_return_sequences = 1, 
        temperature = 0.4, 
        repetition_penalty = 1.2))

reponses_batch = await asyncio.gather(*calls)

100%|██████████| 19/19 [00:00<00:00, 196769.82it/s]
