### A custom chat model can be built using the <b><u>BaseChatModel interface</u></b>
#### We will need to implement the following properties or methods.
* _generate (method) (required) : Used to generate a chat result from a prompt
* _llm_type (property) (required) : Used to uniquely identify the type of the model. <b>Used for logging</b>
* _stream, _agenerate, _astream (method) (optional) 

#### A custom chat model that echoes the first `n` characters of the input.

In [39]:
from typing import Any, Iterator, List, Optional, Dict
from langchain_core.callbacks import CallbackManagerForLLMRun
from langchain_core.language_models import BaseChatModel
from langchain_core.messages import BaseMessage, HumanMessage, AIMessageChunk, AIMessage
from langchain_core.outputs import ChatGeneration, ChatResult, ChatGenerationChunk

"""
The _generate method returns a ChatResult.
A ChatResult wraps a list of ChatGeneration objects.
Each ChatGeneration holds the generated message (an AIMessage).
"""

class CustomModel(BaseChatModel):
    """A custom chat model that echoes the first `n` characters of the input.

    Only the last message of the prompt is looked at; every earlier message
    is ignored. No external service is called.
    """

    model_name: str
    """Name of the model."""

    n: int
    """The number of characters from the last message of the prompt to be echoed."""

    def _echoed_text(self, messages: List[BaseMessage]) -> Any:
        """Return the first `n` items of the last message's content.

        Shared by `_generate` and `_stream` so both echo exactly the same text.

        Raises:
            IndexError: if `messages` is empty.
        """
        # NOTE(review): assumes the content is a plain string; list-style
        # (multi-part) content would be sliced by item, not by character.
        return messages[-1].content[: self.n]

    def _generate(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> ChatResult:
        """Build the full (non-streaming) response in a single step.

        Args:
            messages: the prompt composed of a list of messages.
            stop: accepted for interface compatibility; this echo
                implementation does not use it.
            run_manager: accepted for interface compatibility; not used here.
            **kwargs: accepted for interface compatibility; not used here.

        Returns:
            A ChatResult holding one ChatGeneration whose AIMessage content is
            the first `n` characters of the last message.

        Raises:
            IndexError: if `messages` is empty.
        """
        message = AIMessage(
            content=self._echoed_text(messages),
            additional_kwargs={},  # Used to add additional payload (e.g., function calling request)
            response_metadata={  # Use for response metadata
                # Hard-coded placeholder value, not a measured duration.
                "time_in_seconds": 3,
            },
        )

        generation = ChatGeneration(message=message)
        return ChatResult(generations=[generation])

    @property
    def _llm_type(self) -> str:
        """Return the identifier for this type of chat model."""
        return "echoing-chat-model"

    def _stream(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> Iterator[ChatGenerationChunk]:
        """Stream the echoed text one character at a time.

        Args:
            messages: the prompt composed of a list of messages.
            stop: accepted for interface compatibility; this echo
                implementation does not use it.
            run_manager: if given, notified of every chunk before it is yielded.
            **kwargs: accepted for interface compatibility; not used here.

        Yields:
            One chunk per echoed character, followed by a final chunk with
            empty content that carries only response metadata.

        Raises:
            IndexError: if `messages` is empty.
        """
        for token in self._echoed_text(messages):
            chunk = ChatGenerationChunk(message=AIMessageChunk(content=token))

            if run_manager:
                # This is optional in newer versions of LangChain
                # The on_llm_new_token will be called automatically
                run_manager.on_llm_new_token(token, chunk=chunk)

            yield chunk

        # Let's add some other information (e.g., response metadata)
        # NOTE(review): this key differs from the "time_in_seconds" key used
        # by _generate; left unchanged because callers may read it.
        chunk = ChatGenerationChunk(
            message=AIMessageChunk(content="", response_metadata={"time_in_sec": 3})
        )
        if run_manager:
            # This is optional in newer versions of LangChain
            # The on_llm_new_token will be called automatically
            # Report the chunk's own (empty) content. Reusing the loop
            # variable here would repeat the last character and would be
            # unbound when nothing was echoed.
            run_manager.on_llm_new_token("", chunk=chunk)
        yield chunk

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """Return a dictionary of identifying parameters.

        This information is used by the LangChain callback system for
        tracing, which makes it possible to monitor LLMs.
        """
        return {
            # The model name allows users to specify custom token counting
            # rules in LLM monitoring applications (e.g., in LangSmith users
            # can provide per token pricing for their model and monitor
            # costs for the given LLM.)
            "model_name": self.model_name,
        }


In [35]:
# Echo model configured to return the first three characters of the prompt.
llm = CustomModel(model_name="test", n=3)

# Only the last message ("Meow!") is echoed; the earlier turns are ignored.
llm.invoke([
    HumanMessage(content="hello!"),
    AIMessage(content="Hi there human!"),
    HumanMessage(content="Meow!"),
])


# llm.generate("Hello")
# llm.invoke([("human","Hello")])

AIMessage(content='Meo', response_metadata={'time_in_seconds': 3}, id='run-ed236f67-ba09-472a-8488-8ce7759400f0-0')

In [38]:
# Stream the echo of "Hello" and print each chunk's content followed by "|".
# The last chunk has empty content (it only carries response metadata), which
# is why the printed output ends with two consecutive "|" characters.
for chunk in llm.stream("Hello"):
    print(chunk.content, end="|")

H|e|l||