In [1]:
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline
import os

In [2]:
# Set the environment variable 'HF_HOME'.
# This variable specifies the directory where Hugging Face models,
# datasets, and other cached files will be stored.
# By setting it, you control where the model weights are downloaded and saved.
os.environ['HF_HOME'] = 'D:/huggingface_cache'

In [3]:
# Initialize the HuggingFacePipeline from a pre-trained model.
# This approach directly loads a model into a Hugging Face pipeline,
# often for local inference or when more fine-grained control over the pipeline
# parameters is needed.
# - 'model_id': Specifies the exact model to load from the Hugging Face Hub.
#              "TinyLlama/TinyLlama-1.1B-Chat-v1.0" is a compact chat model.
# - 'task': Defines the task the loaded model will perform, which is "text-generation".
# - 'pipeline_kwargs': A dictionary to pass additional arguments directly to the
#                      Hugging Face pipeline constructor.
#   - 'temperature=0.5': Controls the randomness of the generated text. Lower values
#                        (like 0.5) make the output more focused and deterministic.
#   - 'max_new_tokens=100': Sets the maximum number of new tokens (words/subwords)
#                           the model can generate in its response. This helps
#                           limit the length of the output.
llm = HuggingFacePipeline.from_model_id(
    model_id='TinyLlama/TinyLlama-1.1B-Chat-v1.0',
    task='text-generation',
    pipeline_kwargs=dict(
        temperature=0.5,
        max_new_tokens=100
    )
)

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


In [4]:
# Wrap the HuggingFacePipeline model with ChatHuggingFace.
# ChatHuggingFace acts as an adapter, allowing the raw Hugging Face pipeline (llm)
# to be used seamlessly within LangChain's chat-oriented interfaces. This means
# you can send chat messages to it and receive responses.
model = ChatHuggingFace(llm=llm)

In [5]:
# Invoke the chat model with a specific question.
# The prompt is "What is the capital of France".
# The 'invoke' method sends this question to the model and waits for its answer.
result = model.invoke("What is the capital of France")

In [6]:
# Print the content of the model's response.
# For chat models, the actual textual response is typically found within
# the 'content' attribute of the returned message object.
print(result.content)

<|user|>
What is the capital of France</s>
<|assistant|>
The capital of France is Paris.
