In [1]:
# 🚀 Load the Smallest LLaMA 3 Model & Create a Chatbot

In [9]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [10]:
# ✅ Device Selection (MPS for Mac, CUDA for NVIDIA, fallback to CPU)
device: torch.device = torch.device(
    "mps" if torch.backends.mps.is_available() else 
    "cuda" if torch.cuda.is_available() else 
    "cpu"
)
print(f"Using device: {device}")

Using device: cpu


In [11]:
# ✅ Set up model name
model_name = "deepseek-ai/DeepSeek-R1" 

In [14]:
# ✅ Load DeepSeek Model & Tokenizer
model_name: str = "deepseek-ai/deepseek-coder-1.3b-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.bfloat16).to(device)

In [15]:
# Perform Inference
def chat_with_deepseek(input_text: str, max_length: int = 50) -> tuple[str, int]:
    """
    Generates text using the DeepSeek model.

    Args:
        input_text (str): User input query.
        max_length (int): Maximum length of generated text.

    Returns:
        tuple[str, int]: Generated text and word count.
    """
    # ✅ Tokenize input and move to device
    input_ids: torch.Tensor = tokenizer.encode(input_text, return_tensors="pt").to(device)

    # ✅ Generate response
    with torch.no_grad():
        output: torch.Tensor = model.generate(input_ids, max_length=max_length, num_return_sequences=1)

    # ✅ Decode output text
    output_text: str = tokenizer.decode(output[0], skip_special_tokens=True)

    # ✅ Count words
    word_count: int = len(output_text.split())

    return output_text, word_count

In [None]:
# ✅ Example usage
input_text: str = "What is the capital of Japan?"
generated_text, word_count = chat_with_deepseek(input_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32014 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
