<a href="https://colab.research.google.com/github/Sidharth-2592/LLM-TinyLlAMA-/blob/main/LLM_for_Q%26A_(TinyLlama).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Install necessary packages
!pip install -q transformers accelerate bitsandbytes sentencepiece


In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline



In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

def setup_llama3_model():
    """
    Load an open-source language model that's available without authentication
    """
    # Use TinyLlama, which is open-source and doesn't require authentication
    model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

    # Display loading message
    print(f"Loading {model_id} model (this may take a minute)...")

    # Load tokenizer without requiring token
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Load model with 8-bit quantization to reduce memory usage
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.float16,
        load_in_8bit=True
    )

    return model, tokenizer

def create_qa_system(model, tokenizer):
    """
    Create a question-answering pipeline using the loaded model
    """
    # Create Q&A pipeline
    qa_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )

    return qa_pipeline

def format_prompt(question):
    """
    Format the prompt for TinyLlama chat model
    """
    prompt = f"<|user|>\n{question}\n<|assistant|>"
    return prompt

def answer_question(qa_pipeline, question):
    """
    Generate an answer to the given question
    """
    prompt = format_prompt(question)

    # Generate response
    response = qa_pipeline(prompt)[0]['generated_text']

    # Extract only the model's answer from the response
    answer = response.split("<|assistant|>")[1].strip()

    return answer

# Alternative: If you want to use your own Hugging Face token
def setup_with_token(token):
    """
    Alternative setup function that uses a provided token
    """
    from huggingface_hub import login
    login(token)

    # Now you can use Meta-Llama-3 models
    model_id = "meta-llama/Meta-Llama-3-8B"

    print(f"Loading {model_id} model (this may take a minute)...")

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        load_in_8bit=True
    )

    return model, tokenizer


Using device: cuda


In [None]:

# Main execution
if __name__ == "__main__":
    # Option 1: Use TinyLlama without authentication
    model, tokenizer = setup_llama3_model()

    # Option 2: If you have a token, uncomment below and replace YOUR_TOKEN
    # model, tokenizer = setup_with_token("YOUR_TOKEN")

    # Create the QA system
    qa_system = create_qa_system(model, tokenizer)

    # Interactive Q&A loop
    print("\n===== LLM Q&A System =====")
    print("Type 'exit' to quit the program")

    while True:
        question = input("\nYour question: ")
        if question.lower() == 'exit':
            break

        print("\nThinking...")
        answer = answer_question(qa_system, question)
        print(f"\nAnswer: {answer}")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading TinyLlama/TinyLlama-1.1B-Chat-v1.0 model (this may take a minute)...


Device set to use cuda:0



===== LLM Q&A System =====
Type 'exit' to quit the program

Your question: Who is Narendra Modi?

Thinking...

Answer: Narendra Modi is the current Prime Minister of India. He was sworn in as the 14th Prime Minister of India on 26 May 2014, after winning the 2014 Indian general election. Modi is a Bharatiya Janata Party (BJP) leader, and he was the chief minister of Gujarat from 2001 to 2014. He is known for his strong leadership, economic reforms, and focus on improving governance and infrastructure development.
