In [None]:
!pip install transformers accelerate sentencepiece

In [7]:

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


model_name = "microsoft/phi-1_5"

print("Loading model...")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=120,
    device_map="auto",
    return_full_text=False,
    eos_token_id=tokenizer.eos_token_id
)


def ask_ai(q):
    prompt = (
        "Explain the following clearly in 4–5 sentences. "
        "Do NOT continue text patterns like Q&A, stories, chapters, books, or examples.\n\n"
        f"{q}\n\n"
        "Explanation:"
    )

    output = pipe(prompt)[0]["generated_text"]
    final = output.split("Explanation:", 1)[-1].strip()
    return final

print("Model loaded. Ask your questions.\n")

while True:
    q = input("Ask Anything (or exit): ")
    if q.lower() == "exit":
        break

    answer = ask_ai(q)
    print("\nAI:", answer, "\n")

''' Since, "microsoft/phi-1_5" is a not a chat model it is not suitable for QA and chat type output. so, it is drifting very much.'''

Loading model...


Device set to use cuda:0


Model loaded. Ask your questions.

Ask Anything (or exit): what is self attention mechanism in transformers architecture model?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



AI: Self attention mechanism in transformers architecture model is a type of attention mechanism where the model learns to attend to different parts of the input sequence and pay attention to each part independently by learning a weighting distribution for each token. This weighting distribution is then used to adjust the relative importance of each token when computing the output of the model, resulting in a more focused and selective attention to different parts of the input sequence.

Python Code Example:

```python
class SelfAttention(nn.Module):
    def __init__(self, hidden_size, num_heads 

Ask Anything (or exit): exit


In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

print("Loading model...")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200,
    temperature=0.3,
    return_full_text=False,
    device_map="auto"
)

def ask_ai(q):
    # Chat template internally handles system/user roles
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": q}
    ]

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    result = pipe(prompt)[0]["generated_text"]
    return result.strip()

print("Model loaded. Ask your questions.\n")

while True:
    q = input("Ask Anything (or exit): ")
    if q.lower() == "exit":
        break

    answer = ask_ai(q)
    print("\nAI:", answer, "\n")


Loading model...


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cuda:0


Model loaded. Ask your questions.

Ask Anything (or exit): what is self attention mechanism in transformers architecture model?

AI: Self-Attention is a key component of the Transformer architecture model, which is a type of neural network designed for natural language processing (NLP). In the Transformer architecture, each layer of the network is composed of multiple self-attention blocks, which are responsible for computing the attention weights between each input token and its corresponding attention head.

The self-attention mechanism works by taking a sequence of input tokens as input and computing the attention weights between each token and its corresponding attention head. This attention weight is calculated as the product of the input token's position in the sequence and the cosine similarity between the input token's embedding and the query embedding.

The attention weights are then used to compute the output of the layer, which is the output of the self-attention block. The 

In [10]:
!pip install langchain_google_genai

Collecting langchain_google_genai
  Downloading langchain_google_genai-4.0.0-py3-none-any.whl.metadata (2.7 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain_google_genai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Downloading langchain_google_genai-4.0.0-py3-none-any.whl (63 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.6/63.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Installing collected packages: filetype, langchain_google_genai
Successfully installed filetype-1.2.0 langchain_google_genai-4.0.0


In [11]:
import os
from google.colab import userdata
from langchain_google_genai import ChatGoogleGenerativeAI

GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
# GOOGLE_API_KEY = "AIzaSyBUYyc9L-Ml1kzWT_np3Tl8PgZEBmTpB6Q"
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

# 1. Create the LLM object
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",  # fast, cheap, good for learning :contentReference[oaicite:1]{index=1}
    temperature=0.5           # higher = more creative, lower = more deterministic
)

def ask_ai(q):
    response = llm.invoke(q)
    return f'response:\n {response.content}'

print("Model loaded. Ask your questions.\n")

while True:
    q = input("Ask Anything (or exit): ")
    if q.lower() == "exit":
        break

    answer = ask_ai(q)
    print("\nAI:", answer, "\n")

Model loaded. Ask your questions.

Ask Anything (or exit): what is the use of self attention in transformers architecture?

AI: response: Self-attention is the **cornerstone** of the Transformer architecture, and its primary use is to allow the model to **weigh the importance of different words (or tokens) in the input sequence relative to a given word** when processing that word.

Here's a breakdown of its key uses and benefits:

1.  **Contextual Understanding (The Primary Use):**
    *   **Problem:** Words often have different meanings depending on their context. For example, "bank" can mean a financial institution or the side of a river. Traditional models (like simple RNNs) struggled to capture these dynamic contextual meanings because they processed words sequentially, largely based on immediate neighbors.
    *   **Self-Attention's Solution:** When the model processes a word, say "bank" in the sentence "I walked to the **bank** of the river," self-attention allows it to look at *

In [12]:
! pip install ctransformers

Collecting ctransformers
  Downloading ctransformers-0.2.27-py3-none-any.whl.metadata (17 kB)
Downloading ctransformers-0.2.27-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m71.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ctransformers
Successfully installed ctransformers-0.2.27


In [33]:
import os
from ctransformers import AutoModelForCausalLM

os.environ["ANONYMIZED_TELEMETRY"] = "False"

print("Loading model...")

llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    model_file="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
    context_length=512,
    gpu_layers=0,
    model_type="llama",
    max_new_tokens=200,
    temperature=0.7
)

print("Model loaded!\n")

def ask_ai(q):
    prompt = f"### Instruction:\n{q}\n\n### Response:\n"
    output = llm(prompt)
    return output.strip()

while True:
    q = input("Ask Anything (or exit): ")
    if q.lower() == "exit":
        break

    answer = ask_ai(q)
    print("\nAI:", answer, "\n")


Loading model...


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Model loaded!

Ask Anything (or exit): what is the use of self attention in transformers architecture?

AI: Self attention plays a crucial role in transformer architectures, as it allows the model to attend to specific parts of the input sequence and focus on them. This helps the model to generate better and more contextualized predictions, especially when dealing with long documents or multitask scenarios such as parsing large text corpora. Self attention can also help improve the model's performance by reducing the number of parameters required for each layer, making it easier to train and scale for large models on complex problems like machine translation. 

Ask Anything (or exit): what is the use of cross attention in transformers

AI: Cross attention is a technique used in transformer models to improve the model's ability to understand and predict the contextual information that can be found in multiple sources, including those from different modalities. This helps the model to be