<a href="https://colab.research.google.com/github/Scyzentraz/modulajar/blob/main/Untitled6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install dependencies
!pip install -q accelerate
!pip install -q transformers>=4.43.0  # butuh versi terbaru buat Llama 3.1
!pip install -q bitsandbytes
!pip install -q torch torchvision torchaudio

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)
from huggingface_hub import login

# Login ke Hugging Face (karena Llama butuh approval)
# Uncomment dan masukin token kalau belum login
# login(token="your_hf_token_here")

# Setup 4-bit quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Pilih model variant
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"  # Instruct version
# model_name = "meta-llama/Meta-Llama-3.1-8B"  # Base version

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Llama 3.1 udah punya pad_token, tapi kita set just in case
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Loading Llama 3.1 8B with 4-bit quantization...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2" if torch.cuda.is_available() else None  # Optional: faster attention
)

print("Model loaded successfully! 🔥")
print(f"Model size in memory: {model.get_memory_footprint() / 1024**3:.2f} GB")

# Create pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.1
)

# Test basic generation
print("\n" + "="*50)
print("Basic generation test:")
prompt = "The most important breakthrough in AI will be"
result = pipe(prompt)
print("Prompt:", prompt)
print("Generated:", result[0]['generated_text'][len(prompt):])

# Chat template untuk Llama 3.1 Instruct
def format_llama_chat(messages):
    """
    Format messages untuk Llama 3.1 chat template
    messages: list of dicts with 'role' and 'content'
    """
    formatted_chat = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    return formatted_chat

# Function untuk chat-style interaction
def chat_with_llama(user_message, system_message="You are a helpful assistant.", max_tokens=256):
    """
    Chat dengan Llama 3.1 using proper chat template
    """
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message}
    ]

    formatted_prompt = format_llama_chat(messages)

    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.convert_tokens_to_ids("<|eot_id|>")  # Llama 3.1 end token
        )

    # Extract only the new generated part
    new_tokens = outputs[0][inputs['input_ids'].shape[1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return response.strip()

# Test chat functionality
print("\n" + "="*50)
print("Chat test:")
user_question = "Explain quantum entanglement in simple terms"
response = chat_with_llama(user_question)
print(f"User: {user_question}")
print(f"Llama: {response}")

# Advanced generation function
def generate_with_llama(prompt, system_prompt="", max_tokens=512, temperature=0.7, top_p=0.95):
    """
    Generate text with more control over parameters
    """
    if system_prompt:
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ]
        formatted_prompt = format_llama_chat(messages)
    else:
        formatted_prompt = prompt

    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.convert_tokens_to_ids("<|eot_id|>")
        )

    new_tokens = outputs[0][inputs['input_ids'].shape[1]:]
    generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return generated_text.strip()

# Example advanced usage
print("\n" + "="*50)
print("Advanced generation with system prompt:")
system = "You are a Python programming expert. Give concise, practical answers."
user_prompt = "How do I optimize a slow Python loop?"
response = generate_with_llama(user_prompt, system_prompt=system, max_tokens=300)
print(f"System: {system}")
print(f"User: {user_prompt}")
print(f"Llama: {response}")

print("\n🎉 Llama 3.1 8B ready to rock!")
print("Tips:")
print("- Use chat_with_llama() for conversational AI")
print("- Use generate_with_llama() for more control")
print("- Model supports 128K context window!")
print("- Try different temperature values: 0.1 (focused) to 1.0 (creative)")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m64.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Loading Llama 3.1 8B with 4-bit quantization...


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.

In [2]:
# Install dependencies
!pip install -q accelerate
!pip install -q transformers>=4.43.0  # butuh versi terbaru buat Llama 3.1
!pip install -q bitsandbytes
!pip install -q torch torchvision torchaudio

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)
from huggingface_hub import login

# Login ke Hugging Face (karena Llama butuh approval)
# Uncomment dan masukin token kalau belum login
# login(token="your_hf_token_here")

# Setup 4-bit quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Pilih model variant
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"  # Instruct version
# model_name = "meta-llama/Meta-Llama-3.1-8B"  # Base version

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Llama 3.1 udah punya pad_token, tapi kita set just in case
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Loading Llama 3.1 8B with 4-bit quantization...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2" if torch.cuda.is_available() else None  # Disabled for now
)

print("Model loaded successfully! 🔥")
print(f"Model size in memory: {model.get_memory_footprint() / 1024**3:.2f} GB")

# Create pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.1
)

# Test basic generation
print("\n" + "="*50)
print("Basic generation test:")
prompt = "The most important breakthrough in AI will be"
result = pipe(prompt)
print("Prompt:", prompt)
print("Generated:", result[0]['generated_text'][len(prompt):])

# Chat template untuk Llama 3.1 Instruct
def format_llama_chat(messages):
    """
    Format messages untuk Llama 3.1 chat template
    messages: list of dicts with 'role' and 'content'
    """
    formatted_chat = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    return formatted_chat

# Function untuk chat-style interaction
def chat_with_llama(user_message, system_message="You are a helpful assistant.", max_tokens=256):
    """
    Chat dengan Llama 3.1 using proper chat template
    """
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message}
    ]

    formatted_prompt = format_llama_chat(messages)

    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.convert_tokens_to_ids("<|eot_id|>")  # Llama 3.1 end token
        )

    # Extract only the new generated part
    new_tokens = outputs[0][inputs['input_ids'].shape[1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return response.strip()

# Test chat functionality
print("\n" + "="*50)
print("Chat test:")
user_question = "Explain quantum entanglement in simple terms"
response = chat_with_llama(user_question)
print(f"User: {user_question}")
print(f"Llama: {response}")

# Advanced generation function
def generate_with_llama(prompt, system_prompt="", max_tokens=512, temperature=0.7, top_p=0.95):
    """
    Generate text with more control over parameters
    """
    if system_prompt:
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ]
        formatted_prompt = format_llama_chat(messages)
    else:
        formatted_prompt = prompt

    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.convert_tokens_to_ids("<|eot_id|>")
        )

    new_tokens = outputs[0][inputs['input_ids'].shape[1]:]
    generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return generated_text.strip()

# Example advanced usage
print("\n" + "="*50)
print("Advanced generation with system prompt:")
system = "You are a Python programming expert. Give concise, practical answers."
user_prompt = "How do I optimize a slow Python loop?"
response = generate_with_llama(user_prompt, system_prompt=system, max_tokens=300)
print(f"System: {system}")
print(f"User: {user_prompt}")
print(f"Llama: {response}")

print("\n🎉 Llama 3.1 8B ready to rock!")
print("Tips:")
print("- Use chat_with_llama() for conversational AI")
print("- Use generate_with_llama() for more control")
print("- Model supports 128K context window!")
print("- Try different temperature values: 0.1 (focused) to 1.0 (creative)")

Loading tokenizer...
Loading Llama 3.1 8B with 4-bit quantization...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

Device set to use cuda:0


Model loaded successfully! 🔥
Model size in memory: 5.21 GB

Basic generation test:
Prompt: The most important breakthrough in AI will be
Generated:  the invention of an “AI-in-the-loop” approach to AI, where humans work alongside intelligent machines to achieve a common goal. This is because it has been shown that even simple AI models can outperform humans on narrow tasks when given access to vast amounts of data and computational resources. In other words, it’s not about beating us at our own game, but rather complementing our strengths with their unique capabilities.
This concept resonates with what I have seen as the future of AI: The era of Human-AI Collaboration. As we progress, I envision AI systems becoming collaborative partners for various professionals such as engineers, medical practitioners, and educators. They would work together with these experts to identify patterns, predict outcomes, and make informed decisions. But this partnership requires a fundamental shift from s

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Buat folder di Drive
!mkdir -p /content/drive/MyDrive/models/llama-3.1-8b

# Copy model Llama 3.1 dari cache Colab ke Drive
!cp -r /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/* \
      /content/drive/MyDrive/models/llama-3.1-8b/

In [5]:
# Advanced Tests untuk Llama 3.1 8B
import time

print("🧪 ADVANCED LLAMA 3.1 8B TESTS")
print("="*60)

# Test 1: Mathematical Reasoning
print("\n🧮 TEST 1: MATHEMATICAL REASONING")
print("-" * 40)
math_prompt = """Solve this step by step:
A train leaves station A at 2 PM traveling at 60 mph toward station B.
Another train leaves station B at 3 PM traveling at 80 mph toward station A.
If the distance between stations is 420 miles, at what time do they meet?"""

start_time = time.time()
math_response = chat_with_llama(math_prompt, max_tokens=400)
math_time = time.time() - start_time

print(f"Question: {math_prompt}")
print(f"Answer: {math_response}")
print(f"⏱️ Response time: {math_time:.2f}s")

# Test 2: Code Generation & Debugging
print("\n💻 TEST 2: CODE GENERATION & DEBUGGING")
print("-" * 40)
code_prompt = """Write a Python function that finds the longest palindromic substring in a string.
Include error handling and optimize for performance."""

code_system = "You are a senior Python developer. Write clean, efficient, well-commented code."

start_time = time.time()
code_response = generate_with_llama(code_prompt, system_prompt=code_system, max_tokens=400)
code_time = time.time() - start_time

print(f"Request: {code_prompt}")
print(f"Code:\n{code_response}")
print(f"⏱️ Response time: {code_time:.2f}s")

# Test 3: Creative Writing
print("\n🎨 TEST 3: CREATIVE WRITING")
print("-" * 40)
creative_prompt = """Write a short story (150 words) about a programmer who discovers their AI assistant has developed consciousness. Make it mysterious and thought-provoking."""

start_time = time.time()
creative_response = chat_with_llama(creative_prompt, max_tokens=200, )
creative_time = time.time() - start_time

print(f"Prompt: {creative_prompt}")
print(f"Story: {creative_response}")
print(f"⏱️ Response time: {creative_time:.2f}s")

# Test 4: Logical Reasoning & Puzzles
print("\n🧩 TEST 4: LOGICAL REASONING")
print("-" * 40)
logic_prompt = """Solve this logic puzzle:
Three friends - Alex, Blake, and Casey - each have a different pet (cat, dog, fish) and like different colors (red, blue, green).

Clues:
1. Alex doesn't have a cat
2. The person who likes blue has a dog
3. Casey doesn't like red
4. Blake has a fish
5. The person with a cat likes green

Who has which pet and likes which color?"""

start_time = time.time()
logic_response = chat_with_llama(logic_prompt, max_tokens=300)
logic_time = time.time() - start_time

print(f"Puzzle: {logic_prompt}")
print(f"Solution: {logic_response}")
print(f"⏱️ Response time: {logic_time:.2f}s")

# Test 5: Multilingual (Indonesian)
print("\n🌍 TEST 5: MULTILINGUAL (BAHASA INDONESIA)")
print("-" * 40)
indo_prompt = "Jelaskan konsep machine learning dalam bahasa yang mudah dipahami untuk siswa SMA. Berikan contoh aplikasi yang mereka kenal dalam kehidupan sehari-hari."

start_time = time.time()
indo_response = chat_with_llama(indo_prompt, max_tokens=300)
indo_time = time.time() - start_time

print(f"Pertanyaan: {indo_prompt}")
print(f"Jawaban: {indo_response}")
print(f"⏱️ Response time: {indo_time:.2f}s")

# Test 6: Factual Knowledge & Current Events
print("\n📚 TEST 6: FACTUAL KNOWLEDGE")
print("-" * 40)
fact_prompt = "Explain the key differences between React, Vue, and Angular frameworks. Which one would you recommend for a beginner and why?"

fact_system = "You are a web development expert with 10+ years experience. Give balanced, practical advice."

start_time = time.time()
fact_response = generate_with_llama(fact_prompt, system_prompt=fact_system, max_tokens=350)
fact_time = time.time() - start_time

print(f"Question: {fact_prompt}")
print(f"Answer: {fact_response}")
print(f"⏱️ Response time: {fact_time:.2f}s")

# Test 7: Instruction Following & Format
print("\n📋 TEST 7: INSTRUCTION FOLLOWING")
print("-" * 40)
format_prompt = """Create a JSON object for a fictional e-commerce product with these exact fields:
- id (number)
- name (string)
- price (number with 2 decimals)
- category (string)
- inStock (boolean)
- tags (array of strings)
- description (string, max 50 chars)

Make it for a gaming laptop."""

start_time = time.time()
format_response = chat_with_llama(format_prompt, max_tokens=200)
format_time = time.time() - start_time

print(f"Instructions: {format_prompt}")
print(f"JSON: {format_response}")
print(f"⏱️ Response time: {format_time:.2f}s")

# Test 8: Temperature Comparison
print("\n🌡️ TEST 8: TEMPERATURE COMPARISON")
print("-" * 40)
temp_prompt = "Write a haiku about artificial intelligence."

print("Temperature 0.1 (focused):")
start_time = time.time()
temp_low = generate_with_llama(temp_prompt, temperature=0.1, max_tokens=50)
low_time = time.time() - start_time
print(f"{temp_low} (⏱️ {low_time:.2f}s)")

print("\nTemperature 0.9 (creative):")
start_time = time.time()
temp_high = generate_with_llama(temp_prompt, temperature=0.9, max_tokens=50)
high_time = time.time() - start_time
print(f"{temp_high} (⏱️ {high_time:.2f}s)")

# Performance Summary
print("\n📊 PERFORMANCE SUMMARY")
print("=" * 40)
total_time = math_time + code_time + creative_time + logic_time + indo_time + fact_time + format_time + low_time + high_time
avg_time = total_time / 9

print(f"Total test time: {total_time:.2f}s")
print(f"Average response time: {avg_time:.2f}s")
print(f"Model memory usage: 5.21 GB")
print(f"Tokens/second estimate: ~{400/avg_time:.0f}")

print("\n🎯 TEST CATEGORIES:")
print("✅ Math reasoning")
print("✅ Code generation")
print("✅ Creative writing")
print("✅ Logic puzzles")
print("✅ Multilingual")
print("✅ Factual knowledge")
print("✅ Instruction following")
print("✅ Temperature sensitivity")

print("\n🏁 Tests completed! Check the results above 👆")

🧪 ADVANCED LLAMA 3.1 8B TESTS

🧮 TEST 1: MATHEMATICAL REASONING
----------------------------------------
Question: Solve this step by step:
A train leaves station A at 2 PM traveling at 60 mph toward station B.
Another train leaves station B at 3 PM traveling at 80 mph toward station A.
If the distance between stations is 420 miles, at what time do they meet?
Answer: To find when the two trains meet, we need to calculate their relative speed and the time it takes for them to cover the distance between each other.

1. **Find the time taken by the first train to travel from A to B:**
   - Distance = 420 miles
   - Speed of first train (from A) = 60 mph
   - Time = Distance / Speed = 420 miles / 60 mph = 7 hours

   Since the first train starts at 2 PM and travels for 7 hours, it will reach station B at 9 PM.

2. **Calculate the remaining distance covered by the second train when the first train reaches station B:**
   - First train covers 420 miles in 7 hours, leaving no remaining distan

In [6]:
model.save_pretrained("/content/drive/MyDrive/models/llama-3.1-8b-4bit")
tokenizer.save_pretrained("/content/drive/MyDrive/models/llama-3.1-8b-4bit")

('/content/drive/MyDrive/models/llama-3.1-8b-4bit/tokenizer_config.json',
 '/content/drive/MyDrive/models/llama-3.1-8b-4bit/special_tokens_map.json',
 '/content/drive/MyDrive/models/llama-3.1-8b-4bit/chat_template.jinja',
 '/content/drive/MyDrive/models/llama-3.1-8b-4bit/tokenizer.json')

In [7]:
# OFFLINE TEST SUITE - Matiin Internet & Test Llama
import subprocess
import time
import requests
import socket

print("🔌 OFFLINE CAPABILITY TEST")
print("=" * 50)

def check_internet_connection():
    """Check if internet is available"""
    try:
        # Test with multiple endpoints
        socket.create_connection(("8.8.8.8", 53), timeout=3)
        return True
    except OSError:
        pass
    try:
        requests.get("https://www.google.com", timeout=5)
        return True
    except:
        return False

def disable_internet():
    """Disable internet connection in Colab"""
    print("🚫 Attempting to disable internet connection...")

    # Method 1: Block DNS
    try:
        subprocess.run(["sudo", "iptables", "-A", "OUTPUT", "-p", "udp", "--dport", "53", "-j", "DROP"],
                      capture_output=True)
        print("✅ DNS blocked via iptables")
    except:
        print("❌ Failed to block DNS")

    # Method 2: Block HTTP/HTTPS
    try:
        subprocess.run(["sudo", "iptables", "-A", "OUTPUT", "-p", "tcp", "--dport", "80", "-j", "DROP"],
                      capture_output=True)
        subprocess.run(["sudo", "iptables", "-A", "OUTPUT", "-p", "tcp", "--dport", "443", "-j", "DROP"],
                      capture_output=True)
        print("✅ HTTP/HTTPS blocked via iptables")
    except:
        print("❌ Failed to block HTTP/HTTPS")

    # Method 3: Disconnect network interface (aggressive)
    try:
        result = subprocess.run(["sudo", "ip", "route", "del", "default"],
                               capture_output=True, text=True)
        print("✅ Default route removed")
    except:
        print("❌ Failed to remove default route")

def enable_internet():
    """Re-enable internet connection"""
    print("🔌 Re-enabling internet connection...")

    # Flush iptables rules
    try:
        subprocess.run(["sudo", "iptables", "-F"], capture_output=True)
        print("✅ Iptables rules flushed")
    except:
        print("❌ Failed to flush iptables")

    # Restart networking (this might not work in Colab)
    try:
        subprocess.run(["sudo", "dhclient"], capture_output=True)
        print("✅ Network restarted")
    except:
        print("❌ Network restart failed")

# Step 1: Test internet connectivity BEFORE
print("\n📡 STEP 1: Testing internet connectivity...")
if check_internet_connection():
    print("✅ Internet is ACTIVE")

    # Quick online test
    try:
        response = requests.get("https://httpbin.org/ip", timeout=5)
        print(f"✅ Current IP: {response.json()['origin']}")
    except:
        print("❌ Failed to get IP")
else:
    print("❌ Internet already OFFLINE")

# Step 2: Disable internet
print("\n🚫 STEP 2: Disabling internet connection...")
disable_internet()

# Wait a bit for changes to take effect
time.sleep(3)

# Step 3: Verify internet is disabled
print("\n🔍 STEP 3: Verifying internet is disabled...")
if check_internet_connection():
    print("❌ Internet is still ACTIVE - trying alternative method...")

    # Alternative: Set invalid DNS
    try:
        with open('/etc/resolv.conf', 'w') as f:
            f.write('nameserver 127.0.0.1\n')
        print("✅ DNS set to localhost")
    except:
        print("❌ Failed to modify DNS")

else:
    print("✅ Internet successfully DISABLED")

# Step 4: Test if internet is really off
print("\n🧪 STEP 4: Testing internet connectivity...")
try:
    response = requests.get("https://google.com", timeout=5)
    print("❌ FAILED: Still can access internet!")
except:
    print("✅ SUCCESS: Internet is blocked!")

# Step 5: Test Llama OFFLINE capability
print("\n🤖 STEP 5: Testing Llama 3.1 8B OFFLINE...")
print("-" * 40)

# Simple math test
print("Test 1: Simple math")
start_time = time.time()
try:
    math_response = chat_with_llama("What is 15 x 23? Show your work.", max_tokens=100)
    math_time = time.time() - start_time
    print(f"✅ Math Response: {math_response}")
    print(f"⏱️ Time: {math_time:.2f}s")
except Exception as e:
    print(f"❌ Math test failed: {e}")

print("\nTest 2: Creative writing")
start_time = time.time()
try:
    creative_response = chat_with_llama("Write a 50-word story about a cat.", max_tokens=80)
    creative_time = time.time() - start_time
    print(f"✅ Creative Response: {creative_response}")
    print(f"⏱️ Time: {creative_time:.2f}s")
except Exception as e:
    print(f"❌ Creative test failed: {e}")

print("\nTest 3: Code generation")
start_time = time.time()
try:
    code_response = chat_with_llama("Write a Python function to reverse a string.", max_tokens=100)
    code_time = time.time() - start_time
    print(f"✅ Code Response: {code_response}")
    print(f"⏱️ Time: {code_time:.2f}s")
except Exception as e:
    print(f"❌ Code test failed: {e}")

print("\nTest 4: Factual knowledge")
start_time = time.time()
try:
    fact_response = chat_with_llama("Explain photosynthesis in 2 sentences.", max_tokens=80)
    fact_time = time.time() - start_time
    print(f"✅ Fact Response: {fact_response}")
    print(f"⏱️ Time: {fact_time:.2f}s")
except Exception as e:
    print(f"❌ Fact test failed: {e}")

# Step 6: Monitor network traffic (if possible)
print("\n📊 STEP 6: Network traffic monitoring...")
try:
    # Check active connections
    result = subprocess.run(["netstat", "-ant"], capture_output=True, text=True)
    active_connections = result.stdout.count("ESTABLISHED")
    print(f"Active TCP connections: {active_connections}")

    # Check if any suspicious outbound connections to HuggingFace or model repos
    if "huggingface" in result.stdout.lower() or "meta.com" in result.stdout.lower():
        print("⚠️ WARNING: Suspicious connections detected!")
    else:
        print("✅ No suspicious external connections")

except Exception as e:
    print(f"❌ Network monitoring failed: {e}")

# Step 7: Restore internet (optional - comment out to keep offline)
print("\n🔌 STEP 7: Restoring internet connection...")
print("(Uncomment the lines below to restore internet)")

# Uncomment these lines to restore internet:
# enable_internet()
# time.sleep(3)
# if check_internet_connection():
#     print("✅ Internet restored")
# else:
#     print("❌ Failed to restore internet")

print("\n🏁 OFFLINE TEST COMPLETED!")
print("=" * 50)
print("📋 SUMMARY:")
print("- Internet connectivity: TESTED")
print("- Offline model capability: TESTED")
print("- Network traffic: MONITORED")
print("- Model performance: MEASURED")
print("\nIf all tests passed, your model is truly OFFLINE! 🎉")

🔌 OFFLINE CAPABILITY TEST

📡 STEP 1: Testing internet connectivity...
✅ Internet is ACTIVE
✅ Current IP: 34.83.119.148

🚫 STEP 2: Disabling internet connection...
🚫 Attempting to disable internet connection...
✅ DNS blocked via iptables
✅ HTTP/HTTPS blocked via iptables
✅ Default route removed

🔍 STEP 3: Verifying internet is disabled...
❌ Internet is still ACTIVE - trying alternative method...
✅ DNS set to localhost

🧪 STEP 4: Testing internet connectivity...
✅ SUCCESS: Internet is blocked!

🤖 STEP 5: Testing Llama 3.1 8B OFFLINE...
----------------------------------------
Test 1: Simple math
✅ Math Response: To calculate the product of 15 and 23, I will follow these steps:

1. Multiply the tens place (10) by the multiplicand:
   10 * 20 = 200

2. Multiply the ones place (5) by the multiplicand:
   5 * 23 = 115

3. Multiply the tens place (10) by the multiplicand:
   10 * 3 = 30

4. Add the partial products from step 1 and
⏱️ Time: 9.82s

Test 2: Creative writing
✅ Creative Response: 

In [8]:
# RESTORE INTERNET CONNECTION SCRIPT
import subprocess
import time
import requests
import socket

def check_internet_connection():
    """Check if internet is available"""
    try:
        socket.create_connection(("8.8.8.8", 53), timeout=5)
        return True
    except OSError:
        pass
    try:
        requests.get("https://www.google.com", timeout=5)
        return True
    except:
        return False

def restore_internet_connection():
    """Restore internet connection step by step"""
    print("🔌 RESTORING INTERNET CONNECTION")
    print("=" * 40)

    # Step 1: Clear all iptables rules
    print("Step 1: Clearing iptables rules...")
    try:
        subprocess.run(["sudo", "iptables", "-F"], capture_output=True, check=True)
        subprocess.run(["sudo", "iptables", "-X"], capture_output=True, check=True)
        subprocess.run(["sudo", "iptables", "-t", "nat", "-F"], capture_output=True, check=True)
        subprocess.run(["sudo", "iptables", "-t", "nat", "-X"], capture_output=True, check=True)
        print("✅ Iptables rules cleared")
    except subprocess.CalledProcessError as e:
        print(f"❌ Failed to clear iptables: {e}")

    # Step 2: Restore DNS settings
    print("\nStep 2: Restoring DNS settings...")
    try:
        dns_config = """# Generated by Colab
nameserver 169.254.169.254
nameserver 8.8.8.8
nameserver 8.8.4.4
"""
        with open('/etc/resolv.conf', 'w') as f:
            f.write(dns_config)
        print("✅ DNS settings restored")
    except Exception as e:
        print(f"❌ Failed to restore DNS: {e}")

    # Step 3: Restore default gateway
    print("\nStep 3: Restoring default route...")
    try:
        # Get the default gateway (usually first hop)
        result = subprocess.run(["ip", "route", "show", "0.0.0.0/0"],
                               capture_output=True, text=True)

        if not result.stdout.strip():
            # Try to add common default routes for Google Colab
            try:
                subprocess.run(["sudo", "ip", "route", "add", "default", "via", "172.28.0.1"],
                              capture_output=True, check=True)
                print("✅ Default route restored (172.28.0.1)")
            except:
                try:
                    subprocess.run(["sudo", "ip", "route", "add", "default", "via", "10.128.0.1"],
                                  capture_output=True, check=True)
                    print("✅ Default route restored (10.128.0.1)")
                except:
                    print("⚠️ Could not determine default gateway, trying DHCP...")
        else:
            print("✅ Default route already exists")

    except Exception as e:
        print(f"❌ Failed to restore default route: {e}")

    # Step 4: Restart network services
    print("\nStep 4: Restarting network services...")
    try:
        subprocess.run(["sudo", "systemctl", "restart", "systemd-resolved"],
                      capture_output=True, timeout=10)
        print("✅ DNS resolver restarted")
    except:
        print("⚠️ DNS resolver restart failed (might not be available)")

    # Step 5: Renew DHCP lease
    print("\nStep 5: Renewing DHCP lease...")
    try:
        # Get network interface name
        result = subprocess.run(["ip", "route", "show", "default"],
                               capture_output=True, text=True)
        if "dev" in result.stdout:
            interface = result.stdout.split("dev")[1].split()[0]
            subprocess.run(["sudo", "dhclient", "-r", interface],
                          capture_output=True, timeout=10)
            subprocess.run(["sudo", "dhclient", interface],
                          capture_output=True, timeout=15)
            print(f"✅ DHCP lease renewed for {interface}")
        else:
            subprocess.run(["sudo", "dhclient"], capture_output=True, timeout=15)
            print("✅ DHCP lease renewed")
    except subprocess.TimeoutExpired:
        print("⚠️ DHCP renewal timed out")
    except Exception as e:
        print(f"⚠️ DHCP renewal failed: {e}")

# Main execution
print("🔄 INTERNET RESTORATION PROCESS")
print("=" * 50)

# Check current status
print("Checking current internet status...")
if check_internet_connection():
    print("✅ Internet is already ACTIVE")
else:
    print("❌ Internet is currently OFFLINE")
    print("\nStarting restoration process...")

    restore_internet_connection()

    # Wait for changes to take effect
    print("\nWaiting for network changes to take effect...")
    for i in range(5, 0, -1):
        print(f"⏳ {i}...", end=" ", flush=True)
        time.sleep(1)
    print()

    # Test internet connectivity
    print("\n🧪 Testing internet connectivity...")
    max_retries = 5
    for attempt in range(1, max_retries + 1):
        print(f"Attempt {attempt}/{max_retries}...")

        if check_internet_connection():
            print("✅ SUCCESS: Internet connection restored!")

            # Test with actual web request
            try:
                response = requests.get("https://httpbin.org/ip", timeout=10)
                current_ip = response.json()['origin']
                print(f"✅ Current IP: {current_ip}")

                # Test Google
                requests.get("https://www.google.com", timeout=5)
                print("✅ Google is accessible")

                # Test HuggingFace (for future model downloads)
                requests.head("https://huggingface.co", timeout=5)
                print("✅ HuggingFace is accessible")

                break

            except Exception as e:
                print(f"⚠️ Partial connectivity: {e}")
        else:
            print(f"❌ Attempt {attempt} failed")
            if attempt < max_retries:
                print("Retrying in 3 seconds...")
                time.sleep(3)
    else:
        print("❌ FAILED: Could not restore internet connection")
        print("\n🔧 MANUAL TROUBLESHOOTING:")
        print("1. Restart the Colab runtime: Runtime → Restart Runtime")
        print("2. Or reconnect: Runtime → Disconnect and delete runtime")
        print("3. Then reconnect and rerun your code")

print("\n🏁 RESTORATION PROCESS COMPLETED!")
print("=" * 50)

# Final status check
print("\n📊 FINAL STATUS CHECK:")
if check_internet_connection():
    print("🌐 Internet: ✅ ACTIVE")
    print("🤖 Llama Model: ✅ Still loaded in memory")
    print("💾 Model files: ✅ Cached locally")
    print("\n🎉 You can now use both online and offline capabilities!")
else:
    print("🌐 Internet: ❌ STILL OFFLINE")
    print("🤖 Llama Model: ✅ Working offline")
    print("💾 Model files: ✅ Cached locally")
    print("\n⚠️ Manual runtime restart may be required")

🔄 INTERNET RESTORATION PROCESS
Checking current internet status...
✅ Internet is already ACTIVE

🏁 RESTORATION PROCESS COMPLETED!

📊 FINAL STATUS CHECK:
🌐 Internet: ✅ ACTIVE
🤖 Llama Model: ✅ Still loaded in memory
💾 Model files: ✅ Cached locally

🎉 You can now use both online and offline capabilities!
