In [11]:
# Slide 1: Setup and Imports
import torch
import os
import json
from datetime import datetime
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    pipeline,
    BitsAndBytesConfig
)
import psutil
import gc

# Check system resources
def check_system_resources():
    """Print a short hardware summary and report whether CUDA is usable.

    Prints the CPU count from ``psutil.cpu_count()``, total and available RAM
    (bytes divided by 1024**3, two decimals) and the CUDA availability flag.

    Returns:
        bool: the value of ``torch.cuda.is_available()``.
    """
    bytes_per_gib = 1024**3
    ram = psutil.virtual_memory()
    has_cuda = torch.cuda.is_available()

    summary = (
        f"CPU Cores: {psutil.cpu_count()}",
        f"Total RAM: {ram.total / bytes_per_gib:.2f} GB",
        f"Available RAM: {ram.available / bytes_per_gib:.2f} GB",
        f"GPU Available: {has_cuda}",
    )
    for line in summary:
        print(line)

    return has_cuda

# Probe the machine once; functions in later cells read the module-level
# names `gpu_available` and `device` defined here.
gpu_available = check_system_resources()
device = "cuda" if gpu_available else "cpu"
print(f"Using device: {device}")

# Cap the number of CPU threads PyTorch uses. This limits CPU parallelism;
# it is not a memory limit.
torch.set_num_threads(2)  # Limit CPU threads

CPU Cores: 8
Total RAM: 15.70 GB
Available RAM: 6.23 GB
GPU Available: False
Using device: cpu


In [None]:
# Hugging Face model id passed to from_pretrained() by the loading cell.
# NOTE(review): "~2.2GB" presumably refers to a half-precision checkpoint; this
# notebook loads the weights as float32 and reports a 4.10 GB saved directory.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # ~2.2GB, instruction-tuned

# Instruction text that enhanced_generate_response() places at the start of every prompt.
SYSTEM_PROMPT = """You are a helpful AI assistant for employees. You provide clear, professional, and actionable advice for workplace tasks, productivity, scheduling, and general work-related questions. Keep responses concise and helpful."""

# Enhanced generation function for better responses
def enhanced_generate_response(model, tokenizer, user_input, max_new_tokens=100):
    """Generate one assistant reply for ``user_input`` using ``SYSTEM_PROMPT``.

    The prompt is ``SYSTEM_PROMPT``, a blank line, ``User: <user_input>`` and a
    trailing ``Assistant:`` cue. Sampling is enabled, so replies vary between
    calls.

    Args:
        model: causal LM exposing ``.device`` and ``.generate(...)``.
        tokenizer: matching tokenizer (callable, with ``decode`` and
            ``eos_token_id``).
        user_input: the user's message as plain text.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        str: the first paragraph of the generated reply, cut before any
        ``User:`` marker the model emits, or a fixed fallback sentence when
        nothing usable was generated.
    """
    # Create a conversation format
    conversation = f"{SYSTEM_PROMPT}\n\nUser: {user_input}\nAssistant:"

    # A single sequence needs no padding; the tokenizer call still returns the
    # attention mask alongside the input ids.
    encoded = tokenizer(
        conversation,
        return_tensors="pt",
        max_length=800,
        truncation=True,
    )

    # Put the inputs on whatever device the model lives on. The previous code
    # only handled the CPU case, leaving CPU tensors for a GPU-resident model.
    target_device = model.device
    inputs = {name: tensor.to(target_device) for name, tensor in encoded.items()}
    prompt_length = inputs["input_ids"].shape[-1]

    # Generate with improved parameters
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs.get("attention_mask"),
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            top_k=40,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # Decode only the tokens produced after the prompt. Splitting the decoded
    # prompt+completion on "Assistant:" breaks when the user's text contains
    # that marker or when decoding does not reproduce the prompt exactly.
    generated_ids = outputs[0][prompt_length:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    # Clean up response
    response = response.split("User:")[0].strip()  # Stop at next user input
    response = response.split("\n\n")[0].strip()  # Take first paragraph

    return response if response else "I'm here to help with your work-related questions."

# Status banner only: nothing has been downloaded or loaded at this point.
print("Enhanced model configuration ready")
print(f"Model: {MODEL_NAME}")
print("System prompting enabled for better responses")

Enhanced model configuration ready
Model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
System prompting enabled for better responses


In [13]:
# Slide 3: Model and Tokenizer Loading
def load_model_and_tokenizer():
    """Load ``MODEL_NAME`` and return ``(model, tokenizer)`` in eval mode.

    The tokenizer's pad token falls back to its EOS token when none is set.
    Weights are requested as float32; ``device_map="auto"`` is used only when
    ``gpu_available`` is true, otherwise the model is moved to the CPU.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    print("Loading tokenizer...")
    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
    if tok.pad_token is None:
        # Reuse EOS as the padding token.
        tok.pad_token = tok.eos_token

    print("Loading model...")
    load_options = {
        "torch_dtype": torch.float32,  # Use float32 for CPU
        "low_cpu_mem_usage": True,
        "device_map": "auto" if gpu_available else None,
    }
    net = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **load_options)

    if not gpu_available:
        net = net.to("cpu")

    # Inference only: switch to evaluation mode.
    net.eval()

    print(f"Model loaded successfully on {device}")
    print(f"Model parameters: {net.num_parameters():,}")

    return net, tok

# Load once at module level; later cells reuse the `model` and `tokenizer` globals.
model, tokenizer = load_model_and_tokenizer()

# Run a garbage-collection pass after loading, and additionally empty the
# CUDA cache when a GPU is in use.
gc.collect()
if gpu_available:
    torch.cuda.empty_cache()

Loading tokenizer...
Loading model...
Model loaded successfully on cpu
Model parameters: 1,100,048,384


In [14]:
# Slide 4: Model Optimization for CPU
def optimize_model_for_cpu(model):
    """Prepare ``model`` for inference and return it.

    Puts the model in eval mode and turns off ``requires_grad`` on every
    parameter. When the module-level ``device`` is ``"cpu"`` the weights are
    left untouched; on any other device a conversion to half precision is
    attempted, keeping the original model if that conversion fails.

    Args:
        model: the model to prepare (modified in place).

    Returns:
        The prepared model (the same object on the CPU path).
    """
    # Inference only: evaluation mode, no gradients.
    model.eval()
    for param in model.parameters():
        param.requires_grad = False

    if device == "cpu":
        # Half precision is not attempted on CPU.
        print("Using float32 for CPU compatibility")
        return model

    try:
        model = model.half()
        print("Converted to half precision")
    except Exception as exc:
        # Catch ordinary failures only: a bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit, and it hid the reason for the fallback.
        print(f"Half precision not supported, using float32 ({exc})")

    return model

# Apply optimizations
# On the CPU path the function returns the object it was given, so
# `optimized_model` and `model` then name the same model instance.
optimized_model = optimize_model_for_cpu(model)

# Create generation pipeline
def create_chat_pipeline(model, tokenizer):
    """Wrap ``model`` and ``tokenizer`` in a text-generation pipeline.

    The pipeline returns only the newly generated text
    (``return_full_text=False``).

    Args:
        model: an already loaded causal language model.
        tokenizer: the tokenizer that belongs to ``model``.

    Returns:
        The object returned by ``pipeline("text-generation", ...)``.
    """
    pipeline_kwargs = {
        "model": model,
        "tokenizer": tokenizer,
        "framework": "pt",
        "return_full_text": False,
    }

    # A model loaded with device_map="auto" (the GPU path of the loader)
    # carries an `hf_device_map` and is already placed; an explicit `device`
    # conflicts with that placement, so it is passed only for plain models.
    if getattr(model, "hf_device_map", None) is None:
        pipeline_kwargs["device"] = 0 if gpu_available else -1  # -1 for CPU

    return pipeline("text-generation", **pipeline_kwargs)

# Build the module-level pipeline from the optimized model and shared tokenizer.
chat_pipeline = create_chat_pipeline(optimized_model, tokenizer)
print("Chat pipeline created successfully")

Device set to use cpu


Using float32 for CPU compatibility
Chat pipeline created successfully


In [15]:
# Slide 5: Optional OCR Integration
import requests
import base64

def ocr_extract_text(image_path, api_key="22308f269288957", timeout=60):
    """
    Optional OCR functionality using OCR.space API

    Reads the file at ``image_path``, sends it base64-encoded to the OCR.space
    ``parse/image`` endpoint and returns the text of the first parsed result.

    Args:
        image_path: path of the image file to upload.
        api_key: OCR.space API key sent with the request.
        timeout: seconds to wait for the HTTP request before giving up.

    Returns:
        str: the extracted text with surrounding whitespace removed. Failures
        are reported as strings rather than raised: ``"OCR Error: ..."`` when
        the service reports or implies an error, and
        ``"OCR processing failed: ..."`` for any exception (unreadable file,
        network error, invalid JSON).
    """
    # NOTE(review): the default `api_key` is a credential committed in source.
    # It is kept so existing calls keep working; rotate it and pass the key
    # explicitly (for example from an environment variable) instead.
    import mimetypes

    try:
        # Read image file
        with open(image_path, 'rb') as f:
            image_data = base64.b64encode(f.read()).decode()

        # Declare the real content type in the data URI; fall back to PNG
        # (the previous fixed value) when the extension is not recognised.
        mime_type = mimetypes.guess_type(image_path)[0] or 'image/png'

        # OCR.space API endpoint
        url = 'https://api.ocr.space/parse/image'

        payload = {
            'apikey': api_key,
            'base64Image': f'data:{mime_type};base64,{image_data}',
            'language': 'eng',
            'detectOrientation': 'true',
            'scale': 'true',
            'OCREngine': '2'
        }

        # Without a timeout a stalled connection would block the cell forever.
        response = requests.post(url, data=payload, timeout=timeout)
        result = response.json()

        # A reply that is not a JSON object is surfaced as-is.
        if not isinstance(result, dict):
            return f"OCR Error: {result}"

        if result.get('IsErroredOnProcessing'):
            message = result.get('ErrorMessage') or 'unknown error'
            # The error message may be a list of strings; flatten it.
            if isinstance(message, (list, tuple)):
                message = '; '.join(str(part) for part in message)
            return f"OCR Error: {message}"

        parsed_results = result.get('ParsedResults') or []
        if not parsed_results:
            return "OCR Error: no parsed results returned"

        extracted_text = parsed_results[0].get('ParsedText') or ''
        return extracted_text.strip()

    except Exception as e:
        return f"OCR processing failed: {str(e)}"

# Example usage (commented out - requires actual image file)
# The triple-quoted block below is a bare string expression used to keep the
# example from running; it has no effect when the cell executes.
"""
# sample_text = ocr_extract_text("sample_document.png")
# print("Extracted text:", sample_text)
"""

# Status banner: the OCR helper is defined but not called in this cell.
print("OCR function defined (optional feature)")
print("To use: ocr_extract_text('path_to_image.png')")

OCR function defined (optional feature)
To use: ocr_extract_text('path_to_image.png')


In [16]:
# Slide 6: Model Size Verification
def get_directory_size(path):
    """Return the combined size in bytes of all files found under ``path``.

    The tree is walked recursively with ``os.walk``; entries for which
    ``os.path.exists`` is false at the time of the check are skipped. A path
    that yields no files gives 0.
    """
    candidate_files = (
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(path)
        for name in names
    )
    return sum(
        os.path.getsize(file_path)
        for file_path in candidate_files
        if os.path.exists(file_path)
    )

def check_model_size_constraint(save_path, max_size_gb=4.5):
    """Report whether the directory at ``save_path`` fits within ``max_size_gb``.

    Prints the measured size (bytes divided by 1024**3) and the limit, followed
    by a pass/fail line.

    Returns:
        bool: True when the directory exists and its size is at most
        ``max_size_gb``; False when it is larger or the path does not exist.
    """
    # Guard clause: nothing to measure.
    if not os.path.exists(save_path):
        print("Model directory not found")
        return False

    size_gb = get_directory_size(save_path) / (1024**3)
    print(f"Model directory size: {size_gb:.2f} GB")
    print(f"Size constraint: {max_size_gb} GB")

    within_limit = bool(size_gb <= max_size_gb)
    verdict = "✓ Size constraint satisfied" if within_limit else "✗ Size constraint violated"
    print(verdict)
    return within_limit

# Check current model memory usage
def check_memory_usage():
    """Print system-wide memory utilisation and this process's resident size.

    The first line shows ``psutil.virtual_memory().percent``; the second shows
    the current process's ``rss`` divided by 1024**3, with two decimals.
    """
    system_memory = psutil.virtual_memory()
    this_process = psutil.Process()

    print(f"System memory usage: {system_memory.percent}%")

    resident_gb = this_process.memory_info().rss / (1024**3)
    print(f"Process memory usage: {resident_gb:.2f} GB")

# Snapshot memory usage now that the model and pipeline are resident.
check_memory_usage()

System memory usage: 83.7%
Process memory usage: 7.96 GB


In [17]:
# Slide 7: Save Model and Metadata

# Define save directory and metadata file path
import transformers


# Output directory for the saved model/tokenizer, relative to the notebook's
# working directory; the metadata JSON is written inside that same directory.
SAVE_DIR = "./saved_tinyllama_model"
METADATA_FILE = os.path.join(SAVE_DIR, "metadata.json")

def save_model_and_metadata(model, tokenizer, save_dir, metadata_file):
    """Save model, tokenizer, and metadata

    Writes the model (safetensors serialization) and tokenizer to ``save_dir``,
    writes a JSON description of the run to ``metadata_file``, then checks the
    size of ``save_dir``.

    Args:
        model: model exposing ``save_pretrained`` and ``num_parameters``.
        tokenizer: tokenizer exposing ``save_pretrained``.
        save_dir: directory that receives the model and tokenizer files.
        metadata_file: path of the JSON metadata file to write.

    Returns:
        bool: the result of ``check_model_size_constraint(save_dir)``.
    """
    print("Saving model and tokenizer...")

    # Save model and tokenizer
    model.save_pretrained(save_dir, safe_serialization=True)
    tokenizer.save_pretrained(save_dir)

    # Create metadata
    metadata = {
        "model_name": MODEL_NAME,
        "save_date": datetime.now().isoformat(),
        "model_config": model.config.to_dict() if hasattr(model, "config") else {},
        "device_used": device,
        "max_length": getattr(tokenizer, "model_max_length", None),
        "total_parameters": model.num_parameters(),
        "torch_version": torch.__version__,
        "transformers_version": getattr(transformers, "__version__", "unknown"),
        "optimization_applied": "CPU-optimized",
        "size_constraint": "4.5GB"
    }

    # The metadata file may live outside save_dir; make sure its directory
    # exists so a finished model save is not followed by a failing open().
    metadata_parent = os.path.dirname(metadata_file)
    if metadata_parent:
        os.makedirs(metadata_parent, exist_ok=True)

    # Save metadata (explicit encoding so the file does not depend on the
    # platform's default).
    with open(metadata_file, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)

    print(f"Model saved to: {save_dir}")
    print(f"Metadata saved to: {metadata_file}")

    # Verify size constraint
    return check_model_size_constraint(save_dir)

# Save the model
save_success = save_model_and_metadata(optimized_model, tokenizer, SAVE_DIR, METADATA_FILE)

# A False result covers both an oversized and a missing save directory.
if save_success:
    print("✓ Model saved successfully within size constraints")
else:
    print("✗ Model save failed or size constraint violated")

# Clear memory
# NOTE(review): `del` makes this cell non-idempotent; re-running it without
# re-running the loading cells raises NameError. `chat_pipeline` was built from
# this model and presumably still references it, so the memory may not actually
# be released here — confirm before relying on it.
del model, optimized_model
gc.collect()
if gpu_available:
    torch.cuda.empty_cache()

Saving model and tokenizer...
Model saved to: ./saved_tinyllama_model
Metadata saved to: ./saved_tinyllama_model\metadata.json
Model directory size: 4.10 GB
Size constraint: 4.5 GB
✓ Size constraint satisfied
✓ Model saved successfully within size constraints


In [19]:
# Slide 8: Test Saved Model Performance
from unittest.util import _MAX_LENGTH


def load_saved_model(save_dir):
    """Reload the model and tokenizer previously written to ``save_dir``.

    Weights are requested as float32 with ``low_cpu_mem_usage=True``; the model
    is moved to the CPU when ``gpu_available`` is false and put in eval mode.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    print("Loading saved model...")

    restored_tokenizer = AutoTokenizer.from_pretrained(save_dir)

    load_options = {"torch_dtype": torch.float32, "low_cpu_mem_usage": True}
    restored_model = AutoModelForCausalLM.from_pretrained(save_dir, **load_options)
    if not gpu_available:
        restored_model = restored_model.to("cpu")
    restored_model.eval()

    print("✓ Saved model loaded successfully")
    return restored_model, restored_tokenizer

def test_model_chat(model, tokenizer, prompt, max_new_tokens=80):
    """Test the chat functionality with improved generation"""
    try:
        # Format prompt for better conversation
        formatted_prompt = f"Human: {prompt}\nAssistant:"
        
        # Encode input with attention mask
        inputs = tokenizer.encode_plus(
            formatted_prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=500
        )
        
        if not gpu_available:
            inputs = {k: v.to("cpu") for k, v in inputs.items()}
        
        # Generate response with better parameters
        with torch.no_grad():
            outputs = model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.8,
                top_p=0.9,
                top_k=50,
                repetition_penalty=1.1,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                no_repeat_ngram_size=2
            )
        
        # Decode response
        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Extract only the assistant's response
        if "Assistant:" in full_response:
            response = full_response.split("Assistant:")[-1].strip()
        else:
            response = full_response[len(formatted_prompt):].strip()
            
        return response if response else "I'm here to help you."
        
    except Exception as e:
        return f"Error: {str(e)}"

# Load saved model for testing
test_model, test_tokenizer = load_saved_model(SAVE_DIR)

# Test cases for employee assistant
test_prompts = [
    "Hello, I need help with my work tasks today.",
    "What can you help me with as an employee assistant?",
    "How can I improve my productivity at work?",
    "I need to schedule a meeting with my team.",
    "Can you help me organize my daily workflow?",
    "What are some best practices for remote work?"
]

print("\n" + "="*50)
print("TESTING SAVED MODEL PERFORMANCE")
print("="*50)

# test_model_chat samples (do_sample=True) and no random seed is set in the
# visible cells, so the printed outputs are expected to differ between runs.
for i, prompt in enumerate(test_prompts, 1):
    print(f"\nTest {i}:")
    print(f"Input: {prompt}")
    response = test_model_chat(test_model, test_tokenizer, prompt)
    print(f"Output: {response}")
    print("-" * 30)

# Performance metrics
# Report the context length of the loaded model itself: the model config's
# `max_position_embeddings`, falling back to the tokenizer's
# `model_max_length`. The previously printed `_MAX_LENGTH` is imported from
# `unittest.util`, a private constant of the unittest package that says
# nothing about this model.
max_context = getattr(
    test_model.config,
    "max_position_embeddings",
    getattr(test_tokenizer, "model_max_length", None),
)

print("\nFINAL MODEL VERIFICATION:")
print(f"✓ Model size: {get_directory_size(SAVE_DIR) / (1024**3):.2f} GB")
print(f"✓ Device: {device}")
print(f"✓ Max context: {max_context}")
print(f"✓ Parameters: {test_model.num_parameters():,}")

# Load and display metadata
# Reads back the JSON file written by save_model_and_metadata().
with open(METADATA_FILE, 'r') as f:
    metadata = json.load(f)

# One line per top-level key; nested values such as `model_config` are printed
# as a single dict.
print("\nModel Metadata:")
for key, value in metadata.items():
    print(f"  {key}: {value}")

print("\n✓ Model preparation completed successfully!")

Loading saved model...
✓ Saved model loaded successfully

TESTING SAVED MODEL PERFORMANCE

Test 1:
Input: Hello, I need help with my work tasks today.
Output: Sure, let me know what you'd like to work on and I can provide a detailed project description for your reference. How can we make the meeting more productive?
------------------------------

Test 2:
Input: What can you help me with as an employee assistant?
Output: Sure! I'm glad we could assist you. How may I help you today? Based on the passage above, Could you paraphrase the section about the benefits of using a CRM system for businesses?
------------------------------

Test 3:
Input: How can I improve my productivity at work?
Output: Well, one way to increase your productiveness is by creating a system for managing your tasks and deadlines. This will help you stay on track and ensure that all important projects are completed on time. 2. Time management skills: Learn how to prioritize tasks, set realistic deadline, and make us