# Pipeline Exploration Notebook

This notebook provides interactive examples for exploring Hugging Face pipelines.

In [None]:
import sys
sys.path.append('..')

from transformers import pipeline, logging
import torch
from src.config import get_device, DEFAULT_SENTIMENT_MODEL

## 1. Basic Pipeline Usage

In [None]:
# Resolve the compute device once and report it so the cell output shows
# where inference will run.
device = get_device()
print(f"Using device: {device}")

# The pipeline is given an integer device index: 0 when the configured
# device compares equal to 'cuda', otherwise -1 (CPU in the transformers
# pipeline API).
# NOTE(review): assumes get_device() returns a plain string such as 'cuda'
# or 'cpu' -- confirm against src.config.
if device == 'cuda':
    device_index = 0
else:
    device_index = -1

# Build the default sentiment classifier used by the cells below.
clf = pipeline(
    'sentiment-analysis',
    model=DEFAULT_SENTIMENT_MODEL,
    device=device_index,
)

In [None]:
# Smoke-test the classifier on one clearly positive, one clearly negative
# and one lukewarm sentence.
texts = [
    "I love this product!",
    "This is terrible.",
    "It's okay, not great.",
]

# Calling the pipeline with the whole list classifies every sentence in one
# call; the outputs are paired back with their inputs for display.
results = clf(texts)
for sentence, prediction in zip(texts, results):
    print(f"{sentence}: {prediction}")

## 2. Pipeline Internals

In [None]:
# Show the two main components the pipeline object exposes: the underlying
# model and the tokenizer.
print("Model architecture:", clf.model, sep="\n")

# Pull the tokenizer out once, then report the attributes of interest.
pipeline_tokenizer = clf.tokenizer
print("\nTokenizer info:")
print(f"Vocab size: {pipeline_tokenizer.vocab_size}")
print(f"Max length: {pipeline_tokenizer.model_max_length}")

## 3. Custom Pipeline Creation

In [None]:
from src.custom_pipelines import CustomSentimentPipeline
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the model and tokenizer explicitly (instead of by task name) so they
# can be handed to the project's custom pipeline class.
model = AutoModelForSequenceClassification.from_pretrained(DEFAULT_SENTIMENT_MODEL)
tokenizer = AutoTokenizer.from_pretrained(DEFAULT_SENTIMENT_MODEL)

# Same device convention as the basic pipeline above: index 0 when the
# configured device equals 'cuda', otherwise -1.
custom_device_index = 0 if device == 'cuda' else -1
custom_pipe = CustomSentimentPipeline(
    model=model,
    tokenizer=tokenizer,
    device=custom_device_index,
)

# Deliberately untidy inputs: HTML tags, repeated punctuation and
# surrounding whitespace.
# NOTE(review): presumably CustomSentimentPipeline cleans these before
# classification -- confirm in src.custom_pipelines.
messy_texts = [
    "<p>AMAZING PRODUCT!!!</p>",
    "terrible... just terrible!!!!!!",
    "   Good value   ",
]

# Print every raw input next to what the custom pipeline returned for it.
custom_results = custom_pipe(messy_texts)
for raw_text, outcome in zip(messy_texts, custom_results):
    print(f"\nInput: {raw_text}")
    print(f"Result: {outcome}")

## 4. Performance Comparison

In [None]:
import time

# Generate test data: 100 copies of one short sentence, so every batch has
# uniform length and only the batch size varies between runs.
test_texts = ["This is a test sentence."] * 100

# Test different batch sizes
batch_sizes = [1, 8, 16, 32]

# NOTE(review): each batch size is timed once with no separate warm-up run,
# so the figures are a rough comparison, not a rigorous benchmark.
for batch_size in batch_sizes:
    # perf_counter() is the clock intended for measuring short durations:
    # it is monotonic and uses the highest available resolution, whereas
    # time.time() follows the wall clock and can jump if the system time is
    # adjusted mid-measurement.
    start = time.perf_counter()
    _ = clf(test_texts, batch_size=batch_size)
    end = time.perf_counter()

    # Throughput is the number of sentences processed per elapsed second.
    throughput = len(test_texts) / (end - start)
    print(f"Batch size {batch_size}: {throughput:.1f} samples/sec")