In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np

# Define our dataset class
class YouTubeTitleDataset(Dataset):
    """Map-style dataset that tokenizes one video title per item.

    Args:
        titles: Sized iterable of titles; each entry is converted with
            ``str()`` before tokenization.
        labels: Sized iterable of integer class labels, aligned with
            ``titles`` by position.
        tokenizer: Object exposing ``encode_plus`` with the keyword
            arguments used in ``__getitem__`` (e.g. a ``BertTokenizer``).
        max_length: Length passed to the tokenizer for padding/truncation.

    Raises:
        ValueError: If ``titles`` and ``labels`` have different lengths.
    """

    def __init__(self, titles, labels, tokenizer, max_length=128):
        # Materialize as lists so that ``[idx]`` is always positional.
        # A pandas Series with a non-default index (e.g. straight out of
        # train_test_split) would otherwise be indexed by label.
        self.titles = list(titles)
        self.labels = list(labels)
        if len(self.titles) != len(self.labels):
            raise ValueError(
                f"titles and labels must have the same length, "
                f"got {len(self.titles)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of titles."""
        return len(self.titles)

    def __getitem__(self, idx):
        """Tokenize the title at position ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (flattened to
            1-D tensors) and ``labels`` (a ``torch.long`` tensor).
        """
        title = str(self.titles[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            title,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch axis of size 1;
        # flatten it away so the DataLoader can stack items itself.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Placeholder corpus - swap in a real labelled dataset before serious use.
# Label convention: 1 = informative, 0 = entertainment.
sample_data = dict(
    title=[
        "How to Build a Neural Network from Scratch",
        "Funny Cat Videos Compilation 2024",
        "Understanding Quantum Physics Explained Simply",
        "Epic Fail Moments in Sports",
        # Add more examples...
    ],
    label=[1, 0, 1, 0],
)

# Tabular view of the examples
df = pd.DataFrame(data=sample_data)

# Hold out 20% of the rows for validation; the seed is fixed so the
# split is the same on every run.
(train_texts, val_texts,
 train_labels, val_labels) = train_test_split(
    df['title'].values,
    df['label'].values,
    test_size=0.2,
    random_state=42,
)

# Tokenizer and two-class classifier are loaded from the same checkpoint
tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-tiny')
model = BertForSequenceClassification.from_pretrained(
    'prajjwal1/bert-tiny', num_labels=2
)

# Wrap each split in the tokenizing dataset
train_dataset = YouTubeTitleDataset(train_texts, train_labels, tokenizer=tokenizer)
val_dataset = YouTubeTitleDataset(val_texts, val_labels, tokenizer=tokenizer)

# Batch the datasets; only the training batches are shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=8)

# Use the GPU when one is visible, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)

# Training loop
def _batch_tensors(batch, target_device):
    """Return (input_ids, attention_mask, labels) from ``batch``, moved to ``target_device`` if given."""
    tensors = (batch['input_ids'], batch['attention_mask'], batch['labels'])
    if target_device is None:
        return tensors
    return tuple(t.to(target_device) for t in tensors)


def _evaluate(model, loader, target_device):
    """Return (mean loss, accuracy) of ``model`` over ``loader``, or (None, None) if it yields no batches."""
    was_training = model.training
    model.eval()
    total_loss = 0.0
    num_batches = 0
    correct = 0
    seen = 0
    with torch.no_grad():
        for batch in loader:
            input_ids, attention_mask, labels = _batch_tensors(batch, target_device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            total_loss += outputs.loss.item()
            num_batches += 1
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            seen += labels.size(0)
    # Restore whatever mode the caller had the model in.
    model.train(was_training)
    if num_batches == 0:
        return None, None
    accuracy = correct / seen if seen else float('nan')
    return total_loss / num_batches, accuracy


def train_model(model, train_loader, val_loader, epochs=3, optimizer=None):
    """Train ``model`` for ``epochs`` passes and report per-epoch metrics.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        train_loader: Iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'labels' tensors.
        val_loader: Iterable of batches in the same format, evaluated after
            every epoch. Pass ``None`` to skip validation.
        epochs: Number of passes over ``train_loader``.
        optimizer: Optimizer to step. When omitted, the module-level
            ``optimizer`` is used, which is what this function always did
            before the parameter existed.

    Returns:
        A list with one dict per epoch containing 'epoch', 'train_loss',
        'val_loss' and 'val_accuracy'. The validation entries are ``None``
        when validation was skipped or ``val_loader`` yielded no batches;
        'train_loss' is ``nan`` when ``train_loader`` yielded no batches.
    """
    # The parameter shadows the module-level name, so the fallback has to
    # go through globals().
    opt = optimizer if optimizer is not None else globals()["optimizer"]

    # Send batches to wherever the model's weights live instead of relying
    # on a module-level ``device``.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    history = []
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        num_batches = 0
        for batch in train_loader:
            opt.zero_grad()

            input_ids, attention_mask, labels = _batch_tensors(batch, target_device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            loss.backward()
            opt.step()

            train_loss += loss.item()
            num_batches += 1

        # Count batches ourselves: avoids ZeroDivisionError on an empty
        # loader and does not require the loader to support len().
        mean_train_loss = train_loss / num_batches if num_batches else float('nan')
        print(f"Epoch {epoch+1}, Loss: {mean_train_loss}")

        val_loss = None
        val_accuracy = None
        if val_loader is not None:
            val_loss, val_accuracy = _evaluate(model, val_loader, target_device)
            if val_loss is not None:
                print(
                    f"Epoch {epoch+1}, Val Loss: {val_loss}, "
                    f"Val Accuracy: {val_accuracy}"
                )

        history.append({
            'epoch': epoch + 1,
            'train_loss': mean_train_loss,
            'val_loss': val_loss,
            'val_accuracy': val_accuracy,
        })

    return history

# Function to predict on new titles
def predict_title(title, model, tokenizer, max_length=128):
    """Classify a single title as "Informative" or "Entertainment".

    Args:
        title: Title to classify; converted with ``str()`` before
            tokenization, matching the dataset class.
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``. It is switched to eval mode.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_length: Length passed to the tokenizer for padding/truncation.

    Returns:
        "Informative" when the highest-scoring class index is 1,
        otherwise "Entertainment".
    """
    model.eval()
    encoding = tokenizer.encode_plus(
        str(title),
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    # Run on whatever device the model's weights are on rather than
    # depending on a module-level ``device``.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)
        attention_mask = attention_mask.to(first_param.device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        prediction = torch.argmax(outputs.logits, dim=1)

    return "Informative" if prediction.item() == 1 else "Entertainment"

# Fine-tune the classifier on the prepared loaders
train_model(model=model, train_loader=train_loader, val_loader=val_loader)

# Classify a title the model has not seen and report the label
test_title = "Learn Python in 10 Minutes"
result = predict_title(title=test_title, model=model, tokenizer=tokenizer)
print(f"The video '{test_title}' is predicted to be: {result}")

ImportError: dlopen(/Users/tony/opt/anaconda3/envs/modern_nlp/lib/python3.10/site-packages/torch/_C.cpython-310-darwin.so, 0x0002): Library not loaded: @loader_path/libtorch_cpu.dylib
  Referenced from: <4A386D4D-2A15-3421-BF80-DA1254332C25> /Users/tony/opt/anaconda3/envs/modern_nlp/lib/python3.10/site-packages/torch/lib/libtorch_python.dylib
  Reason: tried: '/Users/tony/opt/anaconda3/envs/modern_nlp/lib/python3.10/site-packages/torch/lib/libtorch_cpu.dylib' (no such file), '/usr/local/lib/libtorch_cpu.dylib' (no such file), '/usr/lib/libtorch_cpu.dylib' (no such file, not in dyld cache)