# SELM Experiment 1

In this notebook, we run an initial experiment using SELM: we load the training configuration and dataset, fine-tune a sequence-classification model, evaluate it on the test split, and save the trained model and tokenizer.

## 1. Setup
Import the required libraries and load the training configuration from the YAML file.

In [1]:
import torch
import yaml
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load configuration
def load_config(config_path):
    """Read a YAML configuration file and return its parsed contents.

    Args:
        config_path: Path to the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load`` for the file (a dict when
        the document is a mapping; ``None`` when the file is empty).

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)

config = load_config('../config/training_config.yaml')

# Validate every key the notebook reads up front, so a typo in the YAML fails
# here rather than after the dataset and model have already been loaded.
if not isinstance(config, dict):
    raise TypeError(f"Expected the training config to be a mapping, got {type(config).__name__}")

required_keys = (
    'model_name',
    'dataset_name',
    'dataset_config_name',
    'max_seq_length',
    'batch_size',
    'num_epochs',
    'num_labels',      # read when the model is instantiated
    'learning_rate',   # read when the optimizer is created
)
missing_keys = [key for key in required_keys if key not in config]
if missing_keys:
    raise KeyError(f"Training config is missing required keys: {missing_keys}")

model_name = config['model_name']
dataset_name = config['dataset_name']
dataset_config_name = config['dataset_config_name']
max_seq_length = config['max_seq_length']
batch_size = config['batch_size']
num_epochs = config['num_epochs']

# Seed torch's global RNG so the shuffled batch order and any randomly
# initialised weights are repeatable across Restart & Run All. The optional
# 'seed' config key overrides the default.
seed = config.get('seed', 42)
torch.manual_seed(seed)

# Print configuration
print("Configuration:")
print(config)


## 2. Load Data
Load and preprocess the dataset.

In [2]:
# Load tokenizer and dataset
tokenizer = AutoTokenizer.from_pretrained(model_name)
dataset = load_dataset(dataset_name, dataset_config_name)

# Tokenize dataset
def tokenize_fn(examples):
    """Tokenize the 'sentence' field of a batch of examples.

    Every sequence is padded or truncated to ``max_seq_length`` so all rows
    have the same length.
    """
    return tokenizer(examples['sentence'], padding='max_length', truncation=True, max_length=max_seq_length)

tokenized_dataset = dataset.map(tokenize_fn, batched=True)

# The training and evaluation loops read batch['labels']. If the dataset
# exposes its targets as 'label' instead, rename the column so that lookup
# succeeds.
split_columns = tokenized_dataset['train'].column_names
if 'labels' not in split_columns and 'label' in split_columns:
    tokenized_dataset = tokenized_dataset.rename_column('label', 'labels')

# Return torch tensors for the model inputs only. Without this, the default
# DataLoader collation receives Python lists (plus the raw text column) and
# batch['input_ids'] is not a tensor, so `.to(device)` fails.
tensor_columns = [
    name
    for name in ('input_ids', 'attention_mask', 'labels')
    if name in tokenized_dataset['train'].column_names
]
tokenized_dataset.set_format(type='torch', columns=tensor_columns)

train_dataset = tokenized_dataset['train']
# NOTE(review): some benchmark datasets ship a 'test' split without gold
# labels -- confirm the configured dataset's test split is labelled, otherwise
# evaluate on the validation split instead.
test_dataset = tokenized_dataset['test']

# Create dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)


## 3. Train the Model
Define the model, optimizer, and training loop.

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=config['num_labels'])
model.to(device)

# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'])
criterion = torch.nn.CrossEntropyLoss()

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        # Sequences are padded to max_length during tokenization; the mask
        # tells the model which positions are padding so they are ignored.
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch.
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_dataloader)}')


## 4. Evaluate the Model
Evaluate the model's performance on the test set.

In [4]:
# Evaluation
model.eval()  # switch the model to evaluation mode
all_preds = []
all_labels = []

# No gradients are needed for inference.
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        # Pass the attention mask so padded positions are ignored, matching
        # how the padded inputs should be interpreted by the model.
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit per example.
        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

accuracy = accuracy_score(all_labels, all_preds)
print(f'Test Accuracy: {accuracy:.4f}')


## 5. Save the Model
Save the trained model and tokenizer for future use.

In [5]:
# Write the model weights/config and the tokenizer files side by side so the
# directory can be loaded back as a single checkpoint.
output_dir = 'model_output/trained_model/'
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)
print(f'Model and tokenizer saved to {output_dir}')
