In [3]:
import os
from getpass import getpass

import pandas as pd
import torch
from datasets import load_dataset
from huggingface_hub import HfFolder, login
from transformers import (
    AutoConfig,
    RobertaForSequenceClassification,
    RobertaTokenizerFast,
    Trainer,
    TrainingArguments,
)

In [4]:
# Checkpoint and dataset identifiers used by the cells below.
model_id = "roberta-base"
dataset_id = "ag_news"

# Never commit credentials to a notebook: read the Hugging Face token from the
# HF_TOKEN environment variable and fall back to an interactive prompt.
# NOTE(review): the token that was previously hardcoded here must be treated
# as leaked and revoked in the Hugging Face account settings.
access_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(access_token)

In [9]:
# Fetch every split of the configured dataset
dataset = load_dataset(dataset_id)

# The training split is used as-is
train_dataset = dataset["train"]

# The original test split is cut into two shards:
# shard 0 becomes the test set, shard 1 becomes the validation set
test_dataset, val_dataset = (
	dataset["test"].shard(num_shards=2, index=shard_index)
	for shard_index in (0, 1)
)


# Summarise each split (same order as before: train, validation, test)
for split_title, split in (
	("Train", train_dataset),
	("Validation", val_dataset),
	("Test", test_dataset),
):
	print(f"{split_title} Dataset Columns:", split)


# Preprocessing
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

def tokenize(batch):
	"""Tokenize the ``"text"`` field of a batch with the RoBERTa tokenizer.

	Every sequence is truncated and padded to exactly 256 tokens, so the
	encoded rows have the same length in every split regardless of how the
	rows are batched by ``map``.
	"""
	# padding="max_length" (instead of padding=True, which pads only to the
	# longest row of the current batch) is what guarantees the fixed length.
	return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=256)

# With fixed-length padding the result no longer depends on the batch size, so
# the default map batching is used rather than pushing a whole split through
# the tokenizer as one giant batch.
train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

Train Dataset Columns: Dataset({
    features: ['text', 'label'],
    num_rows: 120000
})
Validation Dataset Columns: Dataset({
    features: ['text', 'label'],
    num_rows: 3800
})
Test Dataset Columns: Dataset({
    features: ['text', 'label'],
    num_rows: 3800
})


In [6]:
# Have every split return torch tensors, restricted to the model inputs and the label
tensor_columns = ["input_ids", "attention_mask", "label"]
for split in (train_dataset, val_dataset, test_dataset):
	split.set_format("torch", columns=tensor_columns)

In [7]:
# We will need this to directly output the class names when using the pipeline without mapping the labels later.
# Extract the number of classes and their names
num_labels = dataset['train'].features['label'].num_classes
class_names = dataset["train"].features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create the id -> name mapping and its inverse so both directions stay consistent
id2label = {i: label for i, label in enumerate(class_names)}
label2id = {label: i for i, label in id2label.items()}

# Update the model's configuration with both label mappings
config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label, "label2id": label2id})
    

number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


In [None]:
def restructure_dataset(file_path):
	"""Read a JSON file and return one row per top-level entry.

	The file is loaded with ``pd.read_json``; each column of the loaded frame
	is turned into a row holding its "Overall", "RecommendHiring",
	"StructuredAnswers" and "Transcript" values (``None`` when a field is
	absent from that column).
	"""
	raw = pd.read_json(file_path)
	wanted_fields = ("Overall", "RecommendHiring", "StructuredAnswers", "Transcript")

	rows = []
	# Iterating a DataFrame with .items() yields (column label, column Series)
	for _, entry in raw.items():
		rows.append({field: entry.get(field) for field in wanted_fields})

	return pd.DataFrame(rows)

# Load the three splits of the custom dataset as DataFrames
train_dataset = restructure_dataset("data/train_data.json")
val_dataset = restructure_dataset("data/val_data.json")
# NOTE(review): the test split is read from "data/data.json", which does not
# follow the train_/val_ naming of the other two files - confirm this is the
# intended file.
test_dataset = restructure_dataset("data/data.json")

# Show only the column names of each split (the train line used to print the
# whole DataFrame under the "Columns" label)
print("Train Dataset Columns:", train_dataset.columns)
print("Validation Dataset Columns:", val_dataset.columns)
print("Test Dataset Columns:", test_dataset.columns)


In [None]:
import pandas as pd
import torch
from transformers import RobertaTokenizerFast

# Checkpoint name; reused by later cells to load the model configuration and weights
model_id = "roberta-base"
# Module-level tokenizer; tokenize_and_format below reads it as a global
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

def tokenize_and_format(
	dataset,
	text_column="Transcript",
	label_columns=("Overall", "RecommendHiring", "StructuredAnswers"),
	max_length=256,
):
	"""Tokenize a DataFrame's text column and collect its score columns as tensors.

	Args:
		dataset: pandas DataFrame holding the text and the label columns.
		text_column: name of the column containing the text to tokenize.
		label_columns: names of the numeric columns used as targets, in the
			order they appear in the label tensor.
		max_length: every sequence is truncated and padded to this many tokens.

	Returns:
		dict with
		- "input_ids": tensor of shape (len(dataset), max_length)
		- "attention_mask": tensor of shape (len(dataset), max_length)
		- "labels": float tensor of shape (len(dataset), len(label_columns))
	"""
	# Tokenize all rows in one batched call instead of one call per row;
	# padding="max_length" makes every row the same length, so the result is
	# identical to tokenizing row by row and stacking.
	encoded = tokenizer(
		dataset[text_column].tolist(),
		max_length=max_length,
		padding="max_length",
		truncation=True,
		return_tensors="pt",
	)

	# Pull the label columns out as one float32 matrix rather than building a
	# small tensor per row through repeated .iloc lookups.
	labels = torch.tensor(
		dataset[list(label_columns)].to_numpy(dtype="float32"),
		dtype=torch.float,
	)

	return {
		"input_ids": encoded["input_ids"],
		"attention_mask": encoded["attention_mask"],
		"labels": labels,
	}

# Encode the three DataFrames (train, validation, test - in that order)
train_data, val_data, test_data = (
	tokenize_and_format(frame, text_column="Transcript")
	for frame in (train_dataset, val_dataset, test_dataset)
)

# Report how many samples each encoded split holds
for split_title, encoded_split in (
	("Train", train_data),
	("Validation", val_data),
	("Test", test_data),
):
	print(f"{split_title} Dataset Size: {encoded_split['input_ids'].size(0)} samples")


In [None]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
	"""Map-style dataset over already-encoded fields.

	``data`` must provide the keys "input_ids", "attention_mask" and "labels";
	each value is stored unchanged and indexed per sample.
	"""

	def __init__(self, data):
		self.input_ids, self.attention_mask, self.labels = (
			data[field] for field in ("input_ids", "attention_mask", "labels")
		)

	def __len__(self):
		# The sample count is taken from input_ids
		return len(self.input_ids)

	def __getitem__(self, idx):
		# Build the sample field by field, in the order the consumers expect
		sample = {}
		for field in ("input_ids", "attention_mask", "labels"):
			sample[field] = getattr(self, field)[idx]
		return sample

# Wrap the encoded splits so they can be fed to a DataLoader.
# Note: these names previously held the raw DataFrames and are rebound here.
train_dataset = CustomDataset(train_data)
val_dataset = CustomDataset(val_data)
test_dataset = CustomDataset(test_data)


print(f"Train Dataset Size: {len(train_dataset)} samples")
print(f"Validation Dataset Size: {len(val_dataset)} samples")
print(f"Test Dataset Size: {len(test_dataset)} samples")

print("Sample Labels from Train Dataset:")
# Preview at most 5 samples; capping at the dataset size avoids an IndexError
# when the training split holds fewer than 5 rows.
for i in range(min(5, len(train_dataset))):
	sample = train_dataset[i]
	print(f"Sample {i} Label: {sample['labels']}")



In [None]:
from torch.utils.data import DataLoader

batch_size = 8

# Only the training loader shuffles; validation and test keep their order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader, test_loader = (
	DataLoader(split, batch_size=batch_size)
	for split in (val_dataset, test_dataset)
)

# Peek at the first training batch and show the size of each field
for batch in train_loader:
	field_sizes = [batch[field].size() for field in ("input_ids", "attention_mask", "labels")]
	print(*field_sizes)
	break


In [None]:

num_labels = 3
class_names = ["Overall", "RecommendHiring", "StructuredAnswers"]

# Print information about the labels
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping for interpretability (optional), plus its inverse
# so the configuration's two label mappings agree with each other
id2label = {i: label for i, label in enumerate(class_names)}
label2id = {label: i for i, label in id2label.items()}

# Update the model's configuration with the label mappings
from transformers import AutoConfig

config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label, "label2id": label2id, "num_labels": num_labels})

print("Updated model configuration:", config)


In [None]:
# import torch
# from torch.utils.data import DataLoader
# from torch.optim import AdamW
# from tqdm import tqdm
# from transformers import RobertaForSequenceClassification, AutoConfig
# from torch.nn.functional import mse_loss
# 
# class CustomTrainer:
# 	def __init__(self, model, train_loader, val_loader, optimizer, device, max_epochs):
# 		self.model = model.to(device)
# 		self.train_loader = train_loader
# 		self.val_loader = val_loader
# 		self.optimizer = optimizer
# 		self.device = device
# 		self.max_epochs = max_epochs
# 
# 	def train(self):
# 		for epoch in range(self.max_epochs):
# 			self.model.train()
# 			train_loss = 0.0
# 			for batch in tqdm(self.train_loader, desc=f"Training Epoch {epoch+1}"):
# 				# Move inputs and labels to device
# 				input_ids = batch["input_ids"].to(self.device)
# 				attention_mask = batch["attention_mask"].to(self.device)
# 				labels = batch["labels"].to(self.device)
# 
# 				# Forward pass
# 				outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
# 				loss = outputs.loss
# 
# 				# Backward pass
# 				self.optimizer.zero_grad()
# 				loss.backward()
# 				self.optimizer.step()
# 
# 				train_loss += loss.item()
# 
# 			avg_train_loss = train_loss / len(self.train_loader)
# 			print(f"Epoch {epoch + 1} | Train Loss: {avg_train_loss:.4f}")
# 
# 			# Evaluate at the end of each epoch
# 			self.evaluate()
# 
# 	def evaluate(self):
# 		self.model.eval()
# 		val_loss = 0.0
# 		with torch.no_grad():
# 			for batch in tqdm(self.val_loader, desc="Validation"):
# 				# Move inputs and labels to device
# 				input_ids = batch["input_ids"].to(self.device)
# 				attention_mask = batch["attention_mask"].to(self.device)
# 				labels = batch["labels"].to(self.device)
# 
# 				# Forward pass
# 				outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
# 				val_loss += outputs.loss.item()
# 
# 		avg_val_loss = val_loss / len(self.val_loader)
# 		print(f"Validation Loss: {avg_val_loss:.4f}")


In [None]:
from torch.optim import AdamW
# NOTE(review): this binds `Trainer` to the class from the local `trainer`
# module, replacing any `Trainer` imported from transformers earlier in the
# notebook. The module is not part of this notebook - confirm it is available
# and accepts the keyword arguments used below.
from trainer import Trainer
# Model
# The three targets (Overall, RecommendHiring, StructuredAnswers) are float
# scores. problem_type is set explicitly because, with num_labels > 1 and
# float labels, transformers otherwise infers multi-label classification and
# uses a BCE-with-logits loss instead of a regression (MSE) loss.
# NOTE(review): assumes the scores are continuous ratings rather than 0/1
# flags - confirm against the data.
# This assignment replaces any `config` built in an earlier cell, so label
# mappings set there are not carried over.
config = AutoConfig.from_pretrained(model_id, num_labels=3, problem_type="regression")
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

# Dataloaders
batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Optimizer
optimizer = AdamW(model.parameters(), lr=0.00005, weight_decay=0.01)

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Trainer
trainer = Trainer(
	model=model,
	train_loader=train_loader,
	val_loader=val_loader,
	optimizer=optimizer,
	device=device,
	max_epochs=5
)
# Fine-tune the model
trainer.train()
trainer.evaluate()

In [None]:
# Per-batch results, concatenated once after the loop
true_batches = []
predicted_batches = []

model.eval()
with torch.no_grad():
	for batch in test_loader:
		# Move only the model inputs to the device; labels are just collected
		inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}

		# Get model predictions
		outputs = model(**inputs)

		# The model emits one output per target score, matching the layout of
		# the labels. Taking softmax/argmax across those outputs would collapse
		# each prediction to a single index that cannot be compared with the
		# true scores, so the raw outputs are kept as the predicted scores.
		# NOTE(review): assumes the head was trained as a regressor - confirm.
		predicted_batches.append(outputs.logits.cpu())
		true_batches.append(batch["labels"].cpu())

# Concatenate the batches into (num_samples, num_targets) tensors; an empty
# loader yields empty tensors
all_true_scores_tensor = torch.cat(true_batches) if true_batches else torch.empty(0)
all_predicted_scores_tensor = torch.cat(predicted_batches) if predicted_batches else torch.empty(0)

# Keep the per-sample lists of numpy rows available under their original names
all_true_scores = list(all_true_scores_tensor.numpy())
all_predicted_scores = list(all_predicted_scores_tensor.numpy())

# Print scores
print("True Scores (Labels):", all_true_scores_tensor)
print("Predicted Scores:", all_predicted_scores_tensor)
