In [4]:
# cell 1: Imports and Setup
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from torch.optim import Adam

# cell 2: Load and Preprocess Data
df = pd.read_excel('/Users/PremGanesh/Developer/AI/CyVidia/Input_Data/Training Dataset.xlsx')
df['Requirement Description'] = df['Requirement Description'].apply(lambda x: x.strip())

area_encoder = LabelEncoder()
bucket_encoder = LabelEncoder()

df['Area_Encoded'] = area_encoder.fit_transform(df['Requirement Area (NIST)'])
df['Bucket_Encoded'] = bucket_encoder.fit_transform(df['Requirement Bucket(NIST)'])

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Requirement Description'], df[['Area_Encoded', 'Bucket_Encoded']], test_size=0.2
)

# cell 3: Tokenization and Dataset Preparation
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

class RequirementDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        labels = self.labels.iloc[idx].tolist()

        encoding = self.tokenizer.encode_plus(
            text, add_special_tokens=True, max_length=self.max_len,
            return_token_type_ids=False, padding='max_length',
            return_attention_mask=True, return_tensors='pt', truncation=True
        )

        return {
            'text': text, 'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

train_dataset = RequirementDataset(train_texts, train_labels, tokenizer)
val_dataset = RequirementDataset(val_texts, val_labels, tokenizer)

# cell 4: Define the Custom RoBERTa Model
class RequirementModel(nn.Module):
    def __init__(self, num_area_labels, num_bucket_labels):
        super(RequirementModel, self).__init__()
        self.roberta = RobertaForSequenceClassification.from_pretrained('roberta-base', return_dict=True)
        self.area_classifier = nn.Linear(self.roberta.config.hidden_size, num_area_labels)
        self.bucket_classifier = nn.Linear(self.roberta.config.hidden_size, num_bucket_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        area_logits = self.area_classifier(pooled_output)
        bucket_logits = self.bucket_classifier(pooled_output)
        return area_logits, bucket_logits

model = RequirementModel(len(area_encoder.classes_), len(bucket_encoder.classes_))

# cell 5: Training Function
def train_epoch(model, data_loader, optimizer, device):
    model.train()
    total_loss = 0
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        optimizer.zero_grad()
        area_logits, bucket_logits = model(input_ids=input_ids, attention_mask=attention_mask)
        area_loss = nn.CrossEntropyLoss()(area_logits, labels[:, 0])
        bucket_loss = nn.CrossEntropyLoss()(bucket_logits, labels[:, 1])
        loss = area_loss + bucket_loss
        total_loss += loss.item()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

    return total_loss / len(data_loader)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = Adam(model.parameters(), lr=2e-5)

train_data_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_data_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

num_epochs = 3
for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')
    print('-' * 10)

    train_loss = train_epoch(model, train_data_loader, optimizer, device)
    print(f'Train loss {train_loss}')

# cell 6: Validation Function
def eval_model(model, data_loader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            area_logits, bucket_logits = model(input_ids=input_ids, attention_mask=attention_mask)
            area_loss = nn.CrossEntropyLoss()(area_logits, labels[:, 0])
            bucket_loss = nn.CrossEntropyLoss()(bucket_logits, labels[:, 1])
            loss = area_loss + bucket_loss
            total_loss += loss.item()

    return total_loss / len(data_loader)

for epoch in range(num_epochs):
    val_loss = eval_model(model, val_data_loader, device)
    print(f'Validation loss {val_loss}')


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
----------


KeyError: 1438