In [None]:
#cell 1
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from torch.optim import Adam


In [None]:
#cell 2
# Load data
# NOTE(review): absolute, machine-specific path; point it at your own copy of
# the workbook before running this notebook elsewhere.
TRAINING_DATA_PATH = '/Users/PremGanesh/Developer/AI/CyVidia/Input_Data/Training Dataset.xlsx'
SEED = 42  # fixed seed so the train/validation split is reproducible

TEXT_COL = 'Requirement Description'
AREA_COL = 'Requirement Area (NIST)'
BUCKET_COL = 'Requirement Bucket(NIST)'

df = pd.read_excel(TRAINING_DATA_PATH)

# Preprocess data
# Rows without a description or without either label cannot be used for
# supervised training; dropping them also avoids calling .strip() on NaN.
df = df.dropna(subset=[TEXT_COL, AREA_COL, BUCKET_COL]).reset_index(drop=True)
# astype(str) guards against non-string cells (e.g. numbers) in the sheet.
df[TEXT_COL] = df[TEXT_COL].astype(str).str.strip()

# Encode labels (one independent encoder per target column)
area_encoder = LabelEncoder()
bucket_encoder = LabelEncoder()

df['Area_Encoded'] = area_encoder.fit_transform(df[AREA_COL])
df['Bucket_Encoded'] = bucket_encoder.fit_transform(df[BUCKET_COL])

# Split data (80% train / 20% validation)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df[TEXT_COL], df[['Area_Encoded', 'Bucket_Encoded']],
    test_size=0.2,
    random_state=SEED,
)

# train_test_split shuffles rows but keeps their original index labels.
# Renumber every split 0..n-1 so positional and label-based lookups agree.
train_texts = train_texts.reset_index(drop=True)
val_texts = val_texts.reset_index(drop=True)
train_labels = train_labels.reset_index(drop=True)
val_labels = val_labels.reset_index(drop=True)


In [None]:
#cell 3

# Tokenizer for the pretrained checkpoint named below.
TOKENIZER_CHECKPOINT = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

class RequirementDataset(Dataset):
    """Torch dataset that pairs each requirement text with its label row.

    Samples are always addressed by *position* (0..len-1), so pandas objects
    with a shuffled or non-contiguous index (e.g. the output of
    ``train_test_split``) are handled correctly.

    Args:
        texts: Requirement descriptions; a pandas Series or any positionally
            indexable sequence (list, tuple, array).
        labels: One row of label values per sample; a pandas DataFrame/Series
            or any positionally indexable sequence of rows.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length passed to the tokenizer as ``max_length``; sequences
            are padded/truncated to it by the tokenizer.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    @staticmethod
    def _row_at(container, position):
        """Return the item at ``position``, ignoring any pandas index labels."""
        if hasattr(container, 'iloc'):
            return container.iloc[position]
        return container[position]

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return it with its labels.

        Returns:
            dict with keys ``'text'`` (str), ``'input_ids'`` and
            ``'attention_mask'`` (flattened 1-D tensors from the tokenizer)
            and ``'labels'`` (``torch.long`` tensor of the row's label values).
        """
        # Positional access for BOTH texts and labels keeps them aligned;
        # `series[idx]` would look idx up as an index *label* instead.
        text = str(self._row_at(self.texts, idx))
        label_row = self._row_at(self.labels, idx)
        labels = label_row.tolist() if hasattr(label_row, 'tolist') else list(label_row)

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'text': text,
            # return_tensors='pt' yields a leading batch dimension; drop it so
            # the DataLoader can stack samples itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

# Wrap each split in a dataset that shares the same tokenizer.
train_dataset = RequirementDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
)
val_dataset = RequirementDataset(
    texts=val_texts,
    labels=val_labels,
    tokenizer=tokenizer,
)


In [None]:
# cell 4: Modify the RequirementModel class
class RequirementModel(nn.Module):
    """RoBERTa encoder feeding two independent linear classification heads.

    Args:
        num_area_labels: Number of output classes for the area head.
        num_bucket_labels: Number of output classes for the bucket head.
    """

    def __init__(self, num_area_labels, num_bucket_labels):
        super(RequirementModel, self).__init__()
        self.roberta = RobertaForSequenceClassification.from_pretrained('roberta-base', return_dict=True)
        hidden_size = self.roberta.config.hidden_size
        self.area_classifier = nn.Linear(hidden_size, num_area_labels)
        self.bucket_classifier = nn.Linear(hidden_size, num_bucket_labels)

    def forward(self, input_ids, attention_mask):
        """Return ``(area_logits, bucket_logits)`` for a batch of token ids.

        Args:
            input_ids: Token id tensor for the batch.
            attention_mask: Mask tensor matching ``input_ids``.

        Returns:
            Tuple of two tensors: the area head's logits and the bucket head's
            logits, both computed from the same sentence representation.
        """
        # The sequence-classification wrapper only returns its own head's
        # logits (no `pooler_output`), so call the wrapped encoder directly.
        encoder_outputs = self.roberta.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Hidden state of the first token (<s>) serves as the sentence vector.
        pooled_output = encoder_outputs.last_hidden_state[:, 0]
        area_logits = self.area_classifier(pooled_output)
        bucket_logits = self.bucket_classifier(pooled_output)
        return area_logits, bucket_logits

# No earlier cell defines `device`, so pick it here: use the GPU when CUDA is
# available and fall back to the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One output unit per class learned by each label encoder.
model = RequirementModel(len(area_encoder.classes_), len(bucket_encoder.classes_))
model = model.to(device)