In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from torchcrf import CRF
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# Set device to CUDA if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Define model parameters
MODEL_NAME = "bert-base-uncased"
EPOCHS = 3
BATCH_SIZE = 8
LABEL_COLUMNS = ["structure_focus", "usecase_focus"]

# Load the data
data = pd.read_csv('type_classification-validation.csv')

# Encode labels
mlb = MultiLabelBinarizer()
data[LABEL_COLUMNS] = mlb.fit_transform(data[LABEL_COLUMNS].apply(tuple, axis=1)).tolist()

# Split data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['sentence'].tolist(), data[LABEL_COLUMNS].tolist(), test_size=0.3, random_state=42
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenize the data
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
        self.labels = torch.tensor(labels, dtype=torch.float)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

train_dataset = TextDataset(train_texts, train_labels, tokenizer)
val_dataset = TextDataset(val_texts, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Load pre-trained model
num_labels = len(LABEL_COLUMNS)
class BertMultiLabelCRFModel(torch.nn.Module):
    def __init__(self, model_name, num_labels):
        super(BertMultiLabelCRFModel, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.hidden_size = self.bert.config.hidden_size
        self.fc = torch.nn.Linear(self.hidden_size, num_labels)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.fc(outputs.last_hidden_state[:, 0, :])  # Take [CLS] token representation
        return logits

model = BertMultiLabelCRFModel(MODEL_NAME, num_labels)
model.to(device)

# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
criterion = torch.nn.BCEWithLogitsLoss()

# Training loop
def train_model(model, train_loader, optimizer, criterion, epochs=EPOCHS):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
        for batch in progress_bar:
            optimizer.zero_grad()
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
            labels = batch['labels'].to(device)
            logits = model(**inputs)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            progress_bar.set_postfix(loss=total_loss / (progress_bar.n + 1))
        print(f"Epoch {epoch+1} Loss: {total_loss/len(train_loader):.4f}")

# Train model
train_model(model, train_loader, optimizer, criterion, epochs=EPOCHS)

# Save model
torch.save(model.state_dict(), "./fine_tuned_bert_multilabel.pth")

def evaluate_model(model, val_loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
            labels = batch['labels'].cpu().numpy()
            logits = model(**inputs)
            preds = torch.sigmoid(logits).cpu().numpy() > 0.5
            all_preds.extend(preds)
            all_labels.extend(labels)
    return all_labels, all_preds

# Get predictions
y_true, y_pred = evaluate_model(model, val_loader)

# Generate evaluation metrics
class_names = LABEL_COLUMNS
report = classification_report(y_true, y_pred, target_names=class_names)
print(report)

# Compute overall precision, recall, f1-score, and accuracy
precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
accuracy = accuracy_score(y_true, y_pred)

print(f"Overall Precision: {precision:.4f}")
print(f"Overall Recall: {recall:.4f}")
print(f"Overall F1-score: {f1:.4f}")
print(f"Overall Accuracy: {accuracy:.4f}")

# Save results to file
results_filename = f"results_{MODEL_NAME}_epochs{EPOCHS}_batch{BATCH_SIZE}_multilabel.txt"
with open(results_filename, "w") as f:
    f.write(report)
    f.write(f"\nOverall Precision: {precision:.4f}\n")
    f.write(f"Overall Recall: {recall:.4f}\n")
    f.write(f"Overall F1-score: {f1:.4f}\n")
    f.write(f"Overall Accuracy: {accuracy:.4f}\n")

print(f"Results saved to {results_filename}")

AttributeError: 'DataFrame' object has no attribute 'tolist'

In [1]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

# Load pre-trained BART model and tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Input long text to summarize
text = """
The automatic generation of UML diagrams from requirements can significantly speed up software design. 
Traditionally, this involves manual review of textual requirements to determine relevant entities and behaviors. 
Recent advances in natural language processing, particularly in fine-tuning pre-trained transformer models, 
have made it possible to automate this process with greater accuracy. 
This paper presents a pipeline that uses sentence classification to identify useful requirements 
and transform them into class and use case diagrams.
"""

# Tokenize and encode the text
inputs = tokenizer.encode(text, return_tensors="pt", max_length=1024, truncation=True)

# Generate summary (you can control length via min_length, max_length)
summary_ids = model.generate(inputs, max_length=100, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)

# Decode the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Summary:\n", summary)


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Summary:
 The automatic generation of UML diagrams from requirements can significantly speed up software design. This paper presents a pipeline that uses sentence classification to identify useful requirements and transform them into class and use case diagrams.
