# Use Case Diagram Extraction Pipeline

This notebook reads text documents from the target_text directory and processes them to extract class diagrams.

In [1]:
# Import required libraries
import os
import re
import pandas as pd
import time
import torch
from tqdm import tqdm
from dotenv import load_dotenv
from openai import AzureOpenAI
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from collections import defaultdict
from spacy import displacy
from torch.utils.data import Dataset, DataLoader

# Reading Files from target_text Directory

In [2]:
# Create output directory for PlantUML files
output_dir = os.path.join(os.path.dirname(os.getcwd()), "survey-ud/usecase_diagrams_output")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"Created output directory: {output_dir}")
else:
    print(f"Output directory already exists: {output_dir}")

# Define target directory path
target_dir = os.path.join(os.path.dirname(os.getcwd()), "survey-ud/target_text")
print(f"Reading files from: {target_dir}")

# Check if directory exists
if not os.path.exists(target_dir):
    print(f"Error: Directory '{target_dir}' does not exist.")
    raise FileNotFoundError(f"Directory '{target_dir}' does not exist.")

# Get list of text files from the directory
text_files = [f for f in os.listdir(target_dir) if f.endswith('.txt')]

if not text_files:
    print("No text files found in the directory.")
else:
    print(f"Found {len(text_files)} text file(s): {text_files}")

Output directory already exists: /work/pfsa-id/survey-ud/usecase_diagrams_output
Reading files from: /work/pfsa-id/survey-ud/target_text
Found 10 text file(s): ['R13_municipal-library.txt', 'R26_BlockCard.txt', 'R28_AnimalClinic.txt', 'R36_Video Rental.txt', 'R39_Insurance.txt', 'R7_SuperMarket.txt', 'R81_VideoSearch.txt', 'dental-clinic.txt', 'geological-samples-observation.txt', 'rental-truck-company.txt']


# Process Each Text File

In [3]:
def preprocess_document(document_text, filename):
    """Preprocess the document text and split it into sentences"""
    print(f"Processing document: {filename}")
    
    # Data preprocessing
    print("Step 1: Preprocessing")
    # Remove newlines and extra spaces
    document = re.sub(r'\n', ' ', document_text)
    document = re.sub(r'\s+', ' ', document)
    
    # Separate document by sentences into a dataframe
    document_sentence = pd.DataFrame(document.split('.'), columns=['sentence'])
    document_sentence = document_sentence[document_sentence['sentence'].str.strip().str.len() > 0].reset_index(drop=True)
    
    return document_sentence

In [4]:
# Load the pre-trained BERT model for sentence classification
def load_sentence_classifier_model():
    """Load the pre-trained BERT model for sentence classification"""
    model_dir = os.path.join(os.path.dirname(os.getcwd()), "requirement_classification", "all_model")
    
    # If multiple models exist, use the most recently created one
    model_file = "microsoft-deberta-v3-large_usecase_focus_epochs12_kfold10_batch8.bin"
    model_path = os.path.join(model_dir, model_file)
    print(f"Loading sentence classifier model from: {model_path}")
    
    # Define model name based on file name pattern
    model_name = "microsoft/deberta-v3-large"  # Default model base
    
    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    
    # Move model to GPU if available
    device = torch.device("cuda:1")
    model.to(device)
    
    return model, tokenizer, device

In [5]:
# TextDataset class for sentence classification
class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
    
    def __len__(self):
        return len(self.encodings.input_ids)
    
    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

In [6]:
def classify_sentences(document_sentence, filename, output_dir):
    """Classify sentences to identify those useful for class diagram extraction using BERT model"""
    print("Step 2: Categorizing sentences using pre-trained model")
    document_sentence['useful'] = 0
    
    # Load the pre-trained model
    try:
        model, tokenizer, device = load_sentence_classifier_model()
    except Exception as e:
        print(f"Error loading sentence classifier model: {e}")
        print("Falling back to default classification method")
        # You could implement a fallback method here
        return document_sentence
    
    # Create dataset from sentences
    sentences = document_sentence['sentence'].tolist()
    dataset = TextDataset(sentences, tokenizer)
    dataloader = DataLoader(dataset, batch_size=8, shuffle=False)
    
    # Process batches and get predictions
    predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Classifying sentences"):
            # Move inputs to device
            inputs = {key: val.to(device) for key, val in batch.items()}
            
            # Get model outputs
            outputs = model(**inputs)
            logits = outputs.logits
            
            # Get predicted class (0 = not useful, 1 = useful)
            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.cpu().numpy())
    
    # Update dataframe with predictions
    # document_sentence['useful'] = predictions
    document_sentence['useful'] = 1 #override
    # Log results
    useful_count = sum(predictions)
    print(f"Found {useful_count} useful sentences out of {len(sentences)} total sentences")
    
    # Save checkpoint
    checkpoint_file = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}_checkpoint.csv")
    document_sentence.to_csv(checkpoint_file)
    print(f"Saved checkpoint to {checkpoint_file}")
    
    return document_sentence

In [7]:
def extract_entities(document_sentence, filename, output_dir):
    """Extract entities from useful sentences"""
    # Extract sentences marked as useful for class diagram extraction
    print("Step 3: Extracting class diagram entities")
    sentence_class_diagram_only = document_sentence[document_sentence['useful'] == 1]
    document_class = ' '.join(sentence_class_diagram_only['sentence'].tolist())
    
    # Entity extraction using the model
    model_path = os.path.join(os.path.dirname(os.getcwd()), "key-term-extraction-uc", "BERT-Style-model/microsoft/deberta-v3-large-12-epoch-4bs-1028-mt")
    print(f"Using NER model from: {model_path}")
    
    try:
        ner_pipeline = pipeline("ner", model=model_path, aggregation_strategy="simple")
        entities = ner_pipeline(document_class)
        
        # Process entities
        summary = {
            "actor": defaultdict(int),
            "usecase": defaultdict(int)
        }
        
        for ent in entities:
            entity_type = ent["entity_group"].lower()
            word = ent["word"]
            
            if entity_type in summary:
                summary[entity_type][word] += 1
        
        # Convert defaultdict to normal dict
        summary = {key: list(value.keys()) for key, value in summary.items()}
        
        # Save entity data as CSV
        entity_csv_file = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}_entities.csv")
        pd.DataFrame(entities).to_csv(entity_csv_file)
        print(f"Entities saved to CSV: {entity_csv_file}")
        
        # Convert entities to JSON serializable format
        serializable_entities = []
        for ent in entities:
            # Extract only serializable properties and convert non-serializable types
            serializable_ent = {
                "entity_group": ent["entity_group"],
                "word": ent["word"],
                "score": float(ent["score"]),  # Convert tensor to float if needed
                "start": ent["start"],
                "end": ent["end"]
            }
            serializable_entities.append(serializable_ent)
        
        # Save entity data as JSON
        import json
        entity_json_file = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}_entities.json")
        with open(entity_json_file, 'w', encoding='utf-8') as f:
            json.dump({
                "entities": serializable_entities,  # Use serializable entities
                "summary": summary,
                "document_class": document_class
            }, f, ensure_ascii=False, indent=2)
        print(f"Entities saved to JSON: {entity_json_file}")
        
        # Save entity data as plain text
        entity_txt_file = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}_entities.txt")
        with open(entity_txt_file, 'w', encoding='utf-8') as f:
            f.write(f"Document: {filename}\n\n")
            f.write(f"actor identified:\n{'-'*20}\n")
            for cls in summary['actor']:
                f.write(f"- {cls}\n")
            f.write(f"\n usecase identified:\n{'-'*20}\n")
            for attr in summary['usecase']:
                f.write(f"- {attr}\n")
            f.write(f"\nDetailed Entities:\n{'-'*20}\n")
            for ent in serializable_entities:  # Use serializable entities
                f.write(f"Type: {ent['entity_group']}, Word: {ent['word']}, Score: {ent['score']:.4f}\n")
        print(f"Entities saved to TXT: {entity_txt_file}")
        
        # Return serializable entities for further processing
        return {
            "entities": serializable_entities,  # Use serializable entities
            "summary": summary,
            "document_class": document_class
        }
        
    except Exception as e:
        print(f"Error extracting entities: {e}")
        return {
            "error": str(e)
        }

In [8]:
def process_document(document_text, filename):
    """Process a document through the entire pipeline"""
    # Step 1: Preprocess document
    document_sentence = preprocess_document(document_text, filename)
    
    # Step 2: Classify sentences
    document_sentence = classify_sentences(document_sentence, filename, output_dir)
    
    # Step 3: Extract entities
    extraction_result = extract_entities(document_sentence, filename, output_dir)

    return "true " + filename

In [9]:
# Process all files in the target directory
results = []

for filename in text_files:
    file_path = os.path.join(target_dir, filename)
    print(f"\n{'='*50}\nProcessing file: {filename}\n{'='*50}")
    
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            document_text = file.read()
            
            result = process_document(document_text, filename)
            results.append(result)
            
    except Exception as e:
        print(f"Error reading file {filename}: {e}")
        results.append({
            "filename": filename,
            "error": str(e)
        })
        
print("\nProcessing complete!")
print(f"Processed {len(results)} files.")
print(f"Output saved to: {output_dir}")


Processing file: R13_municipal-library.txt
Processing document: R13_municipal-library.txt
Step 1: Preprocessing
Step 2: Categorizing sentences using pre-trained model
Loading sentence classifier model from: /work/pfsa-id/requirement_classification/all_model/microsoft-deberta-v3-large_usecase_focus_epochs12_kfold10_batch8.bin


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Classifying sentences: 100%|██████████| 3/3 [00:00<00:00,  3.55it/s]


Found 2 useful sentences out of 17 total sentences
Saved checkpoint to /work/pfsa-id/survey-ud/usecase_diagrams_output/R13_municipal-library_checkpoint.csv
Step 3: Extracting class diagram entities
Using NER model from: /work/pfsa-id/key-term-extraction-uc/BERT-Style-model/microsoft/deberta-v3-large-12-epoch-4bs-1028-mt


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entities saved to CSV: /work/pfsa-id/survey-ud/usecase_diagrams_output/R13_municipal-library_entities.csv
Entities saved to JSON: /work/pfsa-id/survey-ud/usecase_diagrams_output/R13_municipal-library_entities.json
Entities saved to TXT: /work/pfsa-id/survey-ud/usecase_diagrams_output/R13_municipal-library_entities.txt

Processing file: R26_BlockCard.txt
Processing document: R26_BlockCard.txt
Step 1: Preprocessing
Step 2: Categorizing sentences using pre-trained model
Loading sentence classifier model from: /work/pfsa-id/requirement_classification/all_model/microsoft-deberta-v3-large_usecase_focus_epochs12_kfold10_batch8.bin


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Classifying sentences: 100%|██████████| 3/3 [00:00<00:00,  4.50it/s]


Found 4 useful sentences out of 19 total sentences
Saved checkpoint to /work/pfsa-id/survey-ud/usecase_diagrams_output/R26_BlockCard_checkpoint.csv
Step 3: Extracting class diagram entities
Using NER model from: /work/pfsa-id/key-term-extraction-uc/BERT-Style-model/microsoft/deberta-v3-large-12-epoch-4bs-1028-mt


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entities saved to CSV: /work/pfsa-id/survey-ud/usecase_diagrams_output/R26_BlockCard_entities.csv
Entities saved to JSON: /work/pfsa-id/survey-ud/usecase_diagrams_output/R26_BlockCard_entities.json
Entities saved to TXT: /work/pfsa-id/survey-ud/usecase_diagrams_output/R26_BlockCard_entities.txt

Processing file: R28_AnimalClinic.txt
Processing document: R28_AnimalClinic.txt
Step 1: Preprocessing
Step 2: Categorizing sentences using pre-trained model
Loading sentence classifier model from: /work/pfsa-id/requirement_classification/all_model/microsoft-deberta-v3-large_usecase_focus_epochs12_kfold10_batch8.bin


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Classifying sentences: 100%|██████████| 2/2 [00:00<00:00, 18.11it/s]


Found 1 useful sentences out of 15 total sentences
Saved checkpoint to /work/pfsa-id/survey-ud/usecase_diagrams_output/R28_AnimalClinic_checkpoint.csv
Step 3: Extracting class diagram entities
Using NER model from: /work/pfsa-id/key-term-extraction-uc/BERT-Style-model/microsoft/deberta-v3-large-12-epoch-4bs-1028-mt


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entities saved to CSV: /work/pfsa-id/survey-ud/usecase_diagrams_output/R28_AnimalClinic_entities.csv
Entities saved to JSON: /work/pfsa-id/survey-ud/usecase_diagrams_output/R28_AnimalClinic_entities.json
Entities saved to TXT: /work/pfsa-id/survey-ud/usecase_diagrams_output/R28_AnimalClinic_entities.txt

Processing file: R36_Video Rental.txt
Processing document: R36_Video Rental.txt
Step 1: Preprocessing
Step 2: Categorizing sentences using pre-trained model
Loading sentence classifier model from: /work/pfsa-id/requirement_classification/all_model/microsoft-deberta-v3-large_usecase_focus_epochs12_kfold10_batch8.bin


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Classifying sentences: 100%|██████████| 6/6 [00:00<00:00, 18.39it/s]


Found 0 useful sentences out of 48 total sentences
Saved checkpoint to /work/pfsa-id/survey-ud/usecase_diagrams_output/R36_Video Rental_checkpoint.csv
Step 3: Extracting class diagram entities
Using NER model from: /work/pfsa-id/key-term-extraction-uc/BERT-Style-model/microsoft/deberta-v3-large-12-epoch-4bs-1028-mt


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entities saved to CSV: /work/pfsa-id/survey-ud/usecase_diagrams_output/R36_Video Rental_entities.csv
Entities saved to JSON: /work/pfsa-id/survey-ud/usecase_diagrams_output/R36_Video Rental_entities.json
Entities saved to TXT: /work/pfsa-id/survey-ud/usecase_diagrams_output/R36_Video Rental_entities.txt

Processing file: R39_Insurance.txt
Processing document: R39_Insurance.txt
Step 1: Preprocessing
Step 2: Categorizing sentences using pre-trained model
Loading sentence classifier model from: /work/pfsa-id/requirement_classification/all_model/microsoft-deberta-v3-large_usecase_focus_epochs12_kfold10_batch8.bin


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Classifying sentences: 100%|██████████| 7/7 [00:00<00:00, 11.57it/s]


Found 5 useful sentences out of 49 total sentences
Saved checkpoint to /work/pfsa-id/survey-ud/usecase_diagrams_output/R39_Insurance_checkpoint.csv
Step 3: Extracting class diagram entities
Using NER model from: /work/pfsa-id/key-term-extraction-uc/BERT-Style-model/microsoft/deberta-v3-large-12-epoch-4bs-1028-mt


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entities saved to CSV: /work/pfsa-id/survey-ud/usecase_diagrams_output/R39_Insurance_entities.csv
Entities saved to JSON: /work/pfsa-id/survey-ud/usecase_diagrams_output/R39_Insurance_entities.json
Entities saved to TXT: /work/pfsa-id/survey-ud/usecase_diagrams_output/R39_Insurance_entities.txt

Processing file: R7_SuperMarket.txt
Processing document: R7_SuperMarket.txt
Step 1: Preprocessing
Step 2: Categorizing sentences using pre-trained model
Loading sentence classifier model from: /work/pfsa-id/requirement_classification/all_model/microsoft-deberta-v3-large_usecase_focus_epochs12_kfold10_batch8.bin


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Classifying sentences: 100%|██████████| 3/3 [00:00<00:00, 16.09it/s]


Found 8 useful sentences out of 22 total sentences
Saved checkpoint to /work/pfsa-id/survey-ud/usecase_diagrams_output/R7_SuperMarket_checkpoint.csv
Step 3: Extracting class diagram entities
Using NER model from: /work/pfsa-id/key-term-extraction-uc/BERT-Style-model/microsoft/deberta-v3-large-12-epoch-4bs-1028-mt


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entities saved to CSV: /work/pfsa-id/survey-ud/usecase_diagrams_output/R7_SuperMarket_entities.csv
Entities saved to JSON: /work/pfsa-id/survey-ud/usecase_diagrams_output/R7_SuperMarket_entities.json
Entities saved to TXT: /work/pfsa-id/survey-ud/usecase_diagrams_output/R7_SuperMarket_entities.txt

Processing file: R81_VideoSearch.txt
Processing document: R81_VideoSearch.txt
Step 1: Preprocessing
Step 2: Categorizing sentences using pre-trained model
Loading sentence classifier model from: /work/pfsa-id/requirement_classification/all_model/microsoft-deberta-v3-large_usecase_focus_epochs12_kfold10_batch8.bin


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Classifying sentences: 100%|██████████| 2/2 [00:00<00:00, 12.44it/s]


Found 6 useful sentences out of 16 total sentences
Saved checkpoint to /work/pfsa-id/survey-ud/usecase_diagrams_output/R81_VideoSearch_checkpoint.csv
Step 3: Extracting class diagram entities
Using NER model from: /work/pfsa-id/key-term-extraction-uc/BERT-Style-model/microsoft/deberta-v3-large-12-epoch-4bs-1028-mt


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entities saved to CSV: /work/pfsa-id/survey-ud/usecase_diagrams_output/R81_VideoSearch_entities.csv
Entities saved to JSON: /work/pfsa-id/survey-ud/usecase_diagrams_output/R81_VideoSearch_entities.json
Entities saved to TXT: /work/pfsa-id/survey-ud/usecase_diagrams_output/R81_VideoSearch_entities.txt

Processing file: dental-clinic.txt
Processing document: dental-clinic.txt
Step 1: Preprocessing
Step 2: Categorizing sentences using pre-trained model
Loading sentence classifier model from: /work/pfsa-id/requirement_classification/all_model/microsoft-deberta-v3-large_usecase_focus_epochs12_kfold10_batch8.bin


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Classifying sentences: 100%|██████████| 3/3 [00:00<00:00, 16.29it/s]


Found 6 useful sentences out of 20 total sentences
Saved checkpoint to /work/pfsa-id/survey-ud/usecase_diagrams_output/dental-clinic_checkpoint.csv
Step 3: Extracting class diagram entities
Using NER model from: /work/pfsa-id/key-term-extraction-uc/BERT-Style-model/microsoft/deberta-v3-large-12-epoch-4bs-1028-mt


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entities saved to CSV: /work/pfsa-id/survey-ud/usecase_diagrams_output/dental-clinic_entities.csv
Entities saved to JSON: /work/pfsa-id/survey-ud/usecase_diagrams_output/dental-clinic_entities.json
Entities saved to TXT: /work/pfsa-id/survey-ud/usecase_diagrams_output/dental-clinic_entities.txt

Processing file: geological-samples-observation.txt
Processing document: geological-samples-observation.txt
Step 1: Preprocessing
Step 2: Categorizing sentences using pre-trained model
Loading sentence classifier model from: /work/pfsa-id/requirement_classification/all_model/microsoft-deberta-v3-large_usecase_focus_epochs12_kfold10_batch8.bin


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Classifying sentences: 100%|██████████| 3/3 [00:00<00:00, 17.20it/s]


Found 1 useful sentences out of 17 total sentences
Saved checkpoint to /work/pfsa-id/survey-ud/usecase_diagrams_output/geological-samples-observation_checkpoint.csv
Step 3: Extracting class diagram entities
Using NER model from: /work/pfsa-id/key-term-extraction-uc/BERT-Style-model/microsoft/deberta-v3-large-12-epoch-4bs-1028-mt


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entities saved to CSV: /work/pfsa-id/survey-ud/usecase_diagrams_output/geological-samples-observation_entities.csv
Entities saved to JSON: /work/pfsa-id/survey-ud/usecase_diagrams_output/geological-samples-observation_entities.json
Entities saved to TXT: /work/pfsa-id/survey-ud/usecase_diagrams_output/geological-samples-observation_entities.txt

Processing file: rental-truck-company.txt
Processing document: rental-truck-company.txt
Step 1: Preprocessing
Step 2: Categorizing sentences using pre-trained model
Loading sentence classifier model from: /work/pfsa-id/requirement_classification/all_model/microsoft-deberta-v3-large_usecase_focus_epochs12_kfold10_batch8.bin


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Classifying sentences: 100%|██████████| 4/4 [00:00<00:00, 12.06it/s]


Found 8 useful sentences out of 32 total sentences
Saved checkpoint to /work/pfsa-id/survey-ud/usecase_diagrams_output/rental-truck-company_checkpoint.csv
Step 3: Extracting class diagram entities
Using NER model from: /work/pfsa-id/key-term-extraction-uc/BERT-Style-model/microsoft/deberta-v3-large-12-epoch-4bs-1028-mt


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entities saved to CSV: /work/pfsa-id/survey-ud/usecase_diagrams_output/rental-truck-company_entities.csv
Entities saved to JSON: /work/pfsa-id/survey-ud/usecase_diagrams_output/rental-truck-company_entities.json
Entities saved to TXT: /work/pfsa-id/survey-ud/usecase_diagrams_output/rental-truck-company_entities.txt

Processing complete!
Processed 10 files.
Output saved to: /work/pfsa-id/survey-ud/usecase_diagrams_output


# Visualize Results

In [3]:
import os
import re
from dotenv import load_dotenv
from openai import AzureOpenAI

In [4]:
# Initialize Azure OpenAI client
load_dotenv()

endpoint = os.getenv("AZURE_OPENAI_ENDPOINT_URL_1", "")
deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME_1", "")
subscription_key = os.getenv("AZURE_OPENAI_API_KEY_1", "")
api_version = os.getenv("AZURE_OPENAI_API_VERSION_1")

print(f"Azure OpenAI endpoint: {endpoint}")
print(f"Azure OpenAI deployment: {deployment}")

# Initialize Azure OpenAI Service client with key-based authentication
client = AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=subscription_key,
    api_version=api_version,
)

Azure OpenAI endpoint: https://dewi.openai.azure.com/
Azure OpenAI deployment: o3-mini


In [5]:
def generate_class_diagram_with_azure(actor, usecase, document_text, filename):
    """
    Generate a class diagram using Azure OpenAI by sending a request with the extracted classes, attributes,
    and document text.
    
    Args:
        classes (list): List of extracted class names
        attributes (list): List of extracted attributes
        document_text (str): The full document text used for context
        filename (str): The name of the file being processed (without extension)
    
    Returns:
        str: PlantUML code for the class diagram
    """
    # Prepare summary for diagram generation
    summary_string = f"actor: {actor}, usecase: {usecase}, description: {document_text}"
    
    mode = "gen-r-ent"
    # Create plantuml_result folder if it doesn't exist
    plantuml_result_dir = "./"+ mode + "-" + deployment

    if mode == "gen-r-ent": #with restrictions and entities
        prompt = '''Given a description of software requirement: {document_text}
                There are list of entities
                List of actor:  {actor}
                List of usecase: {usecase}

                Generate a Use Case Diagram according to the above description and these factors:
                - preferably use the actor and usecase provided, but you can also add new ones if needed
                - similar actor or usecase could be merged as one
                - discover the relationships between actor and usecase as many as possible correctly
                - only discover usecase related within the software
                Set the output strictly to only PlantUML of the result diagram 
                '''.format(document_text=document_text, actor=actor, usecase=usecase)
        
    elif mode == "gen-nr-nent": #no resctrictions and no entities
        prompt = '''Given a description of software requirement: {document_text}

                Generate a Use Case Diagram according to the above description and these factors:
                - only discover actor and usecase related within the software
                Set the output strictly to only PlantUML of the result diagram 
                '''.format(document_text=document_text)
    
    elif mode == "gen-nr-ent": #no restrictions and entities
        prompt = '''Given a description of software requirement: {document_text}
                There are list of entities
                List of actor:  {actor}
                List of usecase: {usecase}

                Generate a Use Case Diagram according to the above description
                Set the output strictly to only PlantUML of the result diagram 
                '''.format(document_text=document_text, actor=actor, usecase=usecase)

    
    chat_prompt = [
            {
                        "role": "user",
                        "content": [{
                            "type": "text",
                            "text": prompt
                }]
            },
        ]

    print(chat_prompt)
    # Use the existing client from previous cells
    completion = client.chat.completions.create(
        model=deployment,
        messages=chat_prompt,
        # max_tokens=1000,
        # temperature=0,
        # top_p=0.95,
        # frequency_penalty=0,
        # presence_penalty=0,
        # stop=None,
        # stream=False
        max_completion_tokens=100000
    )
    
    # Extract and clean up the result
    plantuml_result = completion.choices[0].message.content
    
    # Clean up any markdown code block markers
    plantuml_result = plantuml_result.strip()
    plantuml_result = plantuml_result.replace("```plantuml", "").replace("```", "").strip()
    

    if not os.path.exists(plantuml_result_dir):
        os.makedirs(plantuml_result_dir)
        print(f"Created directory: {plantuml_result_dir}")
    
    # Save PlantUML code to file
    output_file = os.path.join(plantuml_result_dir, f"{filename}_class_diagram.puml")
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(plantuml_result)
    print(f"PlantUML diagram saved to {output_file}")
    
    return plantuml_result, output_file

In [6]:
import json 
import glob
import os 

output_dir = "./usecase_diagrams_output"
# Get all JSON files from the output directory
json_files = glob.glob(os.path.join(output_dir, "*_entities.json"))
print(f"Found {len(json_files)} JSON files")

# Parse each JSON file and extract the required information
extracted_data = []

for json_file in json_files:
    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
            
        filename = os.path.basename(json_file).replace('_entities.json', '')
        
        # Extract document text, classes, and attributes
        document = data.get('document_class', '')
        actor = data.get('summary', {}).get('actor', [])
        usecase = data.get('summary', {}).get('usecase', [])
        
        generate_class_diagram_with_azure(actor, usecase, document, filename)
    except Exception as e:
        print(f"Error processing {json_file}: {e}")

print(f"Successfully extracted data from {len(extracted_data)} files")



Found 7 JSON files
[{'role': 'user', 'content': [{'type': 'text', 'text': "Given a description of software requirement: The clinic basically schedules patients, provides services for them, and bills them for those services  New patients fill out a form listing their name, address, telephone numbers, allergies, and state of mind prior to scheduling their first appointment  Existing patients are normally scheduled for their next appointment as they depart from their current appointment  When the office staff forget to do this, a desk worker has to call the patient to set up a date  Schedules are entered into a central appointment book; patient records (including contact information) are kept in paper files  Appointments are for one of three procedures: dental hygiene, cavities and fillings, and oral surgery (including root canals and tooth extractions)  For each procedure the patient needs to be prepared and supplies need to be collected (e g , probes, drill bits, cements, resins, etc ) 