# Class Diagram Extraction Pipeline

This notebook reads text documents from the target_text directory and processes them to extract class diagrams.

In [59]:
# Import required libraries
import os
import re
import pandas as pd
import time
import torch
from tqdm import tqdm
from dotenv import load_dotenv
from openai import AzureOpenAI
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from collections import defaultdict
from spacy import displacy
from torch.utils.data import Dataset, DataLoader

# Reading Files from target_text Directory

In [60]:
# Create output directory for PlantUML files
output_dir = os.path.join(os.path.dirname(os.getcwd()), "survey/class_diagrams_output_prosus")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"Created output directory: {output_dir}")
else:
    print(f"Output directory already exists: {output_dir}")

# Define target directory path
target_dir = os.path.join(os.path.dirname(os.getcwd()), "survey/target_text")
print(f"Reading files from: {target_dir}")

# Check if directory exists
if not os.path.exists(target_dir):
    print(f"Error: Directory '{target_dir}' does not exist.")
    raise FileNotFoundError(f"Directory '{target_dir}' does not exist.")

# Get list of text files from the directory
text_files = [f for f in os.listdir(target_dir) if f.endswith('.txt')]

if not text_files:
    print("No text files found in the directory.")
else:
    print(f"Found {len(text_files)} text file(s): {text_files}")

Output directory already exists: /work/pfsa-id/survey/class_diagrams_output_prosus
Reading files from: /work/pfsa-id/survey/target_text
Found 8 text file(s): ['R26_BlockCard.txt', 'R36_Video Rental.txt', 'R81_VideoSearch.txt', 'dental-clinic.txt', 'geological-samples-observation.txt', 'law-firm.txt', 'rental-truck-company.txt', 'restaurant.txt']


In [61]:
# Initialize Azure OpenAI client
load_dotenv()

endpoint = os.getenv("AZURE_OPENAI_ENDPOINT_URL_1", "")
deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME_1", "")
subscription_key = os.getenv("AZURE_OPENAI_API_KEY_1", "")
api_version = os.getenv("AZURE_OPENAI_API_VERSION_1")

print(f"Azure OpenAI endpoint: {endpoint}")
print(f"Azure OpenAI deployment: {deployment}")

# Initialize Azure OpenAI Service client with key-based authentication
client = AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=subscription_key,
    api_version=api_version,
)

Azure OpenAI endpoint: https://dewi.openai.azure.com/
Azure OpenAI deployment: gpt-4o


# Process Each Text File

In [62]:
def preprocess_document(document_text, filename):
    """Preprocess the document text and split it into sentences"""
    print(f"Processing document: {filename}")
    
    # Data preprocessing
    print("Step 1: Preprocessing")
    # Remove newlines and extra spaces
    document = re.sub(r'\n', ' ', document_text)
    document = re.sub(r'\s+', ' ', document)
    
    # Separate document by sentences into a dataframe
    document_sentence = pd.DataFrame(document.split('.'), columns=['sentence'])
    document_sentence = document_sentence[document_sentence['sentence'].str.strip().str.len() > 0].reset_index(drop=True)
    
    return document_sentence

In [63]:
# Load the pre-trained BERT model for sentence classification
def load_sentence_classifier_model():
    """Load the pre-trained BERT model for sentence classification"""
    model_dir = os.path.join(os.path.dirname(os.getcwd()), "requirement_classification", "all_model")
    
    # If multiple models exist, use the most recently created one
    model_file = "ProsusAI-finbert_structure_focus_epochs9_kfold10_batch8.bin"
    model_path = os.path.join(model_dir, model_file)
    print(f"Loading sentence classifier model from: {model_path}")
    
    # Define model name based on file name pattern
    model_name = "ProsusAI/finbert"  # Default model base
    
    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    
    # Move model to GPU if available
    device = torch.device("cuda:1")
    model.to(device)
    
    return model, tokenizer, device

In [64]:
# TextDataset class for sentence classification
class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
    
    def __len__(self):
        return len(self.encodings.input_ids)
    
    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

In [65]:
def classify_sentences(document_sentence, filename, output_dir):
    """Classify sentences to identify those useful for class diagram extraction using BERT model"""
    print("Step 2: Categorizing sentences using pre-trained model")
    document_sentence['useful'] = 0
    
    # Load the pre-trained model
    try:
        model, tokenizer, device = load_sentence_classifier_model()
    except Exception as e:
        print(f"Error loading sentence classifier model: {e}")
        print("Falling back to default classification method")
        # You could implement a fallback method here
        return document_sentence
    
    # Create dataset from sentences
    sentences = document_sentence['sentence'].tolist()
    dataset = TextDataset(sentences, tokenizer)
    dataloader = DataLoader(dataset, batch_size=8, shuffle=False)
    
    # Process batches and get predictions
    predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Classifying sentences"):
            # Move inputs to device
            inputs = {key: val.to(device) for key, val in batch.items()}
            
            # Get model outputs
            outputs = model(**inputs)
            logits = outputs.logits
            
            # Get predicted class (0 = not useful, 1 = useful)
            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.cpu().numpy())
    
    # Update dataframe with predictions
    document_sentence['useful'] = predictions
    
    # Log results
    useful_count = sum(predictions)
    print(f"Found {useful_count} useful sentences out of {len(sentences)} total sentences")
    
    # Save checkpoint
    checkpoint_file = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}_checkpoint.csv")
    document_sentence.to_csv(checkpoint_file)
    print(f"Saved checkpoint to {checkpoint_file}")
    
    return document_sentence

In [66]:
def extract_entities(document_sentence, filename, output_dir):
    """Extract entities from useful sentences"""
    # Extract sentences marked as useful for class diagram extraction
    print("Step 3: Extracting class diagram entities")
    sentence_class_diagram_only = document_sentence[document_sentence['useful'] == 1]
    document_class = ' '.join(sentence_class_diagram_only['sentence'].tolist())
    
    # Entity extraction using the model
    model_path = os.path.join(os.path.dirname(os.getcwd()), "key-term-extraction", "BERT-Style-model/microsoft/deberta-v3-large-4-epoch-8-bs")
    print(f"Using NER model from: {model_path}")
    
    try:
        ner_pipeline = pipeline("ner", model=model_path, aggregation_strategy="simple")
        entities = ner_pipeline(document_class)
        
        # Process entities
        summary = {
            "class": defaultdict(int),
            "attr": defaultdict(int)
        }
        
        for ent in entities:
            entity_type = ent["entity_group"].lower()
            word = ent["word"]
            
            if entity_type in summary:
                summary[entity_type][word] += 1
        
        # Convert defaultdict to normal dict
        summary = {key: list(value.keys()) for key, value in summary.items()}
        
        # Save entity data as CSV
        entity_csv_file = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}_entities.csv")
        pd.DataFrame(entities).to_csv(entity_csv_file)
        print(f"Entities saved to CSV: {entity_csv_file}")
        
        # Convert entities to JSON serializable format
        serializable_entities = []
        for ent in entities:
            # Extract only serializable properties and convert non-serializable types
            serializable_ent = {
                "entity_group": ent["entity_group"],
                "word": ent["word"],
                "score": float(ent["score"]),  # Convert tensor to float if needed
                "start": ent["start"],
                "end": ent["end"]
            }
            serializable_entities.append(serializable_ent)
        
        # Save entity data as JSON
        import json
        entity_json_file = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}_entities.json")
        with open(entity_json_file, 'w', encoding='utf-8') as f:
            json.dump({
                "entities": serializable_entities,  # Use serializable entities
                "summary": summary,
                "document_class": document_class
            }, f, ensure_ascii=False, indent=2)
        print(f"Entities saved to JSON: {entity_json_file}")
        
        # Save entity data as plain text
        entity_txt_file = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}_entities.txt")
        with open(entity_txt_file, 'w', encoding='utf-8') as f:
            f.write(f"Document: {filename}\n\n")
            f.write(f"Classes identified:\n{'-'*20}\n")
            for cls in summary['class']:
                f.write(f"- {cls}\n")
            f.write(f"\nAttributes identified:\n{'-'*20}\n")
            for attr in summary['attr']:
                f.write(f"- {attr}\n")
            f.write(f"\nDetailed Entities:\n{'-'*20}\n")
            for ent in serializable_entities:  # Use serializable entities
                f.write(f"Type: {ent['entity_group']}, Word: {ent['word']}, Score: {ent['score']:.4f}\n")
        print(f"Entities saved to TXT: {entity_txt_file}")
        
        # Return serializable entities for further processing
        return {
            "entities": serializable_entities,  # Use serializable entities
            "summary": summary,
            "document_class": document_class
        }
        
    except Exception as e:
        print(f"Error extracting entities: {e}")
        return {
            "error": str(e)
        }

In [67]:
def generate_diagram(extraction_result, filename, output_dir):
    """Generate class diagram from extracted entities"""
    print("Step 4: Generating PlantUML code")
    
    # Check if we have valid extraction results
    if "error" in extraction_result:
        print(f"Cannot generate diagram due to extraction error: {extraction_result['error']}")
        return {"filename": filename, "error": extraction_result['error']}
    
    try:
        # Prepare summary for diagram generation
        summary = extraction_result["summary"]
        document_class = extraction_result["document_class"]
        summary_string = f"class: {summary['class']}, attribute: {summary['attr']}, description: {document_class}"
        
        # Generate PlantUML using Azure OpenAI
        chat_prompt = [
            {
                "role": "system",
                "content": [{
                    "type": "text",
                    "text": "You will be given a JSON of class names, attributes, and a system description. Your task is to generate plantuml script containing classes, attributes, and relationships according to the system description. Strictly produce only plantuml script"
                }]
            },
            {
                "role": "user",
                "content": [{
                    "type": "text",
                    "text": summary_string
                }]
            }
        ]
        
        completion = client.chat.completions.create(
            model=deployment,
            messages=chat_prompt,
            max_tokens=800,
            temperature=0,
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0,
            stop=None,
            stream=False
        )
        plantuml_result = completion.choices[0].message.content
        
        # Clean up the result and save to file
        plantuml_result = plantuml_result.strip('```plantuml')
        plantuml_result = plantuml_result.strip('```')
        
        output_file = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}_class_diagram.puml")
        with open(output_file, "w", encoding="utf-8") as file:
            file.write(plantuml_result)
        print(f"PlantUML diagram saved to {output_file}")
        
        return {
            "filename": filename,
            "entities": extraction_result["entities"],
            "plantuml_code": plantuml_result,
            "output_file": output_file
        }
        
    except Exception as e:
        print(f"Error generating diagram: {e}")
        return {
            "filename": filename,
            "error": str(e)
        }

In [68]:
def process_document(document_text, filename):
    """Process a document through the entire pipeline"""
    # Step 1: Preprocess document
    document_sentence = preprocess_document(document_text, filename)
    
    # Step 2: Classify sentences
    document_sentence = classify_sentences(document_sentence, filename, output_dir)
    
    # Step 3: Extract entities
    extraction_result = extract_entities(document_sentence, filename, output_dir)

    return "true " + filename
    # # Step 4: Generate diagram
    # diagram_result = generate_diagram(extraction_result, filename, output_dir)
    
    # return diagram_result

In [69]:
# Process all files in the target directory
results = []

for filename in text_files:
    file_path = os.path.join(target_dir, filename)
    print(f"\n{'='*50}\nProcessing file: {filename}\n{'='*50}")
    
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            document_text = file.read()
            
            result = process_document(document_text, filename)
            results.append(result)
            
    except Exception as e:
        print(f"Error reading file {filename}: {e}")
        results.append({
            "filename": filename,
            "error": str(e)
        })
        
print("\nProcessing complete!")
print(f"Processed {len(results)} files.")
print(f"Output saved to: {output_dir}")


Processing file: R26_BlockCard.txt
Processing document: R26_BlockCard.txt
Step 1: Preprocessing
Step 2: Categorizing sentences using pre-trained model
Loading sentence classifier model from: /work/pfsa-id/requirement_classification/all_model/ProsusAI-finbert_structure_focus_epochs9_kfold10_batch8.bin


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Classifying sentences: 100%|██████████| 3/3 [00:00<00:00, 37.31it/s]
Device set to use cuda:0


Found 17 useful sentences out of 19 total sentences
Saved checkpoint to /work/pfsa-id/survey/class_diagrams_output_prosus/R26_BlockCard_checkpoint.csv
Step 3: Extracting class diagram entities
Using NER model from: /work/pfsa-id/key-term-extraction/BERT-Style-model/microsoft/deberta-v3-large-4-epoch-8-bs


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entities saved to CSV: /work/pfsa-id/survey/class_diagrams_output_prosus/R26_BlockCard_entities.csv
Entities saved to JSON: /work/pfsa-id/survey/class_diagrams_output_prosus/R26_BlockCard_entities.json
Entities saved to TXT: /work/pfsa-id/survey/class_diagrams_output_prosus/R26_BlockCard_entities.txt

Processing file: R36_Video Rental.txt
Processing document: R36_Video Rental.txt
Step 1: Preprocessing
Step 2: Categorizing sentences using pre-trained model
Loading sentence classifier model from: /work/pfsa-id/requirement_classification/all_model/ProsusAI-finbert_structure_focus_epochs9_kfold10_batch8.bin


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Classifying sentences: 100%|██████████| 6/6 [00:00<00:00, 70.47it/s]
Device set to use cuda:0


Found 47 useful sentences out of 48 total sentences
Saved checkpoint to /work/pfsa-id/survey/class_diagrams_output_prosus/R36_Video Rental_checkpoint.csv
Step 3: Extracting class diagram entities
Using NER model from: /work/pfsa-id/key-term-extraction/BERT-Style-model/microsoft/deberta-v3-large-4-epoch-8-bs


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entities saved to CSV: /work/pfsa-id/survey/class_diagrams_output_prosus/R36_Video Rental_entities.csv
Entities saved to JSON: /work/pfsa-id/survey/class_diagrams_output_prosus/R36_Video Rental_entities.json
Entities saved to TXT: /work/pfsa-id/survey/class_diagrams_output_prosus/R36_Video Rental_entities.txt

Processing file: R81_VideoSearch.txt
Processing document: R81_VideoSearch.txt
Step 1: Preprocessing
Step 2: Categorizing sentences using pre-trained model
Loading sentence classifier model from: /work/pfsa-id/requirement_classification/all_model/ProsusAI-finbert_structure_focus_epochs9_kfold10_batch8.bin


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Classifying sentences: 100%|██████████| 2/2 [00:00<00:00, 56.82it/s]
Device set to use cuda:0


Found 16 useful sentences out of 16 total sentences
Saved checkpoint to /work/pfsa-id/survey/class_diagrams_output_prosus/R81_VideoSearch_checkpoint.csv
Step 3: Extracting class diagram entities
Using NER model from: /work/pfsa-id/key-term-extraction/BERT-Style-model/microsoft/deberta-v3-large-4-epoch-8-bs


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entities saved to CSV: /work/pfsa-id/survey/class_diagrams_output_prosus/R81_VideoSearch_entities.csv
Entities saved to JSON: /work/pfsa-id/survey/class_diagrams_output_prosus/R81_VideoSearch_entities.json
Entities saved to TXT: /work/pfsa-id/survey/class_diagrams_output_prosus/R81_VideoSearch_entities.txt

Processing file: dental-clinic.txt
Processing document: dental-clinic.txt
Step 1: Preprocessing
Step 2: Categorizing sentences using pre-trained model
Loading sentence classifier model from: /work/pfsa-id/requirement_classification/all_model/ProsusAI-finbert_structure_focus_epochs9_kfold10_batch8.bin


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Classifying sentences: 100%|██████████| 3/3 [00:00<00:00, 77.37it/s]
Device set to use cuda:0


Found 15 useful sentences out of 20 total sentences
Saved checkpoint to /work/pfsa-id/survey/class_diagrams_output_prosus/dental-clinic_checkpoint.csv
Step 3: Extracting class diagram entities
Using NER model from: /work/pfsa-id/key-term-extraction/BERT-Style-model/microsoft/deberta-v3-large-4-epoch-8-bs


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entities saved to CSV: /work/pfsa-id/survey/class_diagrams_output_prosus/dental-clinic_entities.csv
Entities saved to JSON: /work/pfsa-id/survey/class_diagrams_output_prosus/dental-clinic_entities.json
Entities saved to TXT: /work/pfsa-id/survey/class_diagrams_output_prosus/dental-clinic_entities.txt

Processing file: geological-samples-observation.txt
Processing document: geological-samples-observation.txt
Step 1: Preprocessing
Step 2: Categorizing sentences using pre-trained model
Loading sentence classifier model from: /work/pfsa-id/requirement_classification/all_model/ProsusAI-finbert_structure_focus_epochs9_kfold10_batch8.bin


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Classifying sentences: 100%|██████████| 3/3 [00:00<00:00, 70.45it/s]
Device set to use cuda:0


Found 16 useful sentences out of 17 total sentences
Saved checkpoint to /work/pfsa-id/survey/class_diagrams_output_prosus/geological-samples-observation_checkpoint.csv
Step 3: Extracting class diagram entities
Using NER model from: /work/pfsa-id/key-term-extraction/BERT-Style-model/microsoft/deberta-v3-large-4-epoch-8-bs


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entities saved to CSV: /work/pfsa-id/survey/class_diagrams_output_prosus/geological-samples-observation_entities.csv
Entities saved to JSON: /work/pfsa-id/survey/class_diagrams_output_prosus/geological-samples-observation_entities.json
Entities saved to TXT: /work/pfsa-id/survey/class_diagrams_output_prosus/geological-samples-observation_entities.txt

Processing file: law-firm.txt
Processing document: law-firm.txt
Step 1: Preprocessing
Step 2: Categorizing sentences using pre-trained model
Loading sentence classifier model from: /work/pfsa-id/requirement_classification/all_model/ProsusAI-finbert_structure_focus_epochs9_kfold10_batch8.bin


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Classifying sentences: 100%|██████████| 4/4 [00:00<00:00, 67.98it/s]
Device set to use cuda:0


Found 17 useful sentences out of 26 total sentences
Saved checkpoint to /work/pfsa-id/survey/class_diagrams_output_prosus/law-firm_checkpoint.csv
Step 3: Extracting class diagram entities
Using NER model from: /work/pfsa-id/key-term-extraction/BERT-Style-model/microsoft/deberta-v3-large-4-epoch-8-bs


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entities saved to CSV: /work/pfsa-id/survey/class_diagrams_output_prosus/law-firm_entities.csv
Entities saved to JSON: /work/pfsa-id/survey/class_diagrams_output_prosus/law-firm_entities.json
Entities saved to TXT: /work/pfsa-id/survey/class_diagrams_output_prosus/law-firm_entities.txt

Processing file: rental-truck-company.txt
Processing document: rental-truck-company.txt
Step 1: Preprocessing
Step 2: Categorizing sentences using pre-trained model
Loading sentence classifier model from: /work/pfsa-id/requirement_classification/all_model/ProsusAI-finbert_structure_focus_epochs9_kfold10_batch8.bin


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Classifying sentences: 100%|██████████| 4/4 [00:00<00:00, 56.92it/s]
Device set to use cuda:0


Found 25 useful sentences out of 32 total sentences
Saved checkpoint to /work/pfsa-id/survey/class_diagrams_output_prosus/rental-truck-company_checkpoint.csv
Step 3: Extracting class diagram entities
Using NER model from: /work/pfsa-id/key-term-extraction/BERT-Style-model/microsoft/deberta-v3-large-4-epoch-8-bs


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entities saved to CSV: /work/pfsa-id/survey/class_diagrams_output_prosus/rental-truck-company_entities.csv
Entities saved to JSON: /work/pfsa-id/survey/class_diagrams_output_prosus/rental-truck-company_entities.json
Entities saved to TXT: /work/pfsa-id/survey/class_diagrams_output_prosus/rental-truck-company_entities.txt

Processing file: restaurant.txt
Processing document: restaurant.txt
Step 1: Preprocessing
Step 2: Categorizing sentences using pre-trained model
Loading sentence classifier model from: /work/pfsa-id/requirement_classification/all_model/ProsusAI-finbert_structure_focus_epochs9_kfold10_batch8.bin


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Classifying sentences: 100%|██████████| 6/6 [00:00<00:00, 79.36it/s]
Device set to use cuda:0


Found 38 useful sentences out of 42 total sentences
Saved checkpoint to /work/pfsa-id/survey/class_diagrams_output_prosus/restaurant_checkpoint.csv
Step 3: Extracting class diagram entities
Using NER model from: /work/pfsa-id/key-term-extraction/BERT-Style-model/microsoft/deberta-v3-large-4-epoch-8-bs


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entities saved to CSV: /work/pfsa-id/survey/class_diagrams_output_prosus/restaurant_entities.csv
Entities saved to JSON: /work/pfsa-id/survey/class_diagrams_output_prosus/restaurant_entities.json
Entities saved to TXT: /work/pfsa-id/survey/class_diagrams_output_prosus/restaurant_entities.txt

Processing complete!
Processed 8 files.
Output saved to: /work/pfsa-id/survey/class_diagrams_output_prosus


# Visualize Results

In [70]:
# Function to visualize entities for a selected file
def visualize_entities(file_index=0):
    if file_index < 0 or file_index >= len(results) or 'entities' not in results[file_index]:
        print("Invalid file index or no entities found for this file.")
        return
    
    filename = results[file_index]['filename']
    entities = results[file_index]['entities']
    
    # Read file content again
    file_path = os.path.join(target_dir, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        document_text = file.read()
    
    # Preprocess text
    document = re.sub(r'\n', ' ', document_text)
    document = re.sub(r'\s+', ' ', document)
    
    # Read checkpoint file to get useful sentences
    checkpoint_file = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}_checkpoint.csv")
    document_sentence = pd.read_csv(checkpoint_file)
    sentence_class_diagram_only = document_sentence[document_sentence['useful'] == 1]
    document_class = ' '.join(sentence_class_diagram_only['sentence'].tolist())
    
    # Visualize entities
    colors = {
        "CLASS": "#ffff00",
        "ATTR": "#9932cc",
    }
    options = {"ents": ["CLASS", "ATTR"], "colors": colors}
    
    # Convert to spaCy format
    spacy_ents = {
        "text": document_class,
        "ents": [
            {"start": ent["start"], "end": ent["end"], "label": ent["entity_group"]}
            for ent in entities
        ],
        "title": f"Named Entity Recognition - {filename}",
    }
    
    # Render the visualization
    displacy.render(spacy_ents, style="ent", manual=True, jupyter=True, options=options)
    
    print(f"PlantUML file: {results[file_index]['output_file']}")
    
# Example usage: visualize_entities(0)

In [None]:
# Optional: Display PlantUML code for a selected file
def show_plantuml(file_index=0):
    if file_index < 0 or file_index >= len(results) or 'plantuml_code' not in results[file_index]:
        print("Invalid file index or no PlantUML code found for this file.")
        return
    
    filename = results[file_index]['filename']
    plantuml_code = results[file_index]['plantuml_code']
    
    print(f"PlantUML code for {filename}:\n")
    print(plantuml_code)
    
# Example usage: show_plantuml(0)