In [1]:
!python -V

Python 3.11.6


In [2]:
# Fine-tuning hyperparameters, taken from the paper (6p):
# "Five epochs with a batch size of 32, a dropout rate of 0.1, and a learning rate of 1.5e−5 were used to fine-tune the model."
epoch_size, batch_size = 5, 32
learning_rate = 1.5e-5
dropout_rate = 0.1  # same value as the BertForSequenceClassification default

# Seed for this run.
# 7p: "In Table 2 and 3, 𝜇 is the average performance on three random seeds, and 𝜎 is their standard deviation."
rseed = 42

In [3]:
import torch

# Seed PyTorch's global RNG so that classifier-head initialisation, DataLoader
# shuffling and dropout are repeatable for a given `rseed`. Without this,
# `rseed` only controls the train/validation/test split.
torch.manual_seed(rseed)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
import pandas as pd

# Data should not be shared publicly.
dataset_path = "E:/_datasets/0-overall-level3.xlsx"

# Name of the target column (the class to predict).
cls = 'Level3'

# Columns that feed the model input text; presumably predecessor / current /
# successor task names with their two WBS levels -- confirm against the dataset.
feature_columns = ['predwbs2', 'predwbs', 'predtask', 'wbs2', 'wbs', 'name', 'sucwbs2', 'sucwbs', 'suctask']

# Read the sheet (first row is the header) and keep only the needed columns.
df = pd.read_excel(dataset_path, header=0).loc[:, feature_columns + [cls]]

In [5]:
def _compose_text(row):
    """Join the nine context columns of one row into a single model input string.

    The three "a > b > c" groups are separated by the literal markers
    ``[pred]`` and ``[succ]``.
    """
    return f"{row['predwbs2']} > {row['predwbs']} > {row['predtask']} [pred] {row['wbs2']} > {row['wbs']} > {row['name']} [succ] {row['sucwbs2']} > {row['sucwbs']} > {row['suctask']}"


df['text'] = df.apply(_compose_text, axis=1)

# Preview one composed input.
print_idx = 6538  # 'Rain Water Drainage' case in Table 4
df.loc[print_idx, 'text']

'CONSTRUCTION SUPERSTRUCTURE > Roof > Set Mechanical Equipment [pred] CONSTRUCTION SUPERSTRUCTURE > Roof > Pipe Mechanical Equipment [succ]  100 KINGSHIGHWAY > COMMISSIONNG & INSPECTIONS > STARTUP'

In [6]:
def _bracket_label(row):
    """Return the row's target class code wrapped in square brackets."""
    return f"[{row[cls]}]"


df['label'] = df.apply(_bracket_label, axis=1)

df.loc[print_idx, 'label']  # D2040: 'Rain Water Drainage' in ASTM Uniformat

'[D2040]'

In [7]:
from sklearn.model_selection import train_test_split

texts, labels = df['text'], df['label']

# 6p: "This dataset is further split into training, validation, and testing using a 60-20-20 distribution."
# Step 1: hold out 40% of the rows. Step 2: halve that hold-out into validation and test.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    texts, labels, test_size=0.4, random_state=rseed
)
validation_texts, test_texts, validation_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=rseed
)

# Split sizes, in train / validation / test order.
tuple(len(split) for split in (train_texts, validation_texts, test_texts))

(20469, 6823, 6824)

In [8]:
from transformers import BertTokenizer

# Pretrained checkpoint name; the classifier below is loaded from the same one.
model_name = 'bert-base-uncased'

# Load the tokenizer for that checkpoint and display its configuration.
tokenizer = BertTokenizer.from_pretrained(model_name)
tokenizer

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [9]:
def encode(texts):
    """Tokenize ``texts`` into PyTorch tensors.

    Sequences are padded to a common length and truncated to at most
    512 tokens. Returns the tokenizer's output unchanged.
    """
    tokenizer_options = dict(padding=True, truncation=True, max_length=512, return_tensors="pt")
    return tokenizer(texts, **tokenizer_options)


# Sanity check: token ids of the example row.
encode(df.loc[print_idx, 'text'])['input_ids']

tensor([[  101,  2810, 28391,  1028,  4412,  1028,  2275,  6228,  3941,  1031,
          3653,  2094,  1033,  2810, 28391,  1028,  4412,  1028,  8667,  6228,
          3941,  1031, 10514,  9468,  1033,  2531,  5465,  4048,  5603,  4576,
          1028,  3222,  3070,  1004, 29589,  1028, 22752,   102]])

In [10]:
# Tokenize each split as a whole and move the resulting tensors to `device`.
train_encodings, validation_encodings, test_encodings = (
    encode(split_texts.tolist()).to(device)
    for split_texts in (train_texts, validation_texts, test_texts)
)

In [11]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder

# Fit on the full label column so every split shares one class -> integer id
# mapping. `df['label']` is read directly rather than the global `labels`,
# which later cells may rebind to a batch tensor.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['label'])


def _labels_to_tensor(split_labels):
    """Encode one split's label strings with the fitted encoder.

    Returns a ``torch.long`` tensor of class ids placed on ``device``.
    """
    return torch.tensor(label_encoder.transform(split_labels), dtype=torch.long).to(device)


# Encode each split from its own label Series, which train_test_split keeps
# aligned with that split's texts. Indexing `encoded_labels` with DataFrame
# index labels is only correct while `df` has a default 0..n-1 index.
train_labels_encoded = _labels_to_tensor(train_labels)
validation_labels_encoded = _labels_to_tensor(validation_labels)
test_labels_encoded = _labels_to_tensor(test_labels)

In [12]:
def _make_dataset(encodings, targets):
    """Bundle token ids, attention mask and label ids into one TensorDataset."""
    # attention_mask is 1 for real tokens and 0 for padding tokens
    return TensorDataset(encodings['input_ids'], encodings['attention_mask'], targets)


train_dataset = _make_dataset(train_encodings, train_labels_encoded)
validation_dataset = _make_dataset(validation_encodings, validation_labels_encoded)
test_dataset = _make_dataset(test_encodings, test_labels_encoded)

In [13]:
# Only the training loader shuffles; the two evaluation loaders keep dataset order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)
validation_loader, test_loader = (
    DataLoader(dataset=eval_dataset, shuffle=False, batch_size=batch_size)
    for eval_dataset in (validation_dataset, test_dataset)
)

In [14]:
from transformers import BertForSequenceClassification

# One output logit per distinct class known to the fitted label encoder.
num_classes = len(label_encoder.classes_)

# Forward `dropout_rate` to the model config so the hyperparameter declared in
# the config cell actually takes effect. It was previously unused, so editing
# it changed nothing. With dropout_rate = 0.1 this matches the library default.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    hidden_dropout_prob=dropout_rate,
    attention_probs_dropout_prob=dropout_rate,
)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [15]:
# Adam over every model parameter; only the learning rate is set explicitly.
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 1.5e-05
    maximize: False
    weight_decay: 0
)

In [16]:
from tqdm import tqdm

# Fine-tune for `epoch_size` epochs, reporting the mean training loss and the
# validation accuracy after each epoch.
#
# Loop variables are named `batch_*` so they do not overwrite the
# notebook-level `labels` Series. Rebinding it to a tensor would break any
# later re-run of the label-encoding cell.
for epoch in range(epoch_size):
    # ---- Training ----
    model.train()
    total_loss = 0
    for batch_input_ids, batch_attention_mask, batch_labels in tqdm(train_loader):
        optimizer.zero_grad()
        # Passing `labels` makes the model return the classification loss.
        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
        )
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss}")

    # ---- Validation ----
    model.eval()
    correct_cnt = 0
    with torch.no_grad():
        for batch_input_ids, batch_attention_mask, batch_labels in validation_loader:
            # Only the logits are needed here, so no labels are passed.
            logits = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask).logits
            batch_predictions = torch.argmax(logits, dim=-1)
            correct_cnt += torch.sum(batch_predictions == batch_labels).item()

    print(f"Validation Accuracy: {correct_cnt / len(validation_dataset)}")

100%|██████████| 640/640 [02:16<00:00,  4.69it/s]


Epoch 1 | Train Loss: 1.679827809939161
Validation Accuracy: 0.9136743368019933


100%|██████████| 640/640 [02:04<00:00,  5.16it/s]


Epoch 2 | Train Loss: 0.3578847932512872
Validation Accuracy: 0.961014216620255


100%|██████████| 640/640 [02:04<00:00,  5.15it/s]


Epoch 3 | Train Loss: 0.15450622175994794
Validation Accuracy: 0.9777224094972886


100%|██████████| 640/640 [02:04<00:00,  5.15it/s]


Epoch 4 | Train Loss: 0.08717628087179037
Validation Accuracy: 0.9822658654550784


100%|██████████| 640/640 [02:04<00:00,  5.15it/s]


Epoch 5 | Train Loss: 0.059930638829246166
Validation Accuracy: 0.9819727392642532


In [17]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Run the fine-tuned model over the test split, collecting the true and
# predicted class ids as plain Python ints.
model.eval()

true_labels = []
predictions = []

with torch.no_grad():
    # `batch_*` names keep the notebook-level `labels` Series intact.
    for batch_input_ids, batch_attention_mask, batch_labels in test_loader:
        logits = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask).logits
        batch_predictions = torch.argmax(logits, dim=1)

        predictions.extend(batch_predictions.cpu().tolist())
        true_labels.extend(batch_labels.cpu().tolist())

## Experimental Results

### Overall

In [18]:
# Overall test-set metrics; precision, recall and F1 use 'weighted' averaging.
accuracy = accuracy_score(true_labels, predictions)
weighted_scores = precision_recall_fscore_support(
    true_labels,
    predictions,
    average='weighted',
    zero_division=0,
)
# The fourth element (support) is not used.
precision, recall, f1 = weighted_scores[:3]

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Test Accuracy: 0.9823
Precision: 0.9823
Recall: 0.9823
F1 Score: 0.9811


### Class-wise

In [19]:
# Per-class metrics.
#
# With average=None, scikit-learn returns one score per label in the sorted
# union of true and predicted labels. That label list is built once and used
# for both the metric call and the class names, so rows always line up, even
# when a class is predicted but never appears in the test labels.
evaluated_labels = sorted(set(true_labels) | set(predictions))

precision, recall, f1, _support = precision_recall_fscore_support(
    true_labels,
    predictions,
    labels=evaluated_labels,
    average=None,
    zero_division=0,
)

# Map integer class ids back to the original label strings, in the same order.
class_names = label_encoder.inverse_transform(evaluated_labels)

performance_df = pd.DataFrame({
    'Class': class_names,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1
})
performance_df.to_csv(f'experimental_results_rseed{rseed}.csv')
performance_df

Unnamed: 0,Class,Precision,Recall,F1 Score
0,[A1010],0.985075,0.985075,0.985075
1,[A1020],1.0,0.983607,0.991736
2,[A1030],0.8875,1.0,0.940397
3,[A2010],0.973684,1.0,0.986667
4,[A2020],0.846154,0.846154,0.846154
5,[B1010],0.998004,0.993049,0.99552
6,[B1020],1.0,0.946565,0.972549
7,[B2010],0.9875,1.0,0.993711
8,[B2020],1.0,1.0,1.0
9,[B2030],1.0,0.866667,0.928571
