In [1]:
!python -V

Python 3.11.6


In [2]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
import pandas as pd

# Data should not be shared publicly.
df = pd.read_excel("E:/_datasets/0-overall-level3.xlsx", header=0)

cls = 'Level3'
df = df.loc[:, ['predwbs2', 'predwbs', 'predtask', 'wbs2',
                'wbs', 'name', 'sucwbs2', 'sucwbs', 'suctask', cls]]
df.iloc[0]

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


predwbs2    Apollo Test Flight Interior Finish
predwbs                                Level 2
predtask          Final Appliances Connections
wbs2        Apollo Test Flight Interior Finish
wbs                                    Level 3
name              Final Appliances Connections
sucwbs2     Apollo Test Flight Interior Finish
sucwbs                                 Level 3
suctask                    Execute Final Clean
Level3                                   E1090
Name: 0, dtype: object

In [4]:
df['sentence'] = df.apply(
    lambda row: f"[CLS] {row['predwbs2']} > {row['predwbs']} > {row['predtask']} [pred] {row['wbs2']} > {row['wbs']} > {row['name']} [succ] {row['sucwbs2']} > {row['sucwbs']} > {row['suctask']} [SEP] [{row[cls]}] [SEP]",
    axis=1
)

df['sentence'][0]

'[CLS] Apollo Test Flight Interior Finish > Level 2 > Final Appliances Connections [pred] Apollo Test Flight Interior Finish > Level 3 > Final Appliances Connections [succ] Apollo Test Flight Interior Finish > Level 3 > Execute Final Clean [SEP] [E1090] [SEP]'

In [5]:
df['text'] = df.apply(
    lambda row: f"[CLS] {row['predwbs2']} > {row['predwbs']} > {row['predtask']} [pred] {row['wbs2']} > {row['wbs']} > {row['name']} [succ] {row['sucwbs2']} > {row['sucwbs']} > {row['suctask']} [SEP]",
    axis=1
)

df['text'][0]

'[CLS] Apollo Test Flight Interior Finish > Level 2 > Final Appliances Connections [pred] Apollo Test Flight Interior Finish > Level 3 > Final Appliances Connections [succ] Apollo Test Flight Interior Finish > Level 3 > Execute Final Clean [SEP]'

In [6]:
df['label'] = df.apply(
    lambda row: f"[{row[cls]}]",
    axis=1
)

df['label'][0]

'[E1090]'

In [7]:
from sklearn.model_selection import train_test_split

texts = df['text']
labels = df['label']

# train:val:test = 60:20:20
train_texts, temp_texts, train_labels, temp_labels = train_test_split(texts, labels, test_size=0.4, random_state=42)
validation_texts, test_texts, validation_labels, test_labels = train_test_split(temp_texts, temp_labels, test_size=0.5, random_state=42)

(len(train_texts), len(validation_texts), len(test_texts))

(20469, 6823, 6824)

In [8]:
model_name = 'bert-base-uncased'

In [9]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

encoded_labels = label_encoder.fit_transform(labels)

train_labels_encoded = torch.tensor(encoded_labels[train_texts.index], dtype=torch.long)
validation_labels_encoded = torch.tensor(encoded_labels[validation_texts.index], dtype=torch.long)
test_labels_encoded = torch.tensor(encoded_labels[test_texts.index], dtype=torch.long)

tokenizer = BertTokenizer.from_pretrained(model_name)
tokenizer

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [10]:
def encode(texts):
    return tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)

train_encodings = encode(train_texts.tolist())
validation_encodings = encode(validation_texts.tolist())
test_encodings = encode(test_texts.tolist())

train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels_encoded)
validation_dataset = TensorDataset(validation_encodings['input_ids'], validation_encodings['attention_mask'], validation_labels_encoded)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels_encoded)

In [11]:
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [12]:
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(set(encoded_labels)))
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [13]:
optimizer = torch.optim.Adam(model.parameters(), lr=1.5e-5)
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 1.5e-05
    maximize: False
    weight_decay: 0
)

In [14]:
from tqdm import tqdm

EPOCHS = 5
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        batch = [b.to(device) for b in batch]
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        
        model.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss}")

    # Validation
    model.eval()
    eval_accuracy = 0
    nb_eval_steps = 0
    for batch in validation_loader:
        batch = [b.to(device) for b in batch]
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        labels_ids = inputs['labels']
        
        eval_accuracy += torch.sum(predictions == labels_ids).item()
        nb_eval_steps += 1
    
    print(f"Validation Accuracy: {eval_accuracy / len(validation_dataset)}")

100%|██████████| 640/640 [02:06<00:00,  5.06it/s]


Epoch 1 | Train Loss: 1.621441763290204
Validation Accuracy: 0.9186574820460208


 75%|███████▍  | 477/640 [01:36<00:34,  4.78it/s]

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model.eval()

true_labels = []
predictions = []

with torch.no_grad():
    for batch in test_loader:
        batch = [b.to(device) for b in batch]
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        
        outputs = model(**inputs)
        logits = outputs.logits
        pred_labels = torch.argmax(logits, dim=1)
        
        predictions.extend(pred_labels.cpu().numpy())
        true_labels.extend(batch[2].cpu().numpy())

Test Accuracy: 0.9174
Precision: 0.9065
Recall: 0.9174
F1 Score: 0.9047


  _warn_prf(average, modifier, msg_start, len(result))


## Experimental Results

### Overall

In [None]:
accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted', zero_division=0)

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Test Accuracy: 0.9174
Precision: 0.9065
Recall: 0.9174
F1 Score: 0.9047


### Class-wise

In [None]:
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average=None, zero_division=0)

class_names = label_encoder.inverse_transform(range(len(set(true_labels))))

performance_df = pd.DataFrame({
    'Class': class_names,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1
})
performance_df.to_csv('experimental_results.csv')
performance_df

Unnamed: 0,Class,Precision,Recall,F1 Score
0,[A1010],1.0,0.328358,0.494382
1,[A1020],0.425532,0.983607,0.594059
2,[A1030],0.568966,0.929577,0.705882
3,[A2010],0.904762,0.513514,0.655172
4,[A2020],0.0,0.0,0.0
5,[B1010],0.970958,0.996028,0.983333
6,[B1020],0.732919,0.900763,0.808219
7,[B2010],0.971631,0.990958,0.9812
8,[B2020],0.961538,0.996933,0.978916
9,[B2030],0.0,0.0,0.0
