In [35]:
import torch
import pickle
import os
import json
import numpy as np
from torch import nn
import torch.optim as optim
from transformers import *
from tqdm import trange
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
from util_loss import ResampleLoss
from transformers import AutoModelForSequenceClassification, AutoTokenizer,BertConfig
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import random
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder

In [2]:
class Config:
    """Paths and hyper-parameters shared by the notebook cells.

    Args:
        path_base: Root directory that the dataset and model paths are joined
            onto. Defaults to the original shared-drive location, so a bare
            ``Config()`` produces exactly the same attribute values as before;
            pass another directory to run the notebook on a different machine.
    """

    def __init__(self, path_base='/nfs/SSD-Data/winworkspace/HacktonMDA/'):
        # --- file locations, all derived from path_base ---
        self.path_base = path_base
        self.SEED = 42
        self.trainset = os.path.join(self.path_base, 'MDA---Hackathon', 'training_dict_219_new.pickle')
        self.out_trainset = os.path.join(self.path_base, 'dataset', 'Training_dataset.xlsx')
        self.source_dir = os.path.join(self.path_base, 'Model')

        # --- run identification / model and loss selection ---
        self.prefix = 'pubmed'
        self.suffix = 'gt2020.rand123'
        self.model_name = 'biobert_base'
        self.loss_func_name = 'BCE'

        # --- training hyper-parameters ---
        self.max_len = 512
        self.lr = 4e-4
        self.epochs = 50
        self.batch_size = 32

    # The former __getattribute__ override only forwarded to
    # object.__getattribute__, which is already the default lookup, so it was
    # removed; attribute access behaves the same.


config = Config()
        

In [3]:
# Read the labelled training spreadsheet configured in `config` and preview it.
df = pd.read_excel(io=config.out_trainset)
df

Unnamed: 0,Year,Quarter,Filename,Report_quality,ภาพรวม,สภาวะตลาด,สรุปการผลการดำเนินงาน,เหตุการณ์สำคัญ,ปัจจัยต่อการเติบโต,เปิดเผยรายได้พร้อมคำอธิบาย,เปิดเผยกำไรที่เปลี่ยนไปพร้อมคำอธิบาย,เปิดเผยรายได้,เปิดเผยกำไรที่เปลี่ยนไป,วิเคราะห์งบหรือฐานะทางการเงิน,รายงาน ESG
0,2021,1,ADVANC_2021048695,A,Y,Y,Y,Y,Y,Y,Y,,,Y,Y
1,2021,1,2S_2021057723,C,N,N,N,N,N,,,Y,Y,N,N
2,2021,1,3BBIF_2021052508,B,N,N,Y,Y,N,Y,Y,,,Y,N
3,2021,1,AMC_2021059653,B,N,N,Y,Y,N,Y,Y,,,Y,N
4,2021,1,BAM_2021055803,B,N,N,Y,Y,N,Y,Y,,,Y,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215,2023,3,DCON_84847201,B,Y,N,N,N,N,Y,Y,,,N,N
216,2023,3,DRT_84296701,A,Y,Y,Y,Y,Y,Y,Y,,,Y,Y
217,2023,3,GBX_84528501,C,N,N,N,N,N,Y,N,,,N,N
218,2023,3,KKC_84909601,C,Y,N,N,N,N,,,Y,N,Y,N


In [4]:
def set_seed(seed: int):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    Args:
        seed: Integer handed to every random number generator.

    When CUDA is available, all CUDA devices are seeded as well.
    """
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)

    # Nothing more to do on CPU-only machines.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

In [5]:
# Seed every random number generator once, using the value stored in the config.
SEED = config.SEED
set_seed(seed=SEED)

In [6]:
########## Configuration Part 1 ###########
# NOTE(review): source_dir is set to the working directory here rather than
# config.source_dir — confirm this is intended.
source_dir = './'
prefix = config.prefix
suffix = config.suffix
model_name = config.model_name
loss_func_name = config.loss_func_name

if model_name == 'biobert_base':
    # model_checkpoint = os.path.join(source_dir, 'Model', 'biobert-base-cased-v1.1.bin')
    model_checkpoint = 'dmis-lab/biobert-base-cased-v1.1'

# Use a context manager so the file handle is closed as soon as loading ends.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(config.trainset, 'rb') as train_file:
    data_train = pickle.load(train_file)

excluded_columns = ['Year', 'Quarter', 'Filename']
labels_ref = [col for col in df.columns if col not in excluded_columns]

# The spreadsheet stores flags as 'Y'/'N' strings, so counting them against the
# integer index [0, 1] always produced zeros. Translate the flags to 0/1 first
# (already-numeric 0/1 values pass through unchanged); any other value, such as
# the letter grades in Report_quality, maps to NaN and is ignored by
# value_counts, exactly as before.
flag_to_int = {'N': 0, 'Y': 1, 0: 0, 1: 1}

# One row per label column: [share of 0, share of 1] among non-missing values.
# NOTE(review): the (num_labels, 2) layout is kept from the original code —
# confirm it is what ResampleLoss expects for class_freq.
class_freq = np.array(
    [
        df[label]
        .map(flag_to_int)
        .value_counts(normalize=True)
        .reindex([0, 1], fill_value=0.0)
        .tolist()
        for label in labels_ref
    ],
    dtype=np.float32,
)


train_num = len(df)
num_labels = len(labels_ref)

max_len = config.max_len
lr = config.lr
epochs = config.epochs
batch_size = config.batch_size




In [7]:
# ########## set up ###########
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer from the local cache directory when it exists; otherwise
# download it from the model checkpoint and save it there. Previously the
# download/save step was commented out, so a fresh checkout without
# ./biobert-tokenizer could not run this cell.
tokenizer_dir = "./biobert-tokenizer"
if os.path.isdir(tokenizer_dir):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
else:
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=max_len)
    tokenizer.save_pretrained(tokenizer_dir)


# Build the classifier with one output per label column.
model_config = BertConfig.from_pretrained(model_checkpoint, num_labels=num_labels)
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    config=model_config,
    ignore_mismatched_sizes=True
)
model = nn.DataParallel(model).to(device)

# Two optimizer groups: parameters whose name contains 'bias' or
# 'LayerNorm.weight' get no weight decay, everything else decays at 0.01.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
optimizer = optim.AdamW(optimizer_grouped_parameters, lr=lr)


loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file config.json from cache at /home/thanawin/.cache/huggingface/hub/models--dmis-lab--biobert-base-cased-v1.1/snapshots/924f12e0c3db7f156a765ad53fb6b11e7afedbc8/config.json
Model config BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_

In [8]:
# ########## Configuration Part 2 ###########

# Keyword arguments for ResampleLoss, keyed by the supported loss_func_name
# values. class_freq and train_num are the same for every preset and are
# supplied once at construction time below.
loss_presets = {
    'BCE': dict(
        reweight_func=None, loss_weight=1.0,
        focal=dict(focal=False, alpha=0.5, gamma=2),
        logit_reg=dict(),
    ),
    'FL': dict(
        reweight_func=None, loss_weight=1.0,
        focal=dict(focal=True, alpha=0.5, gamma=2),
        logit_reg=dict(),
    ),
    'CBloss': dict(  # CB
        reweight_func='CB', loss_weight=5.0,
        focal=dict(focal=True, alpha=0.5, gamma=2),
        logit_reg=dict(),
        CB_loss=dict(CB_beta=0.9, CB_mode='by_class'),
    ),
    'R-BCE-Focal': dict(  # R-FL
        reweight_func='rebalance', loss_weight=1.0,
        focal=dict(focal=True, alpha=0.5, gamma=2),
        logit_reg=dict(),
        map_param=dict(alpha=0.1, beta=10.0, gamma=0.05),
    ),
    'NTR-Focal': dict(  # NTR-FL
        reweight_func=None, loss_weight=0.5,
        focal=dict(focal=True, alpha=0.5, gamma=2),
        logit_reg=dict(init_bias=0.05, neg_scale=2.0),
    ),
    'DBloss-noFocal': dict(  # DB-0FL
        reweight_func='rebalance', loss_weight=0.5,
        focal=dict(focal=False, alpha=0.5, gamma=2),
        logit_reg=dict(init_bias=0.05, neg_scale=2.0),
        map_param=dict(alpha=0.1, beta=10.0, gamma=0.05),
    ),
    'CBloss-ntr': dict(  # CB-NTR
        reweight_func='CB', loss_weight=10.0,
        focal=dict(focal=True, alpha=0.5, gamma=2),
        logit_reg=dict(init_bias=0.05, neg_scale=2.0),
        CB_loss=dict(CB_beta=0.9, CB_mode='by_class'),
    ),
    'DBloss': dict(  # DB
        reweight_func='rebalance', loss_weight=1.0,
        focal=dict(focal=True, alpha=0.5, gamma=2),
        logit_reg=dict(init_bias=0.05, neg_scale=2.0),
        map_param=dict(alpha=0.1, beta=10.0, gamma=0.05),
    ),
}

# Fail immediately on a misspelled name instead of leaving loss_func undefined
# (or left over from an earlier run of this cell).
if loss_func_name not in loss_presets:
    raise ValueError(
        "Unknown loss_func_name %r; expected one of %s"
        % (loss_func_name, sorted(loss_presets))
    )

loss_func = ResampleLoss(class_freq=class_freq, train_num=train_num,
                         **loss_presets[loss_func_name])

        


In [None]:
########## data preprocessing (one-off configuration based on the input data) ###########

# Sentence encoder passed to EncodingText when a document's text is embedded.
model_encode = SentenceTransformer(model_name_or_path='paraphrase-multilingual-MiniLM-L12-v2')

def EncodingText(model, sentence):
    """Encode one sentence with ``model``.

    The sentence is wrapped in a one-element list before being passed to
    ``model.encode``; the value returned by that call is handed back as is.

    Args:
        model: Object exposing an ``encode`` method that accepts a list.
        sentence: The single text to encode.

    Returns:
        Whatever ``model.encode`` returns for the one-element list.
    """
    single_item_batch = [sentence]
    return model.encode(single_item_batch)

class CustomDataset(Dataset):
    """Dataset that pairs a document's text embedding with its encoded labels.

    Args:
        excel_file: Path of the spreadsheet holding one row per document, with
            a ``Filename`` column and the label columns.
        data_dict: Mapping from ``Filename`` to the document text. Only rows
            whose ``Filename`` is a key of this mapping are served, so passing
            the train or validation subset yields the matching split.
        target_columns: Label columns to encode. Defaults to every column
            except ``Year``, ``Quarter`` and ``Filename``.

    Side effects:
        Writes the fitted label encoders to ``label_encoders.pkl`` in the
        current working directory on every construction.
    """

    def __init__(self, excel_file, data_dict, target_columns=None):
        import warnings

        full_table = pd.read_excel(excel_file)
        self.target_columns = target_columns or [col for col in full_table.columns
                                                  if col not in ["Year", "Quarter", "Filename"]]
        self.text_dict = data_dict

        # Fit on the complete spreadsheet (missing cells become the string '0')
        # so that every split shares one and the same label -> integer mapping.
        self.label_encoders = {col: LabelEncoder().fit(full_table[col].fillna('0'))
                               for col in self.target_columns}
        with open("label_encoders.pkl", "wb") as f:
            pickle.dump(self.label_encoders, f)

        # Restrict the rows to this split. Previously every row was kept no
        # matter what data_dict contained, so the train and validation datasets
        # iterated identical rows and out-of-split rows got placeholder text.
        in_split = full_table["Filename"].isin(set(data_dict))
        if in_split.any():
            self.data = full_table[in_split].reset_index(drop=True)
        else:
            # Keep the earlier behaviour rather than produce an empty dataset,
            # but make the mismatch visible.
            warnings.warn(
                "No 'Filename' value in %r matches a key of data_dict; "
                "keeping all rows with placeholder text." % (excel_file,)
            )
            self.data = full_table

    def __len__(self):
        """Number of rows served by this dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(text_vector, targets_tensor)`` for the row at ``idx``.

        ``text_vector`` is the result of ``EncodingText`` on the document text;
        ``targets_tensor`` is a float32 tensor with one encoded value per
        target column, in ``self.target_columns`` order.
        """
        row = self.data.iloc[idx]

        filename = row["Filename"]
        paragraph_text = self.text_dict.get(filename, None)

        # NOTE(review): any missing, empty or non-string entry is replaced by
        # the same placeholder. If data_dict stores something other than plain
        # strings, every row ends up with an identical embedding — verify the
        # value type stored in the pickle.
        if not paragraph_text or not isinstance(paragraph_text, str):
            paragraph_text = "Unknown text"

        text_vector = EncodingText(model_encode, paragraph_text)

        encoded_targets = []
        for col in self.target_columns:
            value = row[col]
            # Missing cells were fitted as the string '0'; encode them the same way.
            value = '0' if pd.isna(value) else value
            encoded_targets.append(self.label_encoders[col].transform([value])[0])
        targets_tensor = torch.tensor(encoded_targets, dtype=torch.float32)

        return text_vector, targets_tensor


    
########## start training + val ###########
# Split the document keys: 20% are held out for validation, the rest train.
keys = [*data_train.keys()]
train_keys, val_keys = train_test_split(keys, random_state=42, test_size=0.2)
train_docs = dict((key, data_train[key]) for key in train_keys)
val_docs = dict((key, data_train[key]) for key in val_keys)

# Both datasets read the same spreadsheet and differ only in the text
# dictionary they receive; only the training loader shuffles.
train_dataset = CustomDataset(config.out_trainset, train_docs)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_dataset = CustomDataset(config.out_trainset, val_docs)
validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)



loading configuration file config.json from cache at /home/thanawin/.cache/huggingface/hub/models--sentence-transformers--paraphrase-multilingual-MiniLM-L12-v2/snapshots/8d6b950845285729817bf8e1af1861502c2fed0c/config.json
Model config BertConfig {
  "_name_or_path": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.45.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 250037
}

loading weights file model.safetensors from cache at /home/thanawin/.cache/hugg

In [40]:
# Print the first two training batches as a quick sanity check, then stop.
for batch_no, (text_vector, targets) in enumerate(train_dataloader, start=1):
    print(f"Batch {batch_no}:")
    print("Text Vector (Embeddings):", text_vector)
    print("Targets:", targets)
    print("\n")
    if batch_no >= 2:
        break


Batch 1:
Text Vector (Embeddings): tensor([[[-0.2214, -0.1492, -0.0243,  ..., -0.1206, -0.0379,  0.1480]],

        [[-0.2214, -0.1492, -0.0243,  ..., -0.1206, -0.0379,  0.1480]],

        [[-0.2214, -0.1492, -0.0243,  ..., -0.1206, -0.0379,  0.1480]],

        ...,

        [[-0.2214, -0.1492, -0.0243,  ..., -0.1206, -0.0379,  0.1480]],

        [[-0.2214, -0.1492, -0.0243,  ..., -0.1206, -0.0379,  0.1480]],

        [[-0.2214, -0.1492, -0.0243,  ..., -0.1206, -0.0379,  0.1480]]])
Targets: tensor([[2., 0., 1., 0., 0., 0., 0., 0., 2., 2., 0., 0.],
        [0., 1., 2., 1., 1., 1., 2., 2., 0., 0., 1., 1.],
        [0., 1., 2., 1., 1., 1., 2., 2., 0., 0., 1., 1.],
        [1., 1., 2., 1., 1., 1., 2., 2., 0., 0., 1., 0.],
        [2., 0., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0.],
        [2., 0., 1., 0., 0., 0., 0., 0., 2., 1., 0., 0.],
        [1., 0., 1., 1., 0., 0., 2., 2., 0., 0., 1., 0.],
        [2., 0., 1., 0., 0., 0., 0., 0., 2., 2., 0., 0.],
        [1., 1., 2., 1., 0., 0., 2., 2.,

In [10]:
# best_f1_for_epoch = 0
# epochs_without_improvement = 0

# for epoch in trange(epochs, desc="Epoch"):
#     # Training
#     model.train()
#     tr_loss = 0
#     nb_tr_steps = 0
  
#     for _, batch in enumerate(train_dataloader):
#         batch = tuple(t.to(device) for t in batch)
#         b_input_ids, b_input_mask, b_labels = batch
#         optimizer.zero_grad()

#         outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
#         logits = outputs[0]
#         loss = loss_func(logits.view(-1,num_labels),b_labels.type_as(logits).view(-1,num_labels))
#         loss.backward()
#         optimizer.step()
#         tr_loss += loss.item()
#         nb_tr_steps += 1

#     print("Train loss: {}".format(tr_loss/nb_tr_steps))

#     # Validation
#     model.eval()
#     val_loss = 0
#     nb_val_steps = 0
#     true_labels,pred_labels = [],[]
    
#     for _, batch in enumerate(validation_dataloader):
#         batch = tuple(t.to(device) for t in batch)
#         b_input_ids, b_input_mask, b_labels = batch
#         with torch.no_grad():
#             # Forward pass
#             outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
#             b_logit_pred = outs[0]
#             pred_label = torch.sigmoid(b_logit_pred)
#             loss = loss_func(b_logit_pred.view(-1,num_labels),b_labels.type_as(b_logit_pred).view(-1,num_labels))
#             val_loss += loss.item()
#             nb_val_steps += 1
    
#             b_logit_pred = b_logit_pred.detach().cpu().numpy()
#             pred_label = pred_label.to('cpu').numpy()
#             b_labels = b_labels.to('cpu').numpy()

#         true_labels.append(b_labels)
#         pred_labels.append(pred_label)
    
#     print("Validation loss: {}".format(val_loss/nb_val_steps))

#     # Flatten outputs
#     true_labels = [item for sublist in true_labels for item in sublist]
#     pred_labels = [item for sublist in pred_labels for item in sublist]

#     # Calculate Accuracy
#     threshold = 0.5
#     true_bools = [tl==1 for tl in true_labels]
#     pred_bools = [pl>threshold for pl in pred_labels]
#     val_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')
#     val_precision_accuracy = precision_score(true_bools, pred_bools,average='micro')
#     val_recall_accuracy = recall_score(true_bools, pred_bools,average='micro')
    
#     print('F1 Validation Accuracy: ', val_f1_accuracy)
#     print('Precision Validation Accuracy: ', val_precision_accuracy)
#     print('Recall Validation Accuracy: ', val_recall_accuracy)

#     # Calculate AUC as well
#     val_auc_score = roc_auc_score(true_bools, pred_labels, average='micro')
#     print('AUC Validation: ', val_auc_score)
    
#     # Search best threshold for F1
#     best_med_th = 0.5
#     micro_thresholds = (np.array(range(-10,11))/100)+best_med_th
#     f1_results, prec_results, recall_results = [], [], []
#     for th in micro_thresholds:
#         pred_bools = [pl>th for pl in pred_labels]
#         test_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')
#         test_precision_accuracy = precision_score(true_bools, pred_bools,average='micro')
#         test_recall_accuracy = recall_score(true_bools, pred_bools,average='micro')
#         f1_results.append(test_f1_accuracy)
#         prec_results.append(test_precision_accuracy)
#         recall_results.append(test_recall_accuracy)

#     best_f1_idx = np.argmax(f1_results) #best threshold value

#     # Print and save classification report
#     print('Best Threshold: ', micro_thresholds[best_f1_idx])
#     print('Test F1 Accuracy: ', f1_results[best_f1_idx])

#     # Save the model if this epoch gives the best f1 score in validation set
#     if f1_results[best_f1_idx] > (best_f1_for_epoch * 0.995):
#         best_f1_for_epoch = f1_results[best_f1_idx]
#         epochs_without_improvement = 0
#         model_dir = os.path.join(source_dir, 'models')
#         for fname in os.listdir(model_dir):
#             if fname.startswith('_'.join([prefix,model_name,loss_func_name,suffix])):
#                 os.remove(os.path.join(model_dir, fname))
#         torch.save(model.state_dict(), os.path.join(model_dir, '_'.join([prefix,model_name,loss_func_name,suffix,'epoch'])+str(epoch+1)+'para'))
#     else:
#         epochs_without_improvement += 1
    
#     log_dir = os.path.join(source_dir, 'logs')
#     # Log all results in validation set with different thresholds
#     with open(os.path.join(log_dir, '_'.join([prefix,model_name,loss_func_name,suffix,'epoch'])+str(epoch+1)+'.json'),'w') as f:
#         d = {}
#         d["f1_accuracy_default"] =  val_f1_accuracy
#         d["pr_accuracy_default"] =  val_precision_accuracy
#         d["rec_accuracy_default"] =  val_recall_accuracy
#         d["auc_score_default"] =  val_auc_score
#         d["thresholds"] =  list(micro_thresholds)
#         d["threshold_f1s"] =  f1_results
#         d["threshold_precs"] =  prec_results
#         d["threshold_recalls"] =  recall_results
#         json.dump(d, f)
    
#     open(os.path.join(log_dir, '_'.join([prefix,model_name,loss_func_name,suffix,'epoch'])+str(epoch+1)+'.tmp'),'w').write('%s %s' % (micro_thresholds[best_f1_idx], f1_results[best_f1_idx]))

#     # If 5 epochs pass without improvement consider the model as saturated and exit
#     if epochs_without_improvement > 4:
#         break