# Training

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from transformers import XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig
from transformers import XLMForSequenceClassification, XLMTokenizer, XLMConfig
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig
from transformers import AlbertForSequenceClassification, AlbertTokenizer, AlbertConfig
from transformers import AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import compute_class_weight
from sklearn.metrics import recall_score, precision_score, f1_score
from torch import cuda
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix

## Parameters

In [2]:
DATA_SAVE_PATH = "./cleaned_data/discussion_data.csv"
SAVE_MODEL_TO_PATH = "./models/"
best_model_path = SAVE_MODEL_TO_PATH + 'best_model.pt'

#DATA_SAVE_PATH = "/kaggle/input/imap-classification/discussion_data.csv"
#SAVE_MODEL_TO_PATH = "/kaggle/working/"

PRETRAINED_MODELS = {
    'bert': 'bert-large-uncased',
    'roberta': 'roberta-base',
    'xlnet': 'xlnet-large-cased',
    'xlm': 'xlm-mlm-en-2048',
    'distilbert': 'distilbert-base-uncased',
    'albert':'albert-base-v2'
}

MODEL_CLASSES = {
    'bert': (BertForSequenceClassification, BertTokenizer, BertConfig),
    'roberta': (RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig),
    'xlnet': (XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig),
    'xlm': (XLMForSequenceClassification, XLMTokenizer, XLMConfig),
    'distilbert': (DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig),
    'albert':(AlbertForSequenceClassification,AlbertTokenizer, AlbertConfig)
}

MODEL_TYPE = 'roberta'
PRETRAINED_MODEL_NAME = PRETRAINED_MODELS[MODEL_TYPE]

model_class, tokenizer_class, config_class = MODEL_CLASSES[MODEL_TYPE]

BALANCED_WEIGHTS = False

LEARNING_RATE = 1e-5
BATCH_SIZE = 32
EPOCHS = 50
WEIGHT_DECAY = 0.01
HIDDEN_SIZE = 768
DROP_OUT = 0.3

## Loading preprocessed data

In [3]:
data = pd.read_csv(DATA_SAVE_PATH)

In [4]:
X = data['X']
y = data['y']

longest_train_data = max(X, key=lambda x: len(x.split()))
print('Longest utterance length:', len(longest_train_data.split()))

num_labels = len(set([label for label in y]))

all_labels = sorted(set([label for label in y]))
print('Labels:', all_labels)

Longest utterance length: 188
Labels: ['Deliberation', 'Imaginative Entry', 'Other', 'Procedure', 'Seminar', 'Social', 'UX']


## Tokenizing and creating dataloader

In [5]:
# One hot encoder
encoder = OneHotEncoder()
encoder = encoder.fit(np.array(all_labels).reshape(-1, 1))

In [6]:
# Load the pre-trained BERT model and tokenizer
tokenizer = tokenizer_class.from_pretrained(PRETRAINED_MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [7]:
max_length = min(2 ** (len(tokenizer.tokenize(longest_train_data))-1).bit_length(), 512)
print('Max chosen length:', max_length)

Max chosen length: 256


In [8]:
class CustomDataset(Dataset):
    def __init__(self, tokenizer, X, y, max_length, encoder):
        self.X = X
        self.tokenizer = tokenizer
        self.max_length = max_length
        # transform the labels into one-hot encoded format
        self.labels = encoder.transform(y)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        # Encode the utterance using the provided tokenizer
        encoding = self.tokenizer.encode_plus(
            self.X[idx],
            add_special_tokens=True,
            max_length = self.max_length,
            return_token_type_ids=True,
            padding='max_length',
            return_attention_mask=True,
            truncation=False,
            return_tensors='pt'
        )
        # Convert the list of strings into a one-hot encoded format
        label = self.labels[idx]  # This should now be a binary vector instead of a list of strings
        # Return the encoding and the label
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label.toarray().squeeze(), dtype=torch.float),
            'token_type_ids': encoding['token_type_ids'].flatten()
        }

In [9]:
# Divide into train, validation and test sets
data = data.sample(frac=1, random_state=42)

train_size = int(0.70 * len(data))
val_size = int(0.15 * len(data))
test_size = len(data) - train_size - val_size

train_dataset = data[:train_size]
val_dataset = data[train_size:train_size+val_size]
test_dataset = data[train_size+val_size:]

# Bootstrap the data
#train_dataset = train_dataset.sample(n=len(train_dataset) * 5, replace=True, random_state=42)
#val_dataset = val_dataset.sample(n=len(val_dataset) * 5, replace=True, random_state=42)
#test_dataset = test_dataset.sample(n=len(test_dataset) * 5, replace=True, random_state=42)

train_dataset = CustomDataset(tokenizer, train_dataset['X'].values, train_dataset['y'].values.reshape(-1, 1), max_length, encoder)
val_dataset = CustomDataset(tokenizer, val_dataset['X'].values, val_dataset['y'].values.reshape(-1, 1), max_length, encoder)
test_dataset = CustomDataset(tokenizer, test_dataset['X'].values, test_dataset['y'].values.reshape(-1, 1), max_length, encoder)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [10]:
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


## Defining Model

In [11]:
def load_ckp(checkpoint_fpath, model, encoder):
    checkpoint = torch.load(checkpoint_fpath, map_location=device)
    model.load_state_dict(checkpoint['state_dict'])
    encoder.set_params(**checkpoint['encoder'])
    return model, encoder

def save_ckp(state, best_model_path):
    torch.save(state, best_model_path)

In [12]:
class BERTClass(torch.nn.Module):
    def __init__(self, pretrained_model_name, num_labels):
        super(BERTClass, self).__init__()
        self.num_labels = num_labels
        self.l1 = model_class.from_pretrained(pretrained_model_name, num_labels=self.num_labels)
        self.pre_classifier = torch.nn.Linear(self.num_labels, HIDDEN_SIZE)
        self.dropout = torch.nn.Dropout(DROP_OUT)
        self.classifier = torch.nn.Linear(HIDDEN_SIZE, self.num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        output = output.view(-1, self.num_labels)  # Reshape the output
        return output

model = BERTClass(PRETRAINED_MODEL_NAME, num_labels)
model = model.to(device)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
if BALANCED_WEIGHTS:
    class_weights = compute_class_weight('balanced', classes=np.array(encoder.categories_).flatten(), y=y)
    weights = torch.tensor(class_weights, dtype=torch.float)
    if torch.cuda.is_available():
        weights = weights.to('cuda')
    criterium = torch.nn.CrossEntropyLoss(weight=weights)
else:
    criterium = torch.nn.CrossEntropyLoss()

def loss_fn(outputs, targets):
    return criterium(outputs, targets)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY, correct_bias=False)



## Training

In [14]:
def valid(model, valid_dataloader):
    val_targets = []
    val_outputs = []
    
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(valid_dataloader):
            input_ids = batch['input_ids'].to(device, dtype=torch.long)
            attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.float)

            outputs = model(input_ids, attention_mask, token_type_ids)

            loss = loss_fn(outputs, labels)
            val_loss += loss.item()

            val_targets.extend(labels.cpu().detach().numpy().tolist())
            val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
            

    val_loss /= len(valid_dataloader)
    
    return val_loss, val_targets, val_outputs

In [15]:
def train(model, train_dataloader):
    model.train()
    train_loss = 0
    for batch in tqdm(train_dataloader):
        input_ids = batch['input_ids'].to(device, dtype=torch.long)
        attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.float)

        model.zero_grad()
        outputs = model(input_ids, attention_mask, token_type_ids)

        loss = loss_fn(outputs, labels)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()

    train_loss /= len(train_dataloader)
    
    return train_loss

In [16]:
def train_model(num_epochs, train_dataloader, valid_dataloader, model, optimizer, best_model_path, patience=1):
    valid_loss_min = np.Inf

    num_not_improved = 0
    for epoch in range(1, num_epochs):
        print()
        print("#################### Epoch {}: Training Start    ####################".format(epoch))

        train_loss = train(model, train_dataloader)
        print('#################### Epoch {}: Training End      ####################'.format(epoch))

        print()
        print("#################### Epoch {}: Validation Start ####################".format(epoch))

        valid_loss, val_targets, val_outputs = valid(model, valid_dataloader)

        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(epoch, train_loss, valid_loss))

        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min, valid_loss))

            checkpoint = {
                        'state_dict': model.state_dict(),
                        'encoder' : encoder.get_params()
                    }

            save_ckp(checkpoint, best_model_path)
            valid_loss_min = valid_loss
            num_not_improved = 0
        else:
            num_not_improved += 1
            if num_not_improved >= patience:
                print('Not improvement for more than:', num_not_improved)
                break
            
        print("#################### Epoch {}: Validation End   ####################".format(epoch))
        print()

    print("#################### Training finished     ####################")
    return model


In [17]:
trained_model = train_model(EPOCHS, train_loader, val_loader, model, optimizer, best_model_path, patience=2)


#################### Epoch 1: Training Start    ####################


100%|██████████| 14/14 [00:11<00:00,  1.27it/s]


#################### Epoch 1: Training End      ####################

#################### Epoch 1: Validation Start ####################


100%|██████████| 3/3 [00:00<00:00,  3.72it/s]


Epoch: 1 	Training Loss: 1.964259 	Validation Loss: 1.845037
Validation loss decreased (inf --> 1.845037).  Saving model ...
#################### Epoch 1: Validation End   ####################


#################### Epoch 2: Training Start    ####################


100%|██████████| 14/14 [00:10<00:00,  1.37it/s]


#################### Epoch 2: Training End      ####################

#################### Epoch 2: Validation Start ####################


100%|██████████| 3/3 [00:00<00:00,  3.82it/s]


Epoch: 2 	Training Loss: 1.930679 	Validation Loss: 1.778749
Validation loss decreased (1.845037 --> 1.778749).  Saving model ...
#################### Epoch 2: Validation End   ####################


#################### Epoch 3: Training Start    ####################


100%|██████████| 14/14 [00:10<00:00,  1.37it/s]


#################### Epoch 3: Training End      ####################

#################### Epoch 3: Validation Start ####################


100%|██████████| 3/3 [00:00<00:00,  3.84it/s]


Epoch: 3 	Training Loss: 1.844712 	Validation Loss: 1.502709
Validation loss decreased (1.778749 --> 1.502709).  Saving model ...
#################### Epoch 3: Validation End   ####################


#################### Epoch 4: Training Start    ####################


100%|██████████| 14/14 [00:10<00:00,  1.37it/s]


#################### Epoch 4: Training End      ####################

#################### Epoch 4: Validation Start ####################


100%|██████████| 3/3 [00:00<00:00,  3.84it/s]


Epoch: 4 	Training Loss: 1.560642 	Validation Loss: 1.340738
Validation loss decreased (1.502709 --> 1.340738).  Saving model ...
#################### Epoch 4: Validation End   ####################


#################### Epoch 5: Training Start    ####################


100%|██████████| 14/14 [00:10<00:00,  1.37it/s]


#################### Epoch 5: Training End      ####################

#################### Epoch 5: Validation Start ####################


100%|██████████| 3/3 [00:00<00:00,  3.84it/s]


Epoch: 5 	Training Loss: 1.395806 	Validation Loss: 1.274178
Validation loss decreased (1.340738 --> 1.274178).  Saving model ...
#################### Epoch 5: Validation End   ####################


#################### Epoch 6: Training Start    ####################


100%|██████████| 14/14 [00:10<00:00,  1.37it/s]


#################### Epoch 6: Training End      ####################

#################### Epoch 6: Validation Start ####################


100%|██████████| 3/3 [00:00<00:00,  3.82it/s]


Epoch: 6 	Training Loss: 1.213425 	Validation Loss: 1.192441
Validation loss decreased (1.274178 --> 1.192441).  Saving model ...
#################### Epoch 6: Validation End   ####################


#################### Epoch 7: Training Start    ####################


100%|██████████| 14/14 [00:10<00:00,  1.37it/s]


#################### Epoch 7: Training End      ####################

#################### Epoch 7: Validation Start ####################


100%|██████████| 3/3 [00:00<00:00,  3.81it/s]


Epoch: 7 	Training Loss: 1.029111 	Validation Loss: 1.119023
Validation loss decreased (1.192441 --> 1.119023).  Saving model ...
#################### Epoch 7: Validation End   ####################


#################### Epoch 8: Training Start    ####################


100%|██████████| 14/14 [00:10<00:00,  1.37it/s]


#################### Epoch 8: Training End      ####################

#################### Epoch 8: Validation Start ####################


100%|██████████| 3/3 [00:00<00:00,  3.82it/s]


Epoch: 8 	Training Loss: 0.870088 	Validation Loss: 1.144156
#################### Epoch 8: Validation End   ####################


#################### Epoch 9: Training Start    ####################


100%|██████████| 14/14 [00:10<00:00,  1.37it/s]


#################### Epoch 9: Training End      ####################

#################### Epoch 9: Validation Start ####################


100%|██████████| 3/3 [00:00<00:00,  3.81it/s]


Epoch: 9 	Training Loss: 0.750739 	Validation Loss: 1.025767
Validation loss decreased (1.119023 --> 1.025767).  Saving model ...
#################### Epoch 9: Validation End   ####################


#################### Epoch 10: Training Start    ####################


100%|██████████| 14/14 [00:10<00:00,  1.37it/s]


#################### Epoch 10: Training End      ####################

#################### Epoch 10: Validation Start ####################


100%|██████████| 3/3 [00:00<00:00,  3.81it/s]


Epoch: 10 	Training Loss: 0.642849 	Validation Loss: 1.057945
#################### Epoch 10: Validation End   ####################


#################### Epoch 11: Training Start    ####################


100%|██████████| 14/14 [00:10<00:00,  1.36it/s]


#################### Epoch 11: Training End      ####################

#################### Epoch 11: Validation Start ####################


100%|██████████| 3/3 [00:00<00:00,  3.83it/s]


Epoch: 11 	Training Loss: 0.605125 	Validation Loss: 1.007127
Validation loss decreased (1.025767 --> 1.007127).  Saving model ...
#################### Epoch 11: Validation End   ####################


#################### Epoch 12: Training Start    ####################


100%|██████████| 14/14 [00:10<00:00,  1.36it/s]


#################### Epoch 12: Training End      ####################

#################### Epoch 12: Validation Start ####################


100%|██████████| 3/3 [00:00<00:00,  3.81it/s]


Epoch: 12 	Training Loss: 0.466132 	Validation Loss: 1.102941
#################### Epoch 12: Validation End   ####################


#################### Epoch 13: Training Start    ####################


100%|██████████| 14/14 [00:10<00:00,  1.37it/s]


#################### Epoch 13: Training End      ####################

#################### Epoch 13: Validation Start ####################


100%|██████████| 3/3 [00:00<00:00,  3.81it/s]

Epoch: 13 	Training Loss: 0.411825 	Validation Loss: 1.088620
Not improvement for more than: 2
#################### Training finished     ####################





In [18]:
best_model, encoder = load_ckp(best_model_path, model, encoder)

## Evaluating using test set

In [19]:
test_loss, test_labels , test_predictions_probs = valid(best_model, test_loader)

100%|██████████| 3/3 [00:00<00:00,  3.68it/s]


In [20]:
test_loss

1.0938159028689067

In [21]:
test_predictions = [ prob_list == np.max(prob_list) for prob_list in test_predictions_probs ]

In [22]:
print('Accuracy:', accuracy_score(test_labels, test_predictions))
print('Precision:', precision_score(test_labels, test_predictions, average='weighted'))
print('Recall:', recall_score(test_labels, test_predictions, average='weighted'))
print('F1:', f1_score(test_labels, test_predictions, average='weighted'))

report = classification_report(test_labels, test_predictions, target_names=np.array(encoder.categories_).flatten())
print(report)

Accuracy: 0.6236559139784946
Precision: 0.7172843577194665
Recall: 0.6236559139784946
F1: 0.6540632127728901
                   precision    recall  f1-score   support

     Deliberation       0.18      0.18      0.18        11
Imaginative Entry       0.50      0.60      0.55         5
            Other       0.33      1.00      0.50         1
        Procedure       0.31      0.57      0.40         7
          Seminar       0.95      0.72      0.82        57
           Social       0.60      0.75      0.67         8
               UX       0.14      0.25      0.18         4

        micro avg       0.62      0.62      0.62        93
        macro avg       0.43      0.58      0.47        93
     weighted avg       0.72      0.62      0.65        93
      samples avg       0.62      0.62      0.62        93



## Function to predict

In [23]:
def predict(model, tokenizer, sentence):
    model.eval()
    device = 'cuda' if cuda.is_available() else 'cpu'
    #sentence = parse(sentence)
    inputs = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length = max_length,
            return_token_type_ids=True,
            padding='max_length',
            return_attention_mask=True,
            truncation=False,
            return_tensors='pt'
        )
    

    input_ids = inputs['input_ids'].to(device, dtype=torch.long)
    attention_mask = inputs['attention_mask'].to(device, dtype=torch.long)
    token_type_ids = inputs['token_type_ids'].to(device, dtype=torch.long)

    outputs = model(input_ids, attention_mask, token_type_ids)

    outputs = torch.sigmoid(outputs).cpu().detach().numpy()
    threshold = 0.5
    outputs = test_predictions = [[prob > threshold for prob in prob_list] for prob_list in outputs ]
    
    outputs = encoder.inverse_transform(np.array(outputs))
    return sentence, outputs