## Import Libraries

In [None]:
import os
import pandas as pd
import pickle
import tensorflow as tf
import torch
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import trange

# !pip install transformers

## Set Parameters
These are the parameters that have to be set before starting the training.

In [None]:
# Name of the pre-trained checkpoint that is fine-tuned; it is passed to
# `from_pretrained` when the tokenizer and the model are loaded
BASE_MODEL = "distilbert-base-german-cased"
# BASE_MODEL = 'dbmdz/bert-base-german-cased'
# BASE_MODEL = 'bert-base-uncased'

# Model and data are stored in the directory <timestamp>_<MODEL_DIR_NAME_SUFFIX>
MODEL_DIR_NAME_SUFFIX = "MULT_DistilBERT"

# CSV file (relative to the working directory) that contains the training data
CSV_NAME = "labeled_messages_ground_truth_condensed.csv"
# CSV_NAME = "labeled_messages_ground_truth.csv"

# Max number of tokens that a message can be represented by; longer messages
# are truncated by the tokenizer.
# Due to memory constraints this may be restricted to somewhere between 100 and 300
# Keep in mind that this number may limit the portion of a message that is encoded
MAX_TOKEN_LENGTH = 280

# What portion of the dataset to allocate for testing (0.1 = 10 %)
TEST_SET_PROPORTION = 0.1

## Find Computing device

In [None]:
# device_name = tf.test.gpu_device_name()
# if device_name != "/device:GPU:0":
#     raise SystemError("GPU device not found")
# print("Found GPU at: {}".format(device_name))

In [None]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# get_device_name() raises on machines without CUDA, so only query it when a
# GPU was actually selected; otherwise report the CPU fallback
device_name = torch.cuda.get_device_name(0) if device.type == "cuda" else "cpu"
device_name

## Load and Preprocess Training Data

A test set is first held out from the dataset. The remaining messages are then tokenized and split into training and validation sets. 

The validation set will be used to monitor training. The test set can be used for evaluation.

In [None]:
# Load the labeled messages from the configured CSV file
df_messages = pd.read_csv(CSV_NAME)
# Shuffle the rows with a fixed seed (the same value the train/validation split
# uses) so that the train/test split derived from this order is reproducible
df_messages = df_messages.sample(frac=1, random_state=2020).reset_index(drop=True)
print("The dataset contains", len(df_messages), "labeled messages")
df_messages.head()

In [None]:
# The first BASE_COLUMNS_COUNT columns are treated as base columns; every
# column after them is treated as one misconception label
BASE_COLUMNS_COUNT = 2
all_columns = df_messages.columns
TOPICS_COUNT = len(all_columns) - BASE_COLUMNS_COUNT

base_columns = all_columns[:BASE_COLUMNS_COUNT].tolist()
misconception_columns = all_columns[BASE_COLUMNS_COUNT:].tolist()

In [None]:
# Column-wise sum of the label columns, i.e. one total per misconception
misconception_counts = df_messages[misconception_columns].sum()
print("\nMisconception Counts:", misconception_counts, sep="\n")

In [None]:
# Collapse the label columns into a single column that holds, per message,
# one integer vector with an entry for every misconception
label_matrix = df_messages[misconception_columns].astype("int").to_numpy()
df_messages["one_hot_labels"] = [label_row for label_row in label_matrix]
df_messages.head()

In [None]:
# Split data into train and test set.
#
# Rules (unchanged from the original row-by-row loop):
#   * a message whose label combination occurs only once always goes to the
#     train set;
#   * of the remaining messages, the first ones (in row order) go to the test
#     set until it holds at least `test_set_size` rows;
#   * everything else goes to the train set.
#
# The split is expressed with boolean masks because `DataFrame.append` was
# removed in pandas 2.0 and growing a frame row by row is quadratic.
test_set_size = len(df_messages) * TEST_SET_PROPORTION

# String form of each label vector, used as a hashable key for counting
label_keys = df_messages.one_hot_labels.astype(str)
label_combination_counts = label_keys.value_counts()
label_combinations_with_freq_one = label_combination_counts[
    label_combination_counts == 1
].keys()
has_unique_combination = label_keys.isin(label_combinations_with_freq_one)
label_combinations_with_freq_one_indices = list(
    df_messages[has_unique_combination].index
)
print(
    "df_messages label indices with only one instance: ",
    label_combinations_with_freq_one_indices,
)

# Rows that are allowed to end up in the test set
is_test_candidate = ~has_unique_combination
# 0-based position of each candidate among all candidates, in row order
candidate_rank = is_test_candidate.cumsum() - 1
# Same cut-off as the original `len(df_test) < test_set_size` check
is_test_row = is_test_candidate & (candidate_rank < test_set_size)

df_train = df_messages[~is_test_row].copy()
df_test = df_messages[is_test_row].copy()

df_train.to_csv("train.csv", index=False)
df_test.to_csv("test.csv", index=False)
print("Created train set of size", len(df_train), "and test set of size", len(df_test))

In [None]:
# Training targets (one label vector per message) and the message texts
one_hot_labels = list(df_train["one_hot_labels"].to_numpy())
messages_list = list(df_train["content"].to_numpy())

Load the pretrained tokenizer that corresponds to your choice of model, e.g.:

```
BERT:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) 

XLNet:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=False) 

RoBERTa:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=False)
```


In [None]:
from transformers import AutoTokenizer

# AutoTokenizer picks the tokenizer class that belongs to BASE_MODEL, so the
# tokenizer stays in sync with the model when a different checkpoint is
# configured (the model itself is also loaded through an Auto* class)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Encode all training messages: truncate to MAX_TOKEN_LENGTH tokens and pad
# every message to the longest one in the list
encodings = tokenizer(
    messages_list,
    max_length=MAX_TOKEN_LENGTH,
    padding=True,
    truncation=True,
)
print('tokenizer outputs: ', encodings.keys())

In [None]:
# Token ids of the encoded messages and the masks that mark real (non-padding)
# tokens, taken from the tokenizer output
input_ids, attention_masks = (
    encodings[field] for field in ("input_ids", "attention_mask")
)

Be sure to handle all classes during validation, e.g. by using "stratify" during the train/validation split. Note that the split below does not currently pass a `stratify` argument:

In [None]:
# Hold out 10 % of the encoded training messages as a validation set.
# train_test_split returns a (train, validation) pair for every array passed
# in, in the order the arrays were given.
split_parts = train_test_split(
    input_ids,
    one_hot_labels,
    attention_masks,
    test_size=0.10,
    random_state=2020,
)

# The model consumes torch tensors, so convert each part while unpacking
(
    train_inputs,
    validation_inputs,
    train_labels,
    validation_labels,
    train_masks,
    validation_masks,
) = [torch.tensor(part) for part in split_parts]

In [None]:
# Number of messages per batch; 32 is used here to avoid memory issues
batch_size = 32

# Bundle ids, attention masks and labels so that each dataset item is one
# (input_ids, attention_mask, labels) triple
train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)

# Training batches are drawn in random order, validation batches in dataset order
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

# The loaders hand the data to the training loop one batch at a time
train_dataloader = DataLoader(
    train_data, batch_size=batch_size, sampler=train_sampler
)
validation_dataloader = DataLoader(
    validation_data, batch_size=batch_size, sampler=validation_sampler
)

## Load Model & Set Params

Load the appropriate model below. Each model already contains a single dense layer on top for classification.



```
BERT:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=target_labels_count)

XLNet:
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=target_labels_count)

RoBERTa:
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=target_labels_count)
```



In [None]:
from transformers import AutoModelForSequenceClassification

# One output per misconception column
target_labels_count = len(misconception_columns)
# Load model, the pretrained model will include a single linear classification layer on
# top for classification.
model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    num_labels=target_labels_count,
)
# Move the model to the device selected earlier. An unconditional `.cuda()`
# fails on machines without a GPU even though `device` falls back to the CPU.
model.to(device)

# Persist the index <-> label-name mappings next to the model so predictions
# can be translated back to misconception names later. `pickle` comes from
# the notebook's import cell.
id2label = dict(enumerate(misconception_columns))
label2id = {label: index for index, label in id2label.items()}
with open("id2label.p", "wb") as id2label_file:
    pickle.dump(id2label, id2label_file)
with open("label2id.p", "wb") as label2id_file:
    pickle.dump(label2id, label2id_file)

Setting custom optimization parameters for the AdamW optimizer https://huggingface.co/transformers/main_classes/optimizer_schedules.html

In [None]:
# Setting custom optimization parameters. You may implement a scheduler here as well.
#
# Parameters are split into two groups: biases and LayerNorm parameters are
# excluded from weight decay, all other weights are decayed with 0.01.
param_optimizer = list(model.named_parameters())
# "gamma"/"beta" are legacy names for the LayerNorm scale/shift. Current
# Hugging Face models name these parameters "...LayerNorm.weight" or
# "..._layer_norm.weight", so those substrings are matched as well.
no_decay = ["bias", "gamma", "beta", "LayerNorm.weight", "layer_norm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        # The per-group option AdamW reads is "weight_decay". The previous key
        # "weight_decay_rate" is not an optimizer option, so the decay
        # configured here never took effect.
        "weight_decay": 0.01,
    },
    {
        "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]

In [None]:
# Use PyTorch's AdamW: the `transformers.AdamW` class is deprecated in favour
# of it and is no longer available in recent transformers releases.
# torch's AdamW always applies bias correction, which is what
# `correct_bias=True` requested before.
# `eps` and `weight_decay` are set to the defaults of the class being
# replaced; `weight_decay` here is only the fallback for parameter groups
# that do not set their own "weight_decay".
optimizer = torch.optim.AdamW(
    optimizer_grouped_parameters,
    lr=2e-5,
    eps=1e-6,
    weight_decay=0.0,
)

## Train Model

In [None]:
# Per-batch training losses, kept so the loss curve can be plotted afterwards
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 5

# Sigmoid probability above which a label counts as predicted during validation
threshold = 0.30

# Multi-label loss: BCEWithLogitsLoss treats every label as an independent
# binary decision and expects raw logits. It does not depend on the batch,
# so it is created once instead of on every training step.
loss_func = BCEWithLogitsLoss()

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

    # ---------------------------- Training ----------------------------

    # Set our model to training mode (as opposed to evaluation mode)
    model.train()

    # Tracking variables
    tr_loss = 0  # running loss
    nb_tr_examples, nb_tr_steps = 0, 0

    # Train the data for one epoch
    for step, batch in enumerate(train_dataloader):
        # Add batch to the selected device
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()

        # Forward pass for multilabel classification. The labels are not
        # passed to the model; the loss is computed here from the logits.
        outputs = model(b_input_ids, attention_mask=b_input_mask)
        logits = outputs[0]
        loss = loss_func(
            logits.view(-1, target_labels_count),
            b_labels.type_as(logits).view(-1, target_labels_count),
        )  # convert labels to float for calculation

        # Read the scalar loss once and reuse it for both bookkeeping uses
        batch_loss = loss.item()
        train_loss_set.append(batch_loss)

        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()

        # Update tracking variables
        tr_loss += batch_loss
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # --------------------------- Validation ---------------------------

    # Put model in evaluation mode to evaluate on the validation set
    model.eval()

    # Variables to gather full output
    logit_preds, true_labels, pred_labels, tokenized_texts = [], [], [], []

    # Predict
    for i, batch in enumerate(validation_dataloader):
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            # Forward pass; sigmoid turns each logit into a per-label probability
            outs = model(b_input_ids, attention_mask=b_input_mask)
            b_logit_pred = outs[0]
            pred_label = torch.sigmoid(b_logit_pred)

        # Keep only CPU copies so the collected outputs do not hold on to
        # device memory between batches
        tokenized_texts.append(b_input_ids.cpu())
        logit_preds.append(b_logit_pred.detach().cpu().numpy())
        true_labels.append(b_labels.cpu().numpy())
        pred_labels.append(pred_label.cpu().numpy())

    # Flatten outputs: one entry per validation message instead of per batch
    pred_labels = [item for sublist in pred_labels for item in sublist]
    true_labels = [item for sublist in true_labels for item in sublist]

    # Calculate validation metrics. accuracy_score on multi-label indicators
    # is subset accuracy: a message counts only if all its labels match.
    pred_bools = [pl > threshold for pl in pred_labels]
    true_bools = [tl == 1 for tl in true_labels]
    validation_f1_score = f1_score(true_bools, pred_bools, average="micro")
    validation_accuracy_score = accuracy_score(true_bools, pred_bools)

    print("\nTrain loss: {}".format(tr_loss / nb_tr_steps))
    print("F1-Score: ", validation_f1_score)
    print("Accuracy: ", validation_accuracy_score)

## Store Model

In [None]:
import shutil
from datetime import datetime

# Create model directory <timestamp>_<MODEL_DIR_NAME_SUFFIX>
model_dir_name = datetime.now().strftime("%Y-%m-%dT%H-%M-%S_") + MODEL_DIR_NAME_SUFFIX
os.mkdir(f"./{model_dir_name}/")

# Store the trained model directly in <model directory>/model (the same
# location the previous save-then-move ended up with)
model.save_pretrained(f"./{model_dir_name}/model")

# Archive a copy of the source dataset. It is copied rather than moved so the
# input file stays in place and the notebook can be run again from the top.
shutil.copy2(f"./{CSV_NAME}", f"./{model_dir_name}/{CSV_NAME}")

# Move the files generated by this run into the model directory
for artifact_name in ("train.csv", "test.csv", "id2label.p", "label2id.p"):
    shutil.move(f"./{artifact_name}", f"./{model_dir_name}/{artifact_name}")

In [None]:
# For when executing the notebook in Google Colab

# directory_to_store = "/content/" + model_dir_name
# shutil.make_archive(model_dir_name, "zip", directory_to_store)

# After mounting your Google Drive, move the files to drive/MyDrive using drag and drop
# from google.colab import drive
# drive.mount('/content/drive')