Source: https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb#scrollTo=StbPlIyKDP9E

In [1]:
# Installing the transformers library and additional libraries if looking process

!pip install -q transformers

# Code for TPU packages install
# !curl -q https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

In [29]:
# Importing stock ML Libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
from collections import Counter
import ast

In [4]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.

from torch import cuda
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [75]:
def Most_common_label(label_list):
    """One-hot encode the most frequent label in ``label_list``.

    Reads two notebook-level globals: ``sort_order`` (label -> class index)
    and ``unique_labels`` (its length fixes the length of the result).
    When several labels are tied, ``Counter.most_common`` keeps the one
    that was seen first.

    Returns:
        list[int]: zeros everywhere except a single 1 at the winning
        label's class index.
    """
    top_label, _votes = Counter(label_list).most_common(1)[0]
    hot_position = sort_order[top_label]
    one_hot = [0] * len(unique_labels)
    one_hot[hot_position] = 1
    return one_hot


def Soft_label(label_list):
    """Turn a list of annotator labels into a per-class vote fraction.

    Reads two notebook-level globals: ``sort_order`` (label -> class index)
    and ``unique_labels`` (its length is the minimum length of the result).

    Returns:
        numpy.ndarray: for each class index, the number of times it occurs
        in ``label_list`` divided by ``len(label_list)``.
    """
    class_ids = []
    for annotation in label_list:
        class_ids.append(sort_order[annotation])
    vote_counts = np.bincount(class_ids, minlength=len(unique_labels))
    return vote_counts / len(label_list)

In [76]:
#########
DATASET = pd.read_parquet("hf://datasets/data-is-better-together/fineweb-c/dan_Latn/train-00000-of-00001.parquet")
PROBLEMATIC_CONTENT = False
LABEL_FUNCTION = Most_common_label
#########


# Keep only the three columns used below, and only the rows whose
# `problematic_content_label_present` flag equals PROBLEMATIC_CONTENT.
# The explicit .copy() makes `df` an independent frame, so adding the
# "Final_label" column further down does not write into a slice of DATASET
# (which pandas reports as SettingWithCopyWarning).
df = DATASET[["text", "educational_value_labels", "problematic_content_label_present"]]
df = df[df["problematic_content_label_present"] == PROBLEMATIC_CONTENT].copy()

# Every distinct label, in order of first appearance in the filtered data.
unique_labels = df["educational_value_labels"].explode().unique().tolist()

# Label -> class index. The index is the label's position in `unique_labels`,
# which is also what the label functions use to size their output vectors.
# Building the mapping from `unique_labels` itself (instead of looking up five
# hard-coded names) means a label that is absent from the data no longer raises
# ValueError here, and a label outside those five no longer raises KeyError
# inside LABEL_FUNCTION.
sort_order = {label: position for position, label in enumerate(unique_labels)}

# Process Data labels
df["Final_label"] = df["educational_value_labels"].apply(LABEL_FUNCTION)

# Display sample rows
df.sample(5)

Unnamed: 0,text,educational_value_labels,problematic_content_label_present,Final_label
935,Verdens militærbyrde steg ikke sidste år!\nUSA...,"[Minimal, Minimal, Minimal]",False,"[1, 0, 0, 0, 0]"
250,Den seneste tids meget varme vejr har gjort øe...,"[Minimal, Minimal, Basic]",False,"[1, 0, 0, 0, 0]"
54,Veterandampskibet S/S Bjørn skal på værft for ...,"[Minimal, Minimal]",False,"[1, 0, 0, 0, 0]"
80,Det mener Kurt Krogsgaard fra Business Consult...,"[Minimal, None]",False,"[1, 0, 0, 0, 0]"
211,Dine kinder synes lidt slapne? De mangler en s...,"[Basic, None, None]",False,"[0, 1, 0, 0, 0]"


In [88]:
# Two-column frame consumed by the Dataset class: raw text plus its label vector.
new_df = pd.DataFrame({"text": df["text"], "labels": df["Final_label"]})

# Show the label order that the positions of each label vector refer to.
unique_labels

['Minimal', 'None', 'Basic', 'Excellent', 'Good']

In [78]:
class MultiLabelDataset(Dataset):
    """Map-style dataset that tokenizes one text row on demand.

    Args:
        dataframe: pandas DataFrame with a ``text`` column and a ``labels``
            column (one label vector per row).
        tokenizer: callable tokenizer that returns a mapping with
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: length every encoded sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe["text"]
        self.targets = dataframe["labels"]
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Encode the row at position ``index``.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` as long tensors
            and ``targets`` as a float tensor.
        """
        # Positional lookup: `index` is a 0..len-1 position handed out by the
        # sampler, so it must not be interpreted as a DataFrame index label.
        # With a 0..n-1 index this is identical to the previous label lookup;
        # with any other index (e.g. a filtered frame) it no longer raises
        # KeyError or picks the wrong row.
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace (including newlines) to one space.
        text = " ".join(text.split())

        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`,
        # and `truncation=True` makes the cut at `max_len` explicit so every
        # item has exactly the same length and can be batched.
        inputs = self.tokenizer(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [83]:
# Configuration

# Key variables used later on in the training.
# MAX_LEN: sequence length handed to MultiLabelDataset as `max_len`.
MAX_LEN = 128
# TRAIN_SIZE: fraction of rows sampled into the training split.
TRAIN_SIZE = 0.8
# Batch sizes for the training and the testing DataLoader respectively.
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
# EPOCHS / LEARNING_RATE are not consumed in the cells shown here;
# presumably the training loop further down reads them - TODO confirm.
EPOCHS = 1
LEARNING_RATE = 1e-05
# NOTE(review): this loads the 'bert-base-uncased' tokenizer, while the dataset
# path above contains `dan_Latn` - confirm this tokenizer is the intended choice
# for that data and that it matches the model checkpoint used later.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [90]:
# Split `new_df` into train / test frames and wrap each one in a MultiLabelDataset

train_size = TRAIN_SIZE
# Sample the training rows first; every row that was not drawn becomes test data.
# Both frames then get a fresh 0..n-1 index.
train_data = new_df.sample(frac=train_size, random_state=200)
test_data = new_df.drop(index=train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print(f"FULL Dataset: {new_df.shape}")
print(f"TRAIN Dataset: {train_data.shape}")
print(f"TEST Dataset: {test_data.shape}")

training_set = MultiLabelDataset(dataframe=train_data, tokenizer=tokenizer, max_len=MAX_LEN)
testing_set = MultiLabelDataset(dataframe=test_data, tokenizer=tokenizer, max_len=MAX_LEN)

FULL Dataset: (806, 2)
TRAIN Dataset: (645, 2)
TEST Dataset: (161, 2)


In [92]:
# DataLoader settings for the training split: reshuffled on every pass.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# DataLoader settings for the testing split. Shuffling is switched off so that
# evaluation visits the rows in the same order as `test_data` on every run,
# which keeps collected predictions aligned with the frame and reproducible.
test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': False,
               'num_workers': 0
               }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)