# Step 1: Install and Import Python Libraries

In [1]:
# Install libraries
%pip install transformers[torch] datasets evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Librarie for Data processing and Visualization
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

# Step 2: Download and Explore Dataset

In [3]:
# Training Data
train_df = pd.read_json('/kaggle/input/2110531-data-science-2024-01/train_for_student.json',
                        orient='index', dtype='string')
train_df.head()

Unnamed: 0,Title,Abstract,Classes
1,Activated carbon derived from bacterial cellul...,© 2019 Elsevier B.V.Activated carbon derived f...,"['CHE', 'MATENG']"
2,The algorithm of static hand gesture recogniti...,© Springer International Publishing AG 2018.Te...,['CPE']
3,Alternative Redundant Residue Number System Co...,© 2018 IEEE.Residue number system (RNS) is a n...,['EE']
4,Comparative study of wax inhibitor performance...,© Published under licence by IOP Publishing Lt...,"['PE', 'ME', 'CHE']"
5,Undrained lower bound solutions for end bearin...,"© 2019 John Wiley & Sons, Ltd.The undrained be...","['CE', 'MATSCI']"


In [4]:
# Final Testing Data for Competitive Submission
test_df = pd.read_json('/kaggle/input/2110531-data-science-2024-01/test_for_student.json', orient='index', dtype='string')
test_df.head()

Unnamed: 0,Title,Abstract
001eval,Comparative Electrical Energy Yield Performanc...,© 2013 IEEE.Long-term energy evaluation of PV ...
002eval,Effects of graphene nanoplatelets on bio-based...,© The Author(s) 2021.Novel near-infrared (NIR)...
003eval,Anti-inflammatory action of two novel peptides...,© The Royal Society of Chemistry 2020.Peanut w...
004eval,Efficient all-and-one support vector machines ...,© 2018 IEEE.We introduce a new strategy to est...
005eval,Driver identification using histogram and neur...,© 2017 IEEE.Sensor technology has continuously...


In [5]:
class_label = ['CE','ENV','BME','PE','METAL','ME','EE','CPE','OPTIC','NANO','CHE','MATENG','AGRI','EDU','IE','SAFETY','MATH','MATSCI']

# Function: Transforms a given label into a binary one-hot encoded format based on a predefined list of class labels.
def transform_label(label):
    """
    Parameters:
        label (str): A string containing abbreviations of subjects. 
              Example: "['CE', 'ENV', 'ME']"
    Returns:
        pd.Series: A pandas Series containing a binary array of length 18, where each index corresponds 
                   to whether a specific class label was present in the input string.
                   Example: For input 'CE ENV', the result would be:
                   [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    """
    all_label = re.findall(string=label, pattern=r'(\w+)')
    transformed_label = np.zeros((18,), dtype='float')
    for i in range(len(class_label)):
        for l in all_label:
            if class_label[i] == l:
                transformed_label[i] = 1
                break
    return pd.Series(transformed_label)

# Function: Transforms a data set by converting class labels into a binary one-hot encoded format based on predefined class labels.
def transform_data_set(data_set):
    """
    Parameters:
        data_set (pd.DataFrame): A pandas DataFrame where one of the columns is 'Classes'. 
                                 The 'Classes' column contains strings of abbreviations of disciplines.
                            Example:
                                 | OtherColumn | Classes      |
                                 |-------------|--------------|
                                 |     ...     | ['CE','ENV'] |
                                 |     ...     | ['ME','EE']  |

    Returns:
        pd.DataFrame: A new pandas DataFrame where the 'Classes' column is replaced with 18 binary columns, 
                      each corresponding to a specific class label. These columns will have names matching 
                      the predefined class labels.
                    Example:
                      | OtherColumn | CE | ENV | BME | PE | METAL | ME | EE | ... |
                      |-------------|----|-----|-----|----|-------|----|----|-----|
                      |   ...       |  1 |  1  |  0  | 0  |   0   | 0  | 0  | ... |
                      |   ...       |  0 |  0  |  0  | 0  |   0   | 1  | 1  | ... |
    """

    transformed_labels = data_set['Classes'].apply(transform_label)
    data_set = data_set.drop(columns=['Classes'])
    new_data_set = pd.concat([data_set, transformed_labels], axis=1)
    map_col_name = {i: class_label[i] for i in range(len(class_label))}
    new_data_set.rename(columns=map_col_name, inplace=True)
    return new_data_set

In [6]:
# Transform Training data using function defined earlier
new_train_df = transform_data_set(train_df)
display(new_train_df.head())

Unnamed: 0,Title,Abstract,CE,ENV,BME,PE,METAL,ME,EE,CPE,OPTIC,NANO,CHE,MATENG,AGRI,EDU,IE,SAFETY,MATH,MATSCI
1,Activated carbon derived from bacterial cellul...,© 2019 Elsevier B.V.Activated carbon derived f...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,The algorithm of static hand gesture recogniti...,© Springer International Publishing AG 2018.Te...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Alternative Redundant Residue Number System Co...,© 2018 IEEE.Residue number system (RNS) is a n...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Comparative study of wax inhibitor performance...,© Published under licence by IOP Publishing Lt...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Undrained lower bound solutions for end bearin...,"© 2019 John Wiley & Sons, Ltd.The undrained be...",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Step 3: Splitting Data Set (Train Test Split)

In [7]:
from sklearn.model_selection import train_test_split

data_set = new_train_df.copy()

# Split the data into a temporary training set (60%) and a temporary test set (40%)
y_binned = np.digitize(np.sum(data_set.loc[:, 'CE':], axis=1), bins=[1,2,3,4,5,6,7])
train_set, test_set = train_test_split(data_set, test_size=0.4, random_state=42, stratify=y_binned)

# Split the temporary test set into validation (50%) and test (50%)
y_test_binned = np.digitize(np.sum(test_set.loc[:, 'CE':], axis=1), bins=[1,2,3,4,5,6,7])
val_set, test_set = train_test_split(test_set, test_size=0.5, random_state=42, stratify=y_test_binned)


print('Shape of train_set:', train_set.shape)
print('Shape of val_set:', val_set.shape)
print('Shape of test_set:', test_set.shape)

Shape of train_set: (272, 20)
Shape of val_set: (91, 20)
Shape of test_set: (91, 20)


# Step4: Preprocessing Data

In [8]:
# Function: Preprocess text
from nltk.corpus import stopwords
import re
import nltk
nltk.download("stopwords")

def text_preprocessing(s):
    """
    Performs text preprocessing on research paper titles and abstracts. The function cleans and normalizes the text by 
    applying several transformations aimed at preparing the text for further analysis.

    Preprocessing steps:
    - Lowercase the sentence
    - Replace contractions like "'t" with "not"
    - Remove "@name" mentions
    - Retain scientific formatting (hyphens, citations) but remove unnecessary special characters
    - Remove common stopwords, while preserving important words like "not" and "can"
    - Remove extra whitespace and trailing spaces

    Parameters:
        s (str): A string containing the text (title or abstract) that needs to be preprocessed.

    Returns:
        str: The preprocessed and cleaned text.
    """
    # Lowercase text
    s = s.lower()
    
    # Change 't to 'not'
    s = re.sub(r"\'t", " not", s)
    
    # Remove @name mentions
    s = re.sub(r'(@.*?)[\s]', ' ', s)
    
    # Isolate and keep parentheses (useful for citations), but remove punctuation like ! and ,
    s = re.sub(r'([\"\!\?\\\/\,])', r' ', s)
    
    # Retain scientific terms with hyphens and numbers, but remove other special characters
    s = re.sub(r'[^\w\s\-\(\)\[\]\.]', ' ', s)
    
    # Handle hyphens and parentheses properly (useful for preserving terms like "well-known" or citations)
    s = re.sub(r'\s-\s', '-', s)  # Remove spaces around hyphens
    s = re.sub(r'\s+', ' ', s).strip()  # Replace multiple spaces with a single space
    
    # Remove stopwords, but keep 'not' and 'can'
    s = " ".join([word for word in s.split() if word not in stopwords.words('english') or word in ['not', 'can']])
    
    # Remove any remaining trailing whitespaces
    return s

# Function: Preprocesses the text data in a dataset
def preprocess_data(data_set):
    """
    Preprocesses the text data in a dataset by combining title and abstract columns, applying text preprocessing,
    and keeping the relevant features.

    Parameters:
        data_set (pd.DataFrame): A pandas DataFrame that includes 'Title', 'Abstract', 
                                 and the one-hot encoded class label columns (e.g., 'CE', 'ENV', 'BME', etc.).

    Returns:
        pd.DataFrame: A processed DataFrame with the 'title_abstract' column replacing the 'Title' 
                      and 'Abstract' columns, along with the corresponding one-hot encoded class label columns.
    """

    data_set = data_set.copy()
    
    # Concatenate the 'Title' and 'Abstract' columns into a new column named 'title_abstract'.
    data_set['title_abstract'] = [title + ' ' + abstract for title, abstract in zip(data_set['Title'], data_set['Abstract'])]
    
    # Apply the 'text_preprocessing' function to the concatenated 'title_abstract' column.
    data_set['title_abstract'] = data_set['title_abstract'].apply(text_preprocessing)
    
    # Keep only the 'title_abstract' and one-hot encoded class label columns.
    data_set.drop(columns=['Title', 'Abstract'], inplace=True)
    data_set = pd.concat([data_set[['title_abstract']], data_set.loc[:, 'CE':'MATSCI']], axis=1)
    
    return data_set

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
train_set_pp = preprocess_data(train_set)
val_set_pp = preprocess_data(val_set)
test_set_pp = preprocess_data(test_set)

In [10]:
train_set_pp.head()

Unnamed: 0,title_abstract,CE,ENV,BME,PE,METAL,ME,EE,CPE,OPTIC,NANO,CHE,MATENG,AGRI,EDU,IE,SAFETY,MATH,MATSCI
304,highly active stable ni-incorporated spherical...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
86,drift-flux correlation gas-liquid two-phase fl...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
64,factors affecting mechanical properties variat...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
105,durable rechargeable zinc-air battery via self...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
394,heuristic based scheduling toothpaste filling ...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [11]:
train_set_pp.loc[304,'title_abstract']

'highly active stable ni-incorporated spherical silica catalysts co2methanation 2019 elsevier b.v.nickel-doped spherical silica (ssp) catalysts ca. 10 wt ni prepared via sol-gel method using cetyltrimethyl ammonium bromide structure directing agent different loading sequences ni si sources (si1_ni2 ni1_si2 ni_alt_si). comparison purposes ssp supported ni catalysts also prepared impregnation method (ni ssp (imp)). prepared catalysts showed spherical shape high specific surface area (357-868 m2 g). x-ray diffraction h2-temperature programmed reduction results revealed stronger interaction ni sio2 form nickel silicate ni-doped ssp catalysts except ni ssp (imp) nio species detected. reaction temperature 350 c co2 conversion order ni_alt_si (51 ) ni1_si2 (49 ) si1_ni2 (28 ) ni ssp (imp) (10 ) methane selectivity 80 95 . superior performances ni_alt_si catalyst correlated well higher electron density ni surface higher co2 adsorption ability revealed x-ray photoelectron spectroscopy co2-tempe

# Step 5: Convert Pandas Dataframe to Hugging Face Dataset

In [12]:
from datasets import Dataset

# Convert pandas dataframe to Hugging Face arrow dataset
train_set_hg = Dataset.from_pandas(train_set_pp)
val_set_hg = Dataset.from_pandas(val_set_pp)
test_set_hg = Dataset.from_pandas(test_set_pp)

In [13]:
# take a look at 'train_set_hg'
print(train_set_hg)

# Length of the Dataset
print(f'\nThe length of train_set_hg is {len(train_set_hg)}.\n')

# Check the first record of 'train_set_hg'
train_set_hg[0]

Dataset({
    features: ['title_abstract', 'CE', 'ENV', 'BME', 'PE', 'METAL', 'ME', 'EE', 'CPE', 'OPTIC', 'NANO', 'CHE', 'MATENG', 'AGRI', 'EDU', 'IE', 'SAFETY', 'MATH', 'MATSCI', '__index_level_0__'],
    num_rows: 272
})

The length of train_set_hg is 272.



{'title_abstract': 'highly active stable ni-incorporated spherical silica catalysts co2methanation 2019 elsevier b.v.nickel-doped spherical silica (ssp) catalysts ca. 10 wt ni prepared via sol-gel method using cetyltrimethyl ammonium bromide structure directing agent different loading sequences ni si sources (si1_ni2 ni1_si2 ni_alt_si). comparison purposes ssp supported ni catalysts also prepared impregnation method (ni ssp (imp)). prepared catalysts showed spherical shape high specific surface area (357-868 m2 g). x-ray diffraction h2-temperature programmed reduction results revealed stronger interaction ni sio2 form nickel silicate ni-doped ssp catalysts except ni ssp (imp) nio species detected. reaction temperature 350 c co2 conversion order ni_alt_si (51 ) ni1_si2 (49 ) si1_ni2 (28 ) ni ssp (imp) (10 ) methane selectivity 80 95 . superior performances ni_alt_si catalyst correlated well higher electron density ni surface higher co2 adsorption ability revealed x-ray photoelectron spe

In [14]:
# generates mappings between labels and their respective indices excluding certain fields like 'title_abstract' and '__index_level_0__'
labels = [label for label in train_set_hg.features.keys() if label not in ['title_abstract', '__index_level_0__']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

# Step 6: Text Tokenization

In [15]:
from transformers import AutoTokenizer

# Tokenizer from a pretrained model
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

# Take a look at the tokenizer
tokenizer

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



DistilBertTokenizerFast(name_or_path='distilbert/distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [16]:
# Funtion: tokenize hugging face dataset
def tokenize_dataset(hg_dataset):
    """
    Tokenizes and encodes a batch of text examples from a dataset, while also preparing the corresponding labels 
    for multi-label classification.

    Parameters:
        hg_dataset (dict): A hugging Face dataset, where each key is a column name (e.g., 'title_abstract', 'CE', 'ENV', etc.).
                            The "title_abstract" key contains the text data to be tokenized.
                            The remaining keys correspond to the one-hot encoded labels.

    Returns:
        dict: A dictionary containing the tokenized text with additional padding/truncation info and the labels matrix.
              The structure will include the tokenized text under keys like 'input_ids' and 'attention_mask', 
              and the labels matrix will be attached as "labels".
    """

    
    # Extract the 'title_abstract' column from the examples.
    text = hg_dataset["title_abstract"]
    
    # Tokenize the text using a tokenizer with padding and truncation, ensuring all sequences are of a fixed length (max_length=512).
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=512)
    
    # Extract the corresponding labels for each example, based on predefined label names.
    labels_batch = {k: hg_dataset[k] for k in hg_dataset.keys() if k in labels}
    
    # Create a binary matrix (shape: [batch_size, num_labels]) where each row corresponds to one text example and each column indicates whether the corresponding label is present or not.
    labels_matrix = np.zeros((len(text), len(labels)))
    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = labels_batch[label]
        
    # Attach the label matrix to the encoded tokenized output.
    encoding["labels"] = labels_matrix.tolist()
  
    return encoding

In [17]:
# Tokenize the hugging face datasets
train_set_tokenized = train_set_hg.map(tokenize_dataset, batched=True, remove_columns=train_set_hg.column_names)
val_set_tokenized = val_set_hg.map(tokenize_dataset, batched=True, remove_columns=val_set_hg.column_names)
test_set_tokenized = test_set_hg.map(tokenize_dataset, batched=True, remove_columns=test_set_hg.column_names)

Map:   0%|          | 0/272 [00:00<?, ? examples/s]

Map:   0%|          | 0/91 [00:00<?, ? examples/s]

Map:   0%|          | 0/91 [00:00<?, ? examples/s]

In [18]:
# Take a look at the tokenized datasets
print(train_set_tokenized)
print(val_set_tokenized)
print(test_set_tokenized)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 272
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 91
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 91
})


In [19]:
# Check the first record of 'train_set_tokenized'
example = train_set_tokenized[0]

print('Available keys:\n', example.keys())
print('\n')
print('decoded_input_id:\n', tokenizer.decode(example['input_ids']))
print('\n')
print('labels:\n', example['labels'])
print('\n')
print('mapped labels:\n', [id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0])

Available keys:
 dict_keys(['input_ids', 'attention_mask', 'labels'])


decoded_input_id:
 [CLS] highly active stable ni - incorporated spherical silica catalysts co2methanation 2019 elsevier b. v. nickel - doped spherical silica ( ssp ) catalysts ca. 10 wt ni prepared via sol - gel method using cetyltrimethyl ammonium bromide structure directing agent different loading sequences ni si sources ( si1 _ ni2 ni1 _ si2 ni _ alt _ si ). comparison purposes ssp supported ni catalysts also prepared impregnation method ( ni ssp ( imp ) ). prepared catalysts showed spherical shape high specific surface area ( 357 - 868 m2 g ). x - ray diffraction h2 - temperature programmed reduction results revealed stronger interaction ni sio2 form nickel silicate ni - doped ssp catalysts except ni ssp ( imp ) nio species detected. reaction temperature 350 c co2 conversion order ni _ alt _ si ( 51 ) ni1 _ si2 ( 49 ) si1 _ ni2 ( 28 ) ni ssp ( imp ) ( 10 ) methane selectivity 80 95. superior performances ni _ a

In [20]:
# Set format of the dataset to 'torch'
train_set_tokenized.set_format("torch")
val_set_tokenized.set_format("torch")
test_set_tokenized.set_format("torch")

# Step 7: Load Pretrained Model

In [21]:
from transformers import AutoModelForSequenceClassification

# Load pretrained model 'distilbert-base-uncased' from hugging face
model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Step 7-1: Set Training Arguments

In [22]:
from transformers import TrainingArguments

# Set up training arguments
batch_size = 16
metric_name = "f1"

args = TrainingArguments(
    output_dir=f"distilbert-base-uncased_finetuned",
    logging_strategy='epoch',
    logging_steps=100,
    eval_strategy = "epoch",
    eval_steps=100,
    save_strategy = "epoch",
    save_steps=100,
    learning_rate=1.75e-4,
    warmup_steps=50,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=25,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    seed=42,
    max_grad_norm=1.0
)

## Step 7-2: Set Evaluation Metrics

In [23]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
# Function: Calculates evaluation metrics for multi-label classification, including F1 score, ROC-AUC, and accuracy.
def multi_label_metrics(predictions, labels, threshold=0.5):
    """
    Parameters:
        predictions (np.ndarray): The predicted logits (unnormalized scores) for each label, typically of shape (batch_size, num_labels).
        labels (np.ndarray): The true labels, of the same shape as `predictions`.
        threshold (float, optional): The probability threshold used to classify the labels as 1 (positive). Defaults to 0.5.

    Returns:
        dict: A dictionary containing the following metrics:
            - 'f1': Macro-averaged F1 score.
            - 'roc_auc': Macro-averaged ROC-AUC score.
            - 'accuracy': Accuracy score.
    """
    
    # Apply a sigmoid activation to the predictions to convert them into probabilities (values between 0 and 1).
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    
    # Use a threshold (default is 0.5) to convert probabilities into binary predictions (0 or 1).
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    
    # Calculate and return the following metrics: F1 score (macro-averaged), ROC-AUC score (macro-averaged), and Accuracy
    y_true = labels
    f1_macro_average = f1_score(y_true=y_true, y_pred=y_pred, average='macro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'macro')
    accuracy = accuracy_score(y_true, y_pred)
    
    # return the result as dictionary
    metrics = {'f1': f1_macro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

# Function: Converts the predictions and labels from the `EvalPrediction` object into a format suitable for metric calculation
def compute_metrics(p: EvalPrediction):
    """
    Converts the predictions and labels from the `EvalPrediction` object into a format suitable for metric calculation 
    and invokes the `multi_label_metrics` function.

    Parameters:
        p (EvalPrediction): An object containing the predictions and true labels. 
                            The predictions can either be in a tuple or array format.

    Returns:
        dict: A dictionary containing the computed metrics (F1, ROC-AUC, Accuracy) based on the predictions and true labels.
    """
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(predictions=preds, labels=p.label_ids)
    return result

# Step 8: Train Model Using Transformer Trainer

In [24]:
from transformers import Trainer, EarlyStoppingCallback

# Set up the trainer paprameters
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_set_tokenized,
    eval_dataset=val_set_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

In [25]:
# Train the model using transformer trainer
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113655288889074, max=1.0…

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.6231,0.514085,0.0,0.5,0.0
2,0.4554,0.415082,0.0,0.5,0.0
3,0.3924,0.36887,0.088344,0.538373,0.087912
4,0.3507,0.344051,0.163217,0.565862,0.087912
5,0.3135,0.328679,0.29231,0.614795,0.10989
6,0.2806,0.31258,0.344213,0.631637,0.120879
7,0.2435,0.308861,0.375154,0.648455,0.120879
8,0.2067,0.303383,0.442554,0.676311,0.098901
9,0.1779,0.31055,0.399397,0.65413,0.10989
10,0.1517,0.300209,0.465895,0.693825,0.153846


TrainOutput(global_step=374, training_loss=0.17985053224997086, metrics={'train_runtime': 254.8459, 'train_samples_per_second': 26.683, 'train_steps_per_second': 1.668, 'total_flos': 792911096119296.0, 'train_loss': 0.17985053224997086, 'epoch': 22.0})

# Step 9: Evaluate Model on Test Set

In [26]:
# Take a look at test_set_tokenized
test_set_tokenized

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 91
})

In [27]:
# Evaluate the model using test_set_tokenized
trainer.evaluate(test_set_tokenized)

{'eval_loss': 0.31322360038757324,
 'eval_f1': 0.5221997883197579,
 'eval_roc_auc': 0.7280671250925363,
 'eval_accuracy': 0.0989010989010989,
 'eval_runtime': 0.8692,
 'eval_samples_per_second': 104.698,
 'eval_steps_per_second': 6.903,
 'epoch': 22.0}

# Step 10: Inference on Last Test Set for Submission

In [28]:
# Take a look at final test set for submission
test_df

Unnamed: 0,Title,Abstract
001eval,Comparative Electrical Energy Yield Performanc...,© 2013 IEEE.Long-term energy evaluation of PV ...
002eval,Effects of graphene nanoplatelets on bio-based...,© The Author(s) 2021.Novel near-infrared (NIR)...
003eval,Anti-inflammatory action of two novel peptides...,© The Royal Society of Chemistry 2020.Peanut w...
004eval,Efficient all-and-one support vector machines ...,© 2018 IEEE.We introduce a new strategy to est...
005eval,Driver identification using histogram and neur...,© 2017 IEEE.Sensor technology has continuously...
...,...,...
147eval,Utilization of Sewage Sludge from Beverage Ind...,© Published under licence by IOP Publishing Lt...
148eval,Development of a Gateway for OpenADR-ECHONET L...,"© 2018 IEEE.In this paper, we develop an ECHON..."
149eval,Effect of solution treatment and precipitation...,© 2017 Elsevier Ltd. All rights reserved.The a...
150eval,An effect-analysis method for species-dependen...,"© The Authors, published by EDP Sciences, 2019..."


In [29]:
# Preprocess, transform to hugging face dataset, tokenized the final test set
last_test_set_pp = preprocess_data(test_df)
last_test_set_hg = Dataset.from_pandas(last_test_set_pp)
last_test_set_tokenized = last_test_set_hg.map(lambda x: tokenizer(x['title_abstract'], padding="max_length", truncation=True, max_length=512),
                                              batched=True, remove_columns=last_test_set_hg.column_names)
last_test_set_tokenized

Map:   0%|          | 0/151 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 151
})

In [30]:
# Make prediction on final test set
y_last_prediction = trainer.predict(last_test_set_tokenized)
y_last_logits = y_last_prediction.predictions
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(torch.Tensor(y_last_logits))
y_last_pred = np.zeros(probs.shape)
y_last_pred[np.where(probs >= 0.5)] = 1
y_last_pred

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [31]:
# Construct the dataframe with prediction results for submission
y_last_pred_df = pd.DataFrame(y_last_pred, columns=labels, index=test_df.index).reset_index().rename(columns={'index':'id'})
y_last_pred_df

Unnamed: 0,id,CE,ENV,BME,PE,METAL,ME,EE,CPE,OPTIC,NANO,CHE,MATENG,AGRI,EDU,IE,SAFETY,MATH,MATSCI
0,001eval,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,002eval,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,003eval,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,004eval,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,005eval,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146,147eval,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
147,148eval,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
148,149eval,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
149,150eval,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Step 11: Save and Load Model

In [32]:
# Save tokenizer
tokenizer.save_pretrained('/kaggle/working/tokenizer')

# Save model
trainer.save_model('/kaggle/working/model')

In [33]:
# Load tokenizer
loaded_tokenizer = AutoTokenizer.from_pretrained('/kaggle/working/tokenizer')

#laod model
loaded_model = AutoModelForSequenceClassification.from_pretrained('/kaggle/working/model')

In [45]:
# Make Prediction on the first example of last test set 

last_test_set_tokenized.set_format('torch')

output = loaded_model(
    input_ids = last_test_set_tokenized['input_ids'][0].unsqueeze(0),
    attention_mask = last_test_set_tokenized['attention_mask'][0].unsqueeze(0)
)

output

SequenceClassifierOutput(loss=None, logits=tensor([[-4.7956, -7.2263, -6.1903, -4.4394, -3.5488, -4.5475,  5.1910,  1.0098,
         -5.2391, -7.2568, -4.3371, -2.7044, -7.4255, -5.4908, -0.4915, -6.3304,
         -2.7995, -7.1076]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [55]:
# Make Prediction on the first example of last test set 
logits = output.logits
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits)
pred = np.where(probs >= 0.5, 1, 0).reshape((-1,))
mapped_pred = [id2label[i] for i in range(len(pred)) if pred[i] == 1]
mapped_pred

['EE', 'CPE']