In [1]:
import json
import argparse
import torch
import sys
import random
import gc
import os
import ast
import math
import numpy as np
import pandas as pd
import functools
from itertools import chain
from functools import partial
from pathlib import Path
 
from transformers import (AutoTokenizer, Trainer, TrainingArguments,
                          AutoModelForTokenClassification, DataCollatorForTokenClassification,
                          LongformerConfig, LongformerForTokenClassification, BitsAndBytesConfig)
from datasets import Dataset, features
from typing import Iterable, Any, Callable
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import peft
from peft import get_peft_config, get_peft_model, PeftModel, PeftConfig, LoraConfig, TaskType
from seqeval.metrics import recall_score, precision_score, f1_score

2024-04-21 09:02:31.158674: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Seed every random number generator (Python, NumPy, PyTorch) with the same value
def seed_everything(seed=42):
    """Seed Python's `random`, NumPy and PyTorch with one shared value.

    Args:
        seed (int): Value handed to each library's seeding function.
    """
    # Same order as before: standard library, then NumPy, then PyTorch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

# Seed all random number generators once, before any data is sampled.
SEED = 42
seed_everything(SEED)

In [3]:
import ctypes
# Handle to the C library; `clear_memory` calls `malloc_trim` through it.
# NOTE(review): "libc.so.6" presumably only resolves on Linux/glibc systems — confirm before running elsewhere.
libc = ctypes.CDLL("libc.so.6")
def clear_memory():
    """Release memory that is no longer referenced.

    The order matters: Python garbage is collected first, so that objects it
    was keeping alive are actually freed, and only afterwards are the CUDA
    cache and the C heap asked to give that memory back. Running the
    collector last (as before) left freshly freed memory cached until the
    next call.
    """
    # 1. Drop unreachable Python objects (including reference cycles).
    gc.collect()
    # 2. Release cached CUDA blocks that are no longer in use.
    torch.cuda.empty_cache()
    # 3. Ask the C allocator to return free heap memory to the OS
    #    (`libc` is the module-level ctypes handle).
    libc.malloc_trim(0)

In [4]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {DEVICE}")

Device: cuda


In [5]:
# Function to replace labels
def replace_labels(row, labels):
    """
    Keep only the labels listed in `labels`; every other entry becomes 'O'.

    Args:
        row (iterable): Labels of one document, in token order.
        labels (container): Labels that should be kept as they are.

    Returns:
        list: Same length and order as `row`, with non-kept labels set to 'O'.
    """
    cleaned = []
    for tag in row:
        cleaned.append(tag if tag in labels else 'O')
    return cleaned

# Function to load data
def load_data(labels):
    """
    Load every data source, merge them and restrict the label set.

    The competition training file and six extra datasets are read from fixed
    relative paths, concatenated into one DataFrame with a fresh 0..n-1 index,
    given sequential `document` ids, and every label that is not listed in
    `labels` is replaced by 'O' (via `replace_labels`).

    Args:
        labels (list[str]): Labels to keep; all other labels become 'O'.

    Returns:
        tuple: `(df, all_labels, label2id, id2label)` where
            - df (pd.DataFrame): the merged dataset,
            - all_labels (list[str]): sorted distinct labels left in `df`,
            - label2id (dict): label -> position in `all_labels`,
            - id2label (dict): position in `all_labels` -> label.
    """
    # The CSV stores its list-valued columns as Python literals, so they are
    # parsed back into real lists here.
    df_werner = pd.read_csv("extra-data/individual_labels/email_new.csv")
    df_werner = df_werner.rename(columns={'text': "full_text"})
    for col in ("tokens", "trailing_whitespace", "labels"):
        df_werner[col] = df_werner[col].apply(ast.literal_eval)

    df_mixtral = pd.read_json("extra-data/mixtral-8x7b-v1.json")

    train_data = pd.read_json("pii-detection-removal-from-educational-data/train.json")

    gemma_df = pd.read_json("extra-data/pii_dataset_Gemma.json")

    # Context manager so the file handle is closed (the bare open() leaked it).
    with open('extra-data/mpware_mixtral8x7b_v1.1-no-i-username.json', encoding="utf-8") as fh:
        df_mpware = pd.DataFrame(json.load(fh))
    # Keep only the columns the competition training data has.
    df_mpware = df_mpware[train_data.columns]

    df_pj = pd.read_json('extra-data/moredata_dataset_fixed.json')

    df_moth = pd.read_json('extra-data/pii_dataset_fixed.json')
    df_moth = df_moth.rename(columns={'text': "full_text"})

    # Combine dataframes; ignore_index gives the merged frame a clean 0..n-1 index.
    df = pd.concat(
        [train_data, gemma_df, df_mpware, df_pj, df_moth, df_werner, df_mixtral],
        ignore_index=True,
    )
    # The per-source document ids collide after merging, so renumber them.
    df['document'] = np.arange(len(df))
    df['labels'] = df['labels'].apply(replace_labels, args=(labels,))

    # Distinct labels in a single pass. Yields plain `str` values (the previous
    # reduce/np.unique chain was quadratic and returned NumPy strings).
    all_labels = sorted(set(chain.from_iterable(df['labels'])))

    # Create label indexes
    label2id = {label: index for index, label in enumerate(all_labels)}
    id2label = {index: label for index, label in enumerate(all_labels)}
    return df, all_labels, label2id, id2label

# Function to encode labels
def encode_labels(df):
    """
    Add one-hot indicator columns for the entity types found in each document
    and print how many documents contain each type.

    The entity type is the part of a label after the 'B-'/'I-' prefix. A
    document with no label other than 'O' is counted in the extra 'others'
    column.

    Args:
        df (pd.DataFrame): Frame with a `labels` column holding one list of
            label strings per document. It is not modified.

    Returns:
        tuple: `(df, label_classes, subtotal)` where
            - df (pd.DataFrame): copy of the input with `unique_labels`, one
              0/1 column per entity type, and `others`,
            - label_classes (list[str]): the entity types followed by 'others',
            - subtotal: document count of the LAST class in `label_classes`,
              i.e. the 'others' count (kept as-is for backward compatibility).
    """
    total = len(df)
    # Work on a copy so the caller's frame does not silently gain a column.
    df = df.copy()
    # sorted() makes the per-document list order deterministic.
    df["unique_labels"] = df["labels"].apply(
        lambda labels: sorted({label.split('-')[1] for label in labels if label != 'O'})
    )
    mlb = MultiLabelBinarizer()
    one_hot_encoded = mlb.fit_transform(df['unique_labels'])
    # concat(axis=1) aligns rows by index label, so the one-hot frame must
    # carry df's own index; a default RangeIndex would misalign (and introduce
    # NaN rows) whenever df is not indexed 0..n-1.
    one_hot_df = pd.DataFrame(one_hot_encoded, columns=mlb.classes_, index=df.index)
    df = pd.concat([df, one_hot_df], axis=1)
    df['others'] = df['unique_labels'].apply(lambda x: 1 if len(x) == 0 else 0)
    label_classes = list(mlb.classes_) + ['others']
    for col in label_classes:
        subtotal = df[col].sum()
        # Guard the percentage against an empty frame.
        percent = subtotal / total * 100 if total else 0.0
        print(f'{col}: {subtotal}  ({percent:.1f}%)')
    # `subtotal` is whatever the last loop iteration left behind: the 'others' count.
    return df, label_classes, subtotal

In [6]:
# Labels to keep when the data is loaded; `target` is passed to `load_data`,
# which replaces every label not listed here with 'O'.

# Alternative: uncomment this list to keep the full label set instead.
# target = [
#     'B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM',
#     'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM',
#     'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL'
# ]

target = ['B-EMAIL']

In [7]:
# Build the merged dataset together with the label list and the
# label <-> id lookup dictionaries.

df, all_labels, label2id, id2label = load_data(target)

df_werner data = 755
df_mixtral data = 2355
kaggle train data = 6807
gemma data =  5479
df_mpware data = 2692
df_pj data = 2000
df_moth data = 4434
all_labels = ['B-EMAIL', 'O']


In [8]:
# Print per-entity document counts, computed on a copy of `df`.
# NOTE(review): the third return value is the count of the last class that
# `encode_labels` reports ('others'), so the name `true_size` is misleading.
df_labels, label_classes, true_size = encode_labels(df.copy())

EMAIL: 9204  (37.5%)
others: 15318  (62.5%)


In [12]:
import pandas as pd

def split_df_by_sampling(df, n_samples, seed=None):
    """Split DataFrame into a random sample and the remaining rows.

    Args:
        df (pd.DataFrame): Frame to split. Its index labels are expected to be
            unique, because the remainder is obtained by dropping labels.
        n_samples (int): Number of rows to draw for the sample.
        seed (int | None): `random_state` passed to `DataFrame.sample`.

    Returns:
        tuple: `(samples_df, others_df)`; the two frames are disjoint and
        together contain every row of `df`.
    """
    samples_df = df.sample(n=n_samples, random_state=seed)
    # Drop by index *label*. The previous `df.index[samples_df.index]` used the
    # sampled labels as positions, which is only correct for a 0..n-1
    # RangeIndex and otherwise fails or removes the wrong rows.
    others_df = df.drop(index=samples_df.index)
    return samples_df, others_df

def downsample_df(df, false_size, n_valid_true=150, n_valid_false=200, seed=42):
    """Downsample DataFrame into training and validation datasets.

    Documents are divided into "true" (at least one label other than 'O') and
    "false" (only 'O'). Training receives all true documents except
    `n_valid_true` of them plus `false_size` false documents; validation
    receives the held-out true documents plus `n_valid_false` of the false
    documents that were not picked for training.

    Args:
        df (pd.DataFrame): Frame with a `labels` column (one list of label
            strings per document). It is not modified.
        false_size (int): Number of false documents placed in the training set.
        n_valid_true (int): True documents held out for validation.
        n_valid_false (int): False documents used for validation.
        seed (int): Random state used for every sampling step.

    Returns:
        tuple: `(train_df, valid_df)`, each carrying an extra boolean
        `is_labels` column.

    Raises:
        ValueError: If `df` has too few true or false documents for the
            requested sizes.
    """
    # Computed as a standalone mask so the caller's frame is left untouched.
    has_pii = df['labels'].apply(
        lambda labels: any(label != 'O' for label in labels)
    ).astype(bool)

    # Separate true and false documents; the fresh 0..n-1 index keeps the
    # sampling helper independent of the original index.
    true_labels = df[has_pii].assign(is_labels=True).reset_index(drop=True)
    false_labels = df[~has_pii].assign(is_labels=False).reset_index(drop=True)

    print(f"Number of true_labels = {len(true_labels)}")
    print(f"Number of false_labels = {len(false_labels)}")

    # Fail early with a clear message instead of an opaque pandas sampling error.
    if len(true_labels) < n_valid_true:
        raise ValueError(
            f"Need at least {n_valid_true} documents with labels for validation, "
            f"found {len(true_labels)}"
        )
    if false_size + n_valid_false > len(false_labels):
        raise ValueError(
            f"Need {false_size} + {n_valid_false} documents without labels, "
            f"found {len(false_labels)}"
        )

    # Everything except the held-out validation documents goes to training.
    n_samples_true = len(true_labels) - n_valid_true

    # Sample true documents: sample -> training, remainder -> validation.
    true_samples, true_others = split_df_by_sampling(true_labels, n_samples_true, seed=seed)
    print(f"true_samples = {len(true_samples)} true_others = {len(true_others)}")

    # Sample false documents: sample -> training; validation is drawn from the rest.
    false_samples, false_others = split_df_by_sampling(false_labels, false_size, seed=seed)
    false_others = false_others.sample(n=n_valid_false, random_state=seed)
    print(f"false_samples = {len(false_samples)} false_others = {len(false_others)}")

    # Training dataset = sampled true documents + sampled false documents
    train_df = pd.concat([true_samples, false_samples])
    # Validation dataset = held-out true documents + held-out false documents
    valid_df = pd.concat([true_others, false_others])
    return train_df, valid_df


In [13]:
# Number of documents without any target label to keep in the training set.
false_size = 12000

In [14]:
# Build the down-sampled training / validation split from a copy of the full dataset.
train_df, valid_df = downsample_df(df.copy(), false_size=false_size)

# Shuffle the training rows. The shuffle is seeded explicitly so that re-running
# this cell gives the same order regardless of how much global random state
# has been consumed by earlier cells.
train_df = train_df.sample(frac=1, random_state=SEED).reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)

# Uncomment for a quick smoke test on a small subset.
# train_df = train_df.sample(100)
# valid_df = valid_df.sample(100)

print(f"Number of train_df = {len(train_df)}")
print(f"Number of valid_df = {len(valid_df)}")
clear_memory()

Number of true_labels = 9204
Number of false_labels = 15318
true_samples = 9054 true_others = 150
false_samples = 12000 false_others = 200
Number of train_df = 21054
Number of valid_df = 350


In [15]:
def tokenize(example, tokenizer, label2id, max_length=4096):
    """
    Tokenize input examples and map labels to their corresponding IDs.

    The document text is rebuilt from its word tokens, every character is
    given the label of the word it belongs to (whitespace gets 'O'), and each
    tokenizer token then takes the label of its first non-whitespace
    character. Exactly one label is emitted per tokenizer token, so `labels`
    always has the same length as `input_ids`.

    Args:
        example (dict): Input example containing tokens, provided_labels, trailing_whitespace.
        tokenizer: Tokenizer object; called with `return_offsets_mapping=True`.
        label2id (dict): Mapping of labels to their corresponding IDs. Must contain 'O'.
        max_length (int): Sequences longer than this are truncated.

    Returns:
        dict: Tokenized example with input_ids, attention_mask, offset_mapping, labels, and length.
    """
    text_parts = []
    char_labels = []

    # Rebuild the text and a parallel per-character label list.
    for token, label, t_ws in zip(example["tokens"],
                                  example["provided_labels"],
                                  example["trailing_whitespace"]):
        text_parts.append(token)
        # Repeat label for each character in token
        char_labels.extend([label] * len(token))
        # A trailing space is part of the text but never part of an entity.
        if t_ws:
            text_parts.append(" ")
            char_labels.append("O")

    text = "".join(text_parts)

    # Tokenize, truncating to `max_length` tokens.
    tokenized = tokenizer(text, return_offsets_mapping=True, max_length=max_length, truncation=True)

    outside_id = label2id["O"]
    n_chars = len(char_labels)  # equals len(text) by construction
    token_labels = []

    for start_idx, end_idx in tokenized.offset_mapping:
        # An empty (0, 0) span carries no text (presumably a special token): label 'O'.
        if start_idx == 0 and end_idx == 0:
            token_labels.append(outside_id)
            continue
        # A token may begin with the space that precedes the word; skip it so
        # the label is read from the word's first character.
        if start_idx < n_chars and text[start_idx].isspace():
            start_idx += 1
        if start_idx < n_chars:
            # An unknown label falls back to 'O' instead of being dropped.
            token_labels.append(label2id.get(char_labels[start_idx], outside_id))
        else:
            # The token was only trailing whitespace at the very end of the
            # text. The old bare `except: continue` skipped it, which left
            # `labels` shorter than `input_ids`.
            token_labels.append(outside_id)

    # Return tokenized example with labels and length
    return {**tokenized, "labels": token_labels, "length": len(tokenized.input_ids)}


In [17]:
def compute_metrics(preds, all_labels, beta=5):
    """
    Compute precision, recall, and the F-beta score.

    Args:
        preds (tuple): Pair `(predictions, labels)`. `predictions` may hold
            class ids with the same shape as `labels`, or per-class scores
            with one extra trailing axis; scores are reduced with argmax.
        all_labels (list): List of all possible labels, indexed by class id.
        beta (int | float): Weight of recall relative to precision.

    Returns:
        dict: 'f1' (the F-beta score; the key name is kept because the
        training arguments select the best model by it), 'recall' and
        'precision'.

    Raises:
        Exception: Errors are no longer swallowed. Previously they were
            printed and None was returned.
    """
    predictions, labels = preds
    # Some models return a tuple of outputs; the first entry holds the scores.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Per-class scores have one more axis than the labels: pick the best class.
    # Without this step the scores themselves would be used as list indices.
    if predictions.ndim == labels.ndim + 1:
        predictions = predictions.argmax(axis=-1)

    # Remove ignored index (-100) from predictions and labels
    true_preds = []
    true_labels = []
    for pred_row, label_row in zip(predictions, labels):
        keep = label_row != -100
        true_preds.append([all_labels[p] for p in pred_row[keep]])
        true_labels.append([all_labels[l] for l in label_row[keep]])

    recall = recall_score(true_labels, true_preds)
    precision = precision_score(true_labels, true_preds)

    # F-beta; defined as 0 when precision and recall are both 0.
    beta_sq = beta ** 2
    denominator = beta_sq * precision + recall
    f_beta = (1 + beta_sq) * recall * precision / denominator if denominator > 0 else 0.0

    result = {
        'f1': f_beta,
        'recall': recall,
        'precision': precision
    }

    # Print result for debugging or monitoring
    print(f"result = {result}")

    return result

In [18]:
class ModelTrainer:
    """Fine-tune and evaluate a token-classification model on the PII data.

    Construction builds the training arguments and loads the tokenizer and
    base model from `self.model_path`; `train` fine-tunes and saves to
    `self.save_path`; `evaluate_saved_model` reloads that saved model and
    scores it.
    """

    def __init__(self, all_labels, label2id, id2label, num_train_samples=None):
        """
        Args:
            all_labels (list): All labels, indexed by class id.
            label2id (dict): Label -> class id.
            id2label (dict): Class id -> label.
            num_train_samples (int | None): Size of the training set, used to
                derive the evaluation / save / logging interval. When None,
                the notebook-level `train_df` is measured (previous behavior).
        """
        # Initialize with labels and paths
        self.all_labels = all_labels
        self.label2id = label2id
        self.id2label = id2label
        self.model_path = "models/longformer_foundational/"
        self.save_path = "models/longformer_foundational_email/"
        self.num_proc = 5

        # Hyperparameters
        self.learning_rate = 2e-5
        self.num_train_epochs = 3
        self.batch_size = 1
        self.grad_steps = 16

        if num_train_samples is None:
            # Backward-compatible fallback to the global training frame.
            num_train_samples = len(train_df)
        # Samples divided by the effective batch size (batch size x accumulation steps).
        steps = num_train_samples // (self.batch_size * self.grad_steps)
        # Evaluate / save / log four times per `steps`, but never with an
        # interval of 0, which small datasets would otherwise produce.
        interval = max(1, steps // 4)

        # Training arguments
        self.training_args = TrainingArguments(
            output_dir="./models",
            gradient_accumulation_steps=self.grad_steps,
            # Mixed precision is only requested when a GPU is available.
            fp16=torch.cuda.is_available(),
            learning_rate=self.learning_rate,
            num_train_epochs=self.num_train_epochs,
            per_device_train_batch_size=self.batch_size,
            per_device_eval_batch_size=self.batch_size,
            report_to="none",
            evaluation_strategy="steps",
            eval_steps=interval,
            do_eval=True,
            save_strategy="steps",
            save_steps=interval,
            save_total_limit=2,
            logging_steps=interval,
            lr_scheduler_type='linear',
            load_best_model_at_end=False,
            metric_for_best_model="f1",
            greater_is_better=True,
            warmup_ratio=0.1,
            weight_decay=0.01
        )

        # Load the model
        self.load_model()

    def load_model(self):
        """Load the tokenizer and the base model with a classifier head sized for `all_labels`."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        config = LongformerConfig.from_pretrained(self.model_path)
        # NOTE(review): 'ignore_mismatched_sizes' is also passed to
        # from_pretrained below; storing it in the config looks unnecessary —
        # confirm before removing.
        config.update({
            'num_labels': len(self.all_labels),
            'id2label': self.id2label,
            'label2id': self.label2id,
            'ignore_mismatched_sizes': True,
        })
        # The checkpoint's classifier may have a different number of labels,
        # so mismatched weight shapes are tolerated and re-initialized.
        self.model = AutoModelForTokenClassification.from_pretrained(
            self.model_path,
            config=config,
            ignore_mismatched_sizes=True
        )
        print("Complete loading pretrained LLM model")

    def create_dataset(self, df):
        """Turn a DataFrame into a tokenized `Dataset` with one label id per token.

        Args:
            df (pd.DataFrame): Needs the columns full_text, document, tokens,
                trailing_whitespace and labels.

        Returns:
            Dataset: The input columns plus the fields added by `tokenize`.
        """
        ds = Dataset.from_dict({
            "full_text": df["full_text"].tolist(),
            # Passed as a plain list like every other column.
            "document": df["document"].astype('string').tolist(),
            "tokens": df["tokens"].tolist(),
            "trailing_whitespace": df["trailing_whitespace"].tolist(),
            "provided_labels": df["labels"].tolist()
        })
        tokenized_ds = ds.map(
            tokenize,
            fn_kwargs={"tokenizer": self.tokenizer, "label2id": self.label2id},
            num_proc=self.num_proc
        )
        return tokenized_ds

    def _build_trainer(self, model, tokenizer, eval_dataset, train_dataset=None):
        """Create a `Trainer` with the shared collator, arguments and metric function."""
        data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=512)
        return Trainer(
            model=model,
            args=self.training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
            tokenizer=tokenizer,
            compute_metrics=partial(compute_metrics, all_labels=self.all_labels),
        )

    def evaluate_saved_model(self, eval_df):
        """Reload the model and tokenizer from `self.save_path` and evaluate them.

        Args:
            eval_df (pd.DataFrame): Evaluation data (same columns as for `create_dataset`).

        Returns:
            dict: Metrics returned by `Trainer.evaluate`.
        """
        saved_model = AutoModelForTokenClassification.from_pretrained(self.save_path)
        saved_tokenizer = AutoTokenizer.from_pretrained(self.save_path)
        eval_ds = self.create_dataset(eval_df)
        # The saved tokenizer is now used for both padding and the Trainer.
        trainer = self._build_trainer(saved_model, saved_tokenizer, eval_ds)
        evaluation_result = trainer.evaluate()
        return evaluation_result

    def train(self, train_df, valid_df, resume=False):
        """Fine-tune the model, then save model and tokenizer to `self.save_path`.

        Args:
            train_df (pd.DataFrame): Training data.
            valid_df (pd.DataFrame): Validation data, evaluated during training.
            resume: Forwarded to `Trainer.train(resume_from_checkpoint=...)`.
        """
        training_ds = self.create_dataset(train_df)
        valid_ds = self.create_dataset(valid_df)
        trainer = self._build_trainer(self.model, self.tokenizer, valid_ds, train_dataset=training_ds)
        trainer.train(resume_from_checkpoint=resume)
        trainer.save_model(self.save_path)
        self.tokenizer.save_pretrained(self.save_path)
        print(f"Save the model to {self.save_path}")

In [19]:
# Build the trainer (its constructor loads the tokenizer and base model),
# then fine-tune on the down-sampled split and save the result.
trainer = ModelTrainer(all_labels, label2id, id2label)
trainer.train(train_df, valid_df)

Some weights of LongformerForTokenClassification were not initialized from the model checkpoint at models/longformer_foundational/ and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([13]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([13, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Complete loading pretrained LLM model


Map (num_proc=5):   0%|          | 0/21054 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/350 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss,F1,Recall,Precision
328,0.0395,0.000696,0.999564,1.0,0.988794
656,0.0011,0.000571,0.989371,0.989207,0.993496
984,0.0009,0.00049,0.999544,1.0,0.988267
1312,0.001,0.000948,0.99944,1.0,0.985638
1640,0.0008,0.00065,0.99944,1.0,0.985638
1968,0.0006,0.000423,0.999564,1.0,0.988794
2296,0.0005,0.000552,0.999398,1.0,0.984591
2624,0.0006,0.000321,0.999751,1.0,0.993566
2952,0.0004,0.000242,0.999751,1.0,0.993566
3280,0.0005,0.000336,0.999751,1.0,0.993566


result = {'f1': 0.9995643063134091, 'recall': 1.0, 'precision': 0.9887940234791889}
result = {'f1': 0.9893709777870043, 'recall': 0.9892066918510524, 'precision': 0.9934959349593496}
result = {'f1': 0.9995435684647302, 'recall': 1.0, 'precision': 0.9882666666666666}
result = {'f1': 0.9994398921273727, 'recall': 1.0, 'precision': 0.9856382978723405}
result = {'f1': 0.9994398921273727, 'recall': 1.0, 'precision': 0.9856382978723405}
result = {'f1': 0.9995643063134091, 'recall': 1.0, 'precision': 0.9887940234791889}
result = {'f1': 0.9993984276142469, 'recall': 1.0, 'precision': 0.9845908607863975}
result = {'f1': 0.9997509856816768, 'recall': 1.0, 'precision': 0.9935656836461126}
result = {'f1': 0.9997509856816768, 'recall': 1.0, 'precision': 0.9935656836461126}
result = {'f1': 0.9997509856816768, 'recall': 1.0, 'precision': 0.9935656836461126}
result = {'f1': 0.9997509856816768, 'recall': 1.0, 'precision': 0.9935656836461126}
result = {'f1': 0.9997509856816768, 'recall': 1.0, 'precision

In [20]:
# Reload the model that `trainer.train` saved to disk and score it on the validation split.
evaluation_result = trainer.evaluate_saved_model(valid_df)
print("Evaluation result:", evaluation_result)

Map (num_proc=5):   0%|          | 0/350 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


result = {'f1': 0.9997509856816768, 'recall': 1.0, 'precision': 0.9935656836461126}
Evaluation result: {'eval_loss': 0.00029711303068324924, 'eval_f1': 0.9997509856816768, 'eval_recall': 1.0, 'eval_precision': 0.9935656836461126, 'eval_runtime': 18.5082, 'eval_samples_per_second': 18.911, 'eval_steps_per_second': 18.911}
