In [1]:
import json, argparse, torch, sys, random, gc, os, ast
import numpy as np
import pandas as pd
import functools
from itertools import chain
from functools import partial
from pathlib import Path

# Transformer 
from transformers import (AutoTokenizer, Trainer, TrainingArguments,
                          AutoModelForTokenClassification, DataCollatorForTokenClassification,
                          LongformerConfig, LongformerForTokenClassification, BitsAndBytesConfig)
from datasets import Dataset, features
from typing import Iterable, Any, Callable
from sklearn.model_selection import train_test_split
import peft
from peft import get_peft_config, get_peft_model, PeftModel, PeftConfig, LoraConfig, TaskType
import math

2024-04-20 09:36:36.023749: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Seed the same seed to all 
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch global random generators with `seed`."""
    # Applied in a fixed order: stdlib `random`, then NumPy, then torch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

# Single seed value: applied to the global RNGs here and reused as the
# random_state of the DataFrame sampling in split_df_by_sampling.
SEED = 42
seed_everything(SEED)

In [3]:
import ctypes

try:
    # glibc handle, needed only for malloc_trim; unavailable on non-glibc platforms.
    libc = ctypes.CDLL("libc.so.6")
except OSError:
    libc = None


def clear_memory():
    """Release as much host and GPU memory as possible.

    The garbage collector runs first so that objects it frees (including
    tensors kept alive only by reference cycles) can actually be returned
    by the two release calls that follow.
    """
    gc.collect()
    # Hand cached, unused CUDA blocks back to the driver.
    torch.cuda.empty_cache()
    # Ask glibc to return freed heap pages to the OS (skipped when libc is unavailable).
    if libc is not None and hasattr(libc, "malloc_trim"):
        libc.malloc_trim(0)

In [4]:
# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {DEVICE}")

Device: cuda


In [5]:
# 1-hot encoding 
from sklearn.preprocessing import MultiLabelBinarizer

def replace_labels(row, labels):
    """Return a copy of `row` where every label not found in `labels` becomes 'O'."""
    cleaned = []
    for tag in row:
        if tag in labels:
            cleaned.append(tag)
        else:
            cleaned.append('O')
    return cleaned
    

def load_data(labels):
    """Load every training corpus and merge them into one labelled DataFrame.

    Reads the per-label CSV files, the Kaggle training set and the extra
    JSON corpora from their fixed relative paths, concatenates them,
    renumbers the ``document`` column and maps every label that is not in
    `labels` to ``'O'``.

    Args:
        labels: collection of label strings to keep.

    Returns:
        Tuple ``(df, all_labels, label2id, id2label)``:
        the merged DataFrame, the sorted list of labels that actually occur
        in it, and the two lookup dicts built by enumerating that list.
        A requested label that never occurs in the data is therefore absent
        from `all_labels`.
    """
    werner_dir = "extra-data/individual_labels"
    # os.listdir returns names in arbitrary order; sorting makes the
    # keep='first' de-duplication below reproducible between runs.
    csv_files = sorted(f for f in os.listdir(werner_dir) if f.endswith(".csv"))

    frames = []
    for file in csv_files:
        temp = pd.read_csv(f"{werner_dir}/{file}").rename(columns={'text': "full_text"})
        # The CSVs store the per-token lists as Python literals; parse them back.
        for column in ("tokens", "trailing_whitespace", "labels"):
            temp[column] = temp[column].apply(ast.literal_eval)
        frames.append(temp)

    # Concatenate once (instead of growing a frame inside the loop), then
    # keep the first occurrence of each document id.
    if frames:
        df_werner = pd.concat(frames).drop_duplicates(subset=['document'], keep='first')
    else:
        df_werner = pd.DataFrame()
    print(f"df_werner data = {len(df_werner)}")

    df_mixtral = pd.read_json("extra-data/mixtral-8x7b-v1.json")
    print(f"df_mixtral data = {len(df_mixtral)}")

    train_data = pd.read_json("pii-detection-removal-from-educational-data/train.json")
    print(f"kaggle train data = {len(train_data)}")

    # Texts generated by Gemma
    gemma_df = pd.read_json("extra-data/pii_dataset_Gemma.json")
    print("gemma data = ", len(gemma_df))

    # Mixtral8x7B generated essays; the context manager closes the file handle.
    with open('extra-data/mpware_mixtral8x7b_v1.1-no-i-username.json') as handle:
        df_mpware = pd.DataFrame(json.load(handle))
    # Keep only the columns of the Kaggle training set, in the same order.
    df_mpware = df_mpware[train_data.columns]
    print(f"df_mpware data = {len(df_mpware)}")

    df_pj = pd.read_json('extra-data/moredata_dataset_fixed.json')
    print(f"df_pj data = {len(df_pj)}")
    df_pj = df_pj.rename(columns={'text': "full_text"})

    df_moth = pd.read_json('extra-data/pii_dataset_fixed.json')
    df_moth = df_moth.rename(columns={'text': "full_text"})
    print(f"df_moth data = {len(df_moth)}")

    # Combine everything into a single frame with a fresh 0..n-1 index.
    df = pd.concat([train_data, gemma_df, df_mpware, df_pj, df_moth, df_werner, df_mixtral],
                   ignore_index=True)
    # Source document ids overlap between corpora, so renumber them.
    df['document'] = np.arange(len(df))
    df['labels'] = df['labels'].apply(replace_labels, args=(labels,))

    # Unique labels present in the data, collected in a single pass.
    all_labels = sorted(set(chain.from_iterable(df['labels'])))
    print(f"all_labels = {all_labels}")

    # Index <-> label lookup tables
    label2id = {label: index for index, label in enumerate(all_labels)}
    id2label = {index: label for index, label in enumerate(all_labels)}
    return df, all_labels, label2id, id2label
    
# Encode labels to columns
def encode_labels(df):
    """One-hot encode which PII entity types occur in each document.

    Adds a ``unique_labels`` column to `df` in place (the entity names with
    the ``B-``/``I-`` prefix stripped), then returns a new frame that also
    has one 0/1 column per entity type plus an ``others`` column set to 1
    for documents without any entity. Per-class counts are printed.

    Args:
        df: DataFrame with a ``labels`` column holding a list of label
            strings per row.

    Returns:
        Tuple ``(df, label_classes, others_total)``: the extended frame,
        the list of one-hot column names (entity types followed by
        ``'others'``) and the number of documents flagged as ``others``.
    """
    total = len(df)
    # maxsplit=1 keeps entity names intact even if they contain a hyphen;
    # sorting makes the per-row list deterministic.
    df["unique_labels"] = df["labels"].apply(
        lambda labels: sorted({label.split('-', 1)[1] for label in labels if label != 'O'}))
    mlb = MultiLabelBinarizer()
    one_hot_encoded = mlb.fit_transform(df['unique_labels'])
    # Reuse df's index so the column-wise concat lines rows up even when
    # df does not carry a default 0..n-1 index.
    one_hot_df = pd.DataFrame(one_hot_encoded, columns=mlb.classes_, index=df.index)
    df = pd.concat([df, one_hot_df], axis=1)
    # Flag documents that contain no entity label at all.
    df['others'] = df['unique_labels'].apply(lambda x: 1 if len(x) == 0 else 0)
    label_classes = list(mlb.classes_) + ['others']
    for col in label_classes:
        subtotal = df[col].sum()
        percent = subtotal/total * 100
        print(f'{col}: {subtotal}  ({percent:.1f}%)')
    # Third return value, stated explicitly: the count of 'others' documents.
    others_total = df['others'].sum()
    return df, label_classes, others_total

In [6]:
# Labels passed to load_data(); every label outside this list is replaced by 'O'.
# Note that the list has no 'I-EMAIL' or 'I-USERNAME' entry, so those tags
# (if present in a corpus) are mapped to 'O' as well.
target = [
    'B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 
    'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 
    'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL'
]

In [7]:
# Build the merged corpus plus the label list and the label/id lookup dicts.
df, all_labels, label2id, id2label = load_data(target)

df_werner data = 4117
df_mixtral data = 2355
kaggle train data = 6807
gemma data =  5479
df_mpware data = 2692
df_pj data = 2000
df_moth data = 4434
all_labels = ['B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL', 'O']


In [8]:
# NOTE: the third value returned by encode_labels is the count of the last
# class it prints ('others', i.e. documents without any entity label),
# despite the `true_size` name used here.
df_labels, label_classes, true_size = encode_labels(df.copy())

EMAIL: 9204  (33.0%)
ID_NUM: 5417  (19.4%)
NAME_STUDENT: 14729  (52.8%)
PHONE_NUM: 9489  (34.0%)
STREET_ADDRESS: 11505  (41.3%)
URL_PERSONAL: 5832  (20.9%)
USERNAME: 6649  (23.8%)
others: 7228  (25.9%)


In [9]:
def split_df_by_sampling(df, n_samples):
    """Randomly draw `n_samples` rows and return them together with the rest.

    The draw uses the module-level SEED as random_state. Rows are matched
    through the 'document' column and removed from `df` by index label.

    Returns:
        Tuple ``(samples_df, others_df)``.
    """
    picked = df.sample(n=n_samples, random_state=SEED)
    # Rows whose document id was drawn ...
    drawn_mask = df['document'].isin(picked['document'])
    # ... are dropped to form the remainder (df itself is left untouched).
    remainder = df.drop(index=df.loc[drawn_mask].index)
    return picked, remainder

def downsample_df(df, false_size, n_valid_true=50, n_valid_false=200):
    """Split `df` into a training set and a small validation set.

    Documents are divided into those with at least one non-'O' label
    ("true") and those with only 'O' labels ("false"). The validation set
    receives `n_valid_true` true documents and up to `n_valid_false` false
    documents; the training set receives the remaining true documents and
    `false_size` false documents.

    Adds an ``is_labels`` boolean column to `df` in place; both returned
    frames carry that column.

    Args:
        df: DataFrame with ``labels`` and ``document`` columns.
        false_size: number of all-'O' documents to put in the training set
            (capped at the number available).
        n_valid_true: number of labelled documents held out for validation.
        n_valid_false: maximum number of all-'O' documents held out for
            validation.

    Returns:
        Tuple ``(train_df, valid_df)``.
    """
    df['is_labels'] = df['labels'].apply(
        lambda labels: any(label != 'O' for label in labels)).astype(bool)
    # At least one label is not 'O'
    true_labels = df[df['is_labels']].reset_index(drop=True)
    # All labels are 'O'
    false_labels = df[~df['is_labels']].reset_index(drop=True)
    print(f"Number of true_labels = {len(true_labels)}")
    print(f"Number of false_labels = {len(false_labels)}")

    # Everything except the validation hold-out goes to training; never negative.
    n_train_true = max(len(true_labels) - n_valid_true, 0)
    true_samples, true_others = split_df_by_sampling(true_labels, n_train_true)
    print(f"true_samples = {len(true_samples)} true_others = {len(true_others)}")

    # Cannot draw more all-'O' documents than exist.
    n_train_false = min(false_size, len(false_labels))
    false_samples, false_others = split_df_by_sampling(false_labels, n_train_false)
    # Down-sample the leftover only when there is more than requested;
    # seeded so the validation set is the same on every run.
    if len(false_others) > n_valid_false:
        false_others = false_others.sample(n=n_valid_false, random_state=SEED)
    print(f"false_samples = {len(false_samples)} false_others = {len(false_others)}")

    train_df = pd.concat([true_samples, false_samples])
    valid_df = pd.concat([true_others, false_others])
    return train_df, valid_df

In [10]:
# Number of all-'O' documents (no entity label) sampled into the training set.
false_size = 7000

In [11]:
train_df, valid_df = downsample_df(df.copy(),false_size=false_size)

# Shuffle the training rows; the explicit random_state keeps the order
# reproducible regardless of how much global RNG state was consumed before.
train_df = train_df.sample(frac=1, random_state=SEED).reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)

# Uncomment for a quick smoke run on a small subset:
# train_df = train_df.sample(100)
# valid_df = valid_df.sample(100)

print(f"Number of train_df = {len(train_df)}")
print(f"Number of valid_df = {len(valid_df)}")
clear_memory()

Number of true_labels = 20656
Number of false_labels = 7228
true_samples = 20606 true_others = 50
false_samples = 7000 false_others = 200
Number of train_df = 27606
Number of valid_df = 250


In [12]:
def tokenize(example, tokenizer, label2id, max_length=4096):
    """Tokenize one document and align its word-level labels to model tokens.

    The document text is rebuilt from ``tokens`` and ``trailing_whitespace``
    while a per-character label list is built alongside it. Each model
    token then takes the label of the first non-space character it covers.
    Exactly one label id is emitted per model token, so ``labels`` always
    has the same length as ``input_ids``.

    Args:
        example: mapping with ``tokens``, ``provided_labels`` and
            ``trailing_whitespace`` sequences of equal length.
        tokenizer: callable returning a mapping that exposes ``input_ids``
            and ``offset_mapping`` (presumably a fast Hugging Face
            tokenizer - confirm against the caller).
        label2id: mapping from label name to class id; must contain 'O'.
        max_length: inputs longer than this many tokens are truncated.

    Returns:
        Dict with every field produced by the tokenizer plus ``labels``
        (list of class ids) and ``length`` (number of input ids).
    """
    # Rebuild the text and a label for every character of it.
    pieces = []
    char_labels = []
    for token, label, t_ws in zip(example["tokens"],
                                  example["provided_labels"],
                                  example["trailing_whitespace"]):
        pieces.append(token)
        char_labels.extend([label] * len(token))
        # A trailing space belongs to no entity.
        if t_ws:
            pieces.append(" ")
            char_labels.append("O")

    text = "".join(pieces)
    # Tokenize with truncation to max_length tokens.
    tokenized = tokenizer(text, return_offsets_mapping=True, max_length=max_length,
                          truncation=True)

    outside_id = label2id["O"]
    n_chars = len(char_labels)
    token_labels = []
    for start_idx, end_idx in tokenized.offset_mapping:
        # (0, 0) offsets are labelled 'O'.
        if start_idx == 0 and end_idx == 0:
            token_labels.append(outside_id)
            continue
        # The token span starts on whitespace: use the next character instead.
        if start_idx < n_chars and text[start_idx].isspace():
            start_idx += 1
        if start_idx < n_chars:
            # Labels missing from label2id are treated as 'O'.
            token_labels.append(label2id.get(char_labels[start_idx], outside_id))
        else:
            # Past the end of the text (e.g. a token covering only the final
            # space): still emit a label so labels stay aligned with input_ids.
            token_labels.append(outside_id)

    return {**tokenized, "labels": token_labels, "length": len(tokenized.input_ids)}

In [13]:
from seqeval.metrics import recall_score, precision_score, f1_score

In [14]:
def compute_metrics(preds, all_labels, beta=5):
    """Compute entity-level recall, precision and an F-beta score.

    Args:
        preds: pair ``(predictions, labels)`` where `predictions` holds
            per-class scores with the class on axis 2 and `labels` holds the
            reference class ids; positions whose label is -100 are ignored.
        all_labels: sequence mapping a class id to its label name.
        beta: weight of recall relative to precision in the F-beta score.

    Returns:
        Dict with keys ``'f1'`` (the F-beta score; the key name is kept
        because callers select the best model by it), ``'recall'`` and
        ``'precision'``.

    Errors are no longer swallowed: returning None from a failed
    computation only postponed the failure to the caller.
    """
    predictions, labels = preds
    predictions = np.argmax(predictions, axis=2)

    # Drop ignored positions and translate ids to label names, row by row.
    true_preds = []
    true_labels = []
    for pred_row, label_row in zip(predictions, labels):
        kept = [(p, l) for p, l in zip(pred_row, label_row) if l != -100]
        true_preds.append([all_labels[p] for p, _ in kept])
        true_labels.append([all_labels[l] for _, l in kept])

    recall = recall_score(true_labels, true_preds)
    precision = precision_score(true_labels, true_preds)

    beta_sq = beta * beta
    denominator = beta_sq * precision + recall
    # Precision and recall both zero: define the score as 0 rather than dividing by zero.
    if denominator > 0:
        f_beta = (1 + beta_sq) * recall * precision / denominator
    else:
        f_beta = 0.0

    result = {'f1': f_beta,
              'recall': recall,
              'precision': precision}
    print(f"result = {result}")
    return result

In [15]:
class ModelTrainer:
    """Fine-tune and evaluate a Longformer token-classification model.

    Construction loads the tokenizer and the pretrained model and prepares
    the TrainingArguments; `train` runs the fine-tuning and saves the
    result to `save_path`; `evaluate_saved_model` reloads that saved model
    and evaluates it.
    """

    def __init__(self, all_labels, label2id, id2label, num_train_samples=None):
        """Load the model and build the training configuration.

        Args:
            all_labels: sequence of label names; position i names class id i.
            label2id: mapping from label name to class id.
            id2label: mapping from class id to label name.
            num_train_samples: number of training documents, used only to
                derive the evaluation / checkpoint / logging interval.
                When omitted, the notebook-level ``train_df`` is used
                (NameError if it is not defined).
        """
        self.all_labels = all_labels
        self.label2id = label2id
        self.id2label = id2label

        self.model_path = "allenai/longformer-base-4096"
        self.save_path =  "models/longformer_foundational_all"
        self.num_proc = 5

        self.learning_rate = 1e-5
        self.num_train_epochs = 3 # Number of epochs
        self.batch_size = 2
        self.load_model()

        if num_train_samples is None:
            # Backward-compatible fallback to the notebook global.
            num_train_samples = len(train_df)
        # Batches per epoch (assuming one device and no gradient accumulation).
        steps = num_train_samples // self.batch_size
        # Evaluate, checkpoint and log about twice per epoch; a step
        # interval of 0 is invalid, so clamp to at least 1 for tiny datasets.
        half_epoch_steps = max(1, steps // 2)

        self.training_args = TrainingArguments(output_dir="./models",
                                          fp16=True,
                                          learning_rate=self.learning_rate,
                                          num_train_epochs=self.num_train_epochs, # The total number of training epochs to run.
                                          per_device_train_batch_size=self.batch_size,  # batch size per device during training
                                          per_device_eval_batch_size=self.batch_size, # batch size for evaluation
                                          report_to="none",
                                          evaluation_strategy="steps", # Evaluate every `eval_steps` steps
                                          eval_steps=half_epoch_steps,
                                          do_eval=True,
                                          save_strategy="steps",
                                          save_steps=half_epoch_steps,
                                          save_total_limit=2, # Keep at most two checkpoints on disk
                                          logging_steps=half_epoch_steps,
                                          lr_scheduler_type='cosine',
                                          load_best_model_at_end=True, # Load the best model at the end
                                          metric_for_best_model="f1",
                                          greater_is_better=True,
                                          warmup_ratio = 0.05,
                                          weight_decay=0.001,
                                         )

    # Load the model
    def load_model(self):
        """Create the tokenizer and the pretrained model with a fresh label head."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        # Start from the pretrained model config and attach the label set.
        config = LongformerConfig.from_pretrained(self.model_path)
        # `ignore_mismatched_sizes` is a from_pretrained() argument, not a
        # config field, so it is passed below rather than stored here.
        config.update({
            'num_labels': len(self.all_labels),
            'id2label': self.id2label,
            'label2id': self.label2id,
            'layer_norm_eps' : 1e-7,
        })

        self.model = AutoModelForTokenClassification.from_pretrained(self.model_path,
                                                                     config=config, ignore_mismatched_sizes = True)

        print(f"Complete loading pretrained LLM model")

    # Convert df to tokenized dataset
    def create_dataset(self, df):
        """Turn a DataFrame of documents into a tokenized `Dataset`.

        Args:
            df: DataFrame with ``full_text``, ``document``, ``tokens``,
                ``trailing_whitespace`` and ``labels`` columns.

        Returns:
            The dataset after mapping `tokenize` over every row.
        """
        ds = Dataset.from_dict({
            "full_text": df["full_text"].tolist() ,
            "document": df["document"].astype('string'),
            "tokens": df["tokens"].tolist(),
            "trailing_whitespace": df["trailing_whitespace"].tolist(),
            "provided_labels": df["labels"].tolist()
        })
        # Tokenize the dataset in `num_proc` worker processes.
        tokenized_ds = ds.map(tokenize,
                              fn_kwargs={"tokenizer": self.tokenizer,
                                         "label2id": self.label2id,
                                        },
                              num_proc=self.num_proc)
        return tokenized_ds

    def _has_checkpoint(self):
        """Return True when the output directory already holds a checkpoint."""
        from transformers.trainer_utils import get_last_checkpoint

        output_dir = self.training_args.output_dir
        return os.path.isdir(output_dir) and get_last_checkpoint(output_dir) is not None

    # Evaluate the saved model
    def evaluate_saved_model(self, eval_df):
        """Reload the model saved at `save_path` and evaluate it on `eval_df`.

        Returns:
            The metrics dict returned by ``Trainer.evaluate``.
        """
        # Load the saved model and tokenizer
        saved_model = AutoModelForTokenClassification.from_pretrained(self.save_path)
        saved_tokenizer = AutoTokenizer.from_pretrained(self.save_path)

        # Create dataset for evaluation
        eval_ds = self.create_dataset(eval_df)

        # Pad with the same tokenizer that is handed to the Trainer.
        data_collator = DataCollatorForTokenClassification(saved_tokenizer, pad_to_multiple_of=512)

        trainer = Trainer(model=saved_model,
                          args=self.training_args,
                          eval_dataset=eval_ds,
                          data_collator=data_collator,
                          tokenizer=saved_tokenizer,
                          compute_metrics=partial(compute_metrics, all_labels=self.all_labels),
                          )

        evaluation_result = trainer.evaluate()

        return evaluation_result

    # Train the model
    def train(self, train_df, valid_df,resume=False):
        """Fine-tune the model and save it with its tokenizer to `save_path`.

        Args:
            train_df: training documents (see `create_dataset` for columns).
            valid_df: validation documents used for periodic evaluation.
            resume: forwarded as ``resume_from_checkpoint``. When True but
                no checkpoint exists yet in the output directory, training
                starts from scratch instead of failing.
        """
        training_ds = self.create_dataset(train_df)
        valid_ds = self.create_dataset(valid_df)
        data_collator = DataCollatorForTokenClassification(self.tokenizer, pad_to_multiple_of=512)

        trainer = Trainer(model=self.model,
                          args=self.training_args,
                          train_dataset=training_ds,
                          eval_dataset=valid_ds,
                          data_collator=data_collator,
                          tokenizer=self.tokenizer,
                          # Use this instance's labels, not a notebook global.
                          compute_metrics=partial(compute_metrics, all_labels=self.all_labels),
                         )

        if resume is True and not self._has_checkpoint():
            print(f"No checkpoint found in {self.training_args.output_dir}; starting a fresh run")
            resume = False

        # Train the model
        trainer.train(resume_from_checkpoint=resume)
        # Save the model
        trainer.save_model(self.save_path)
        self.tokenizer.save_pretrained(self.save_path)
        print(f"Save the model to {self.save_path}")


In [None]:
# Build the trainer (this loads the tokenizer and the pretrained model) and fine-tune.
# resume=True is forwarded to Trainer.train(resume_from_checkpoint=...).
trainer = ModelTrainer(all_labels, label2id, id2label)
trainer.train(train_df, valid_df,resume=True)

Some weights of LongformerForTokenClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Complete loading pretrained LLM model


Map (num_proc=5):   0%|          | 0/27606 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/250 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss


In [None]:
# evaluation_result = trainer.evaluate_saved_model(valid_df)
# print("Evaluation result:", evaluation_result)