<a href="https://colab.research.google.com/github/Norawit29/emoji_model/blob/main/Model_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
from transformers import RobertaTokenizerFast, RobertaConfig, RobertaForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_from_disk, concatenate_datasets, load_dataset

import accelerate
import transformers
import torch


### Training a tokenizer

In [None]:
# Train a byte-level BPE tokenizer from raw text files and save it to disk

# Define the path to the text files for tokenizer training
paths = ["path_to_text_files"]

# Directory that will receive the trained tokenizer files
tokenizer_output_dir = "path_to_save_model_directory"

# Initialize the Byte level BPE tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize the tokenizer training
tokenizer.train(
    files=paths,
    vocab_size=52_000,      # same value is passed to RobertaConfig in the model-training cell
    min_frequency=2,
    show_progress=True,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)

# Make sure the target directory exists before saving: save_model() writes
# into the given directory and is not expected to create it, so a fresh
# environment would otherwise fail only after the (long) training step.
Path(tokenizer_output_dir).mkdir(parents=True, exist_ok=True)

# Save the trained tokenizer model to the specified directory
tokenizer.save_model(tokenizer_output_dir)


### Prepare dataset for model training

#### The training dataset includes:
20% of the tweets-with-emoji dataset (3,776,718),
all of the MIMIC-III dataset (2,083,180),
all of the MIMIC-IV discharge summary dataset (331,794),
5% of the PubMed dataset (1,002,469)



In [None]:
# Build the tokenized train/validation datasets used for masked-LM training

# Concatenate multiple datasets into one
dataset_tmp = concatenate_datasets([tweets_ds_n, mimic_ds_n, pubmed_ds_n, mimicIV_ds_n])

# Split the concatenated dataset into training (80%) and validation (20%) sets
dataset_split_tmp = dataset_tmp.train_test_split(test_size=0.2, seed=42, shuffle=True)

trains_ds_tmp = dataset_split_tmp['train']
vals_ds_tmp = dataset_split_tmp['test']

# Load the trained tokenizer through the transformers wrapper. The object left
# over from the tokenizer-training cell is a raw ByteLevelBPETokenizer, which
# cannot be called with truncation/padding keyword arguments as done below.
tokenizer = RobertaTokenizerFast.from_pretrained("path_to_pretrained_tokenizer_model")
# Give padding='max_length' / truncation=True a concrete target length
# (same value the model-training cell assigns to its tokenizer).
tokenizer.model_max_length = 512


# Define a function to tokenize and encode dataset examples
def encode_dataset(examples):
    """Tokenize a batch of examples.

    Reads the batch's 'text' column and returns the tokenizer output, truncated
    and padded to the tokenizer's maximum length.
    """
    return tokenizer(examples['text'], truncation=True, padding='max_length')


# Tokenize and encode the training dataset; expose only the tensors the model consumes
trains_encode_tmp = trains_ds_tmp.map(encode_dataset, batched=True)
trains_encode_tmp.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Tokenize and encode the validation dataset
vals_encode_tmp = vals_ds_tmp.map(encode_dataset, batched=True)
vals_encode_tmp.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Save the processed datasets to disk
trains_encode_tmp.save_to_disk("path_to_save_training_dataset")
vals_encode_tmp.save_to_disk("path_to_save_validation_dataset")



### Train a transformer model

In [None]:
# Checking available GPU

import torch
import torch.cuda
torch.cuda.empty_cache()

print(f'PyTorch version: {torch.__version__}')
print('*'*10)
print(f'_CUDA version: ')
!nvcc --version
print('*'*10)
print(f'CUDNN version: {torch.backends.cudnn.version()}')
print(f'Available GPU devices: {torch.cuda.device_count()}')
print(f'Device Name: {torch.cuda.get_device_name()}')
print('*'*10)
print(f"GPU IS AVAILABLE {torch.cuda.is_available()}")


In [None]:
# Pre-train a RoBERTa masked-language model on the prepared datasets

# Load the training and validation datasets
train_dataset = load_from_disk("path_to_training_dataset")
val_dataset = load_from_disk("path_to_validation_dataset")

# Define the RoBERTa model configuration
model_config = RobertaConfig(
    vocab_size=52_000,              # same value used when training the tokenizer
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

# Initialize the model (random weights) on GPU if available
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = RobertaForMaskedLM(config=model_config).to(device)
print('Number of parameters: ', model.num_parameters())

# Initialize the tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained("path_to_pretrained_tokenizer_model")
tokenizer.model_max_length = 512

# Define the data collator for language modeling (masks 15% of the tokens)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Define training arguments
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="path_to_output_directory",
    overwrite_output_dir=False,
    evaluation_strategy='epoch',
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
    seed=1,
    auto_find_batch_size=True,
    remove_unused_columns=False,
)

# Create the trainer for our model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Start training
trainer.train()

# Save the trained model
trained_model_dir = "path_to_save_trained_model"
trainer.save_model(trained_model_dir)

# Save the tokenizer next to the weights. The Trainer above is not given the
# tokenizer, so it cannot be relied on to write it, while the
# sentence-transformer cell loads this directory as a complete checkpoint
# (model + tokenizer) via models.Transformer.
tokenizer.save_pretrained(trained_model_dir)


### Train a sentence transformer model
(The code is adapted from the official sentence-transformers example:
https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/ms_marco/train_bi-encoder_mnrl.py)

In [None]:
import sys
import json
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, LoggingHandler, util, models, evaluation, losses, InputExample
from datetime import datetime
import gzip
import os
import tarfile
from collections import defaultdict
from torch.utils.data import IterableDataset
import tqdm
from torch.utils.data import Dataset
import random
import pickle
import argparse


In [None]:
## Read the MS Marco dataset
data_folder = " "


## Load every passage from collection.tsv into the corpus dict,
## downloading and unpacking the archive first when the file is missing

corpus = {}         #dict in the format: passage_id -> passage. Stores all existent passages
collection_filepath = os.path.join(data_folder, 'collection.tsv')
collection_is_missing = not os.path.exists(collection_filepath)
if collection_is_missing:
    tar_filepath = os.path.join(data_folder, 'collection.tar.gz')
    archive_is_missing = not os.path.exists(tar_filepath)
    if archive_is_missing:
        util.http_get('https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz', tar_filepath)
    with tarfile.open(tar_filepath, "r:gz") as archive:
        archive.extractall(path=data_folder)

with open(collection_filepath, 'r', encoding='utf8') as collection_file:
    for row in collection_file:
        # Each row must split into exactly two tab-separated fields
        # (passage id, passage text); anything else raises here.
        raw_pid, passage = row.strip().split("\t")
        corpus[int(raw_pid)] = passage


In [None]:
## Load the training queries from queries.train.tsv into the queries dict,
## downloading and unpacking the archive first when the file is missing

queries = {}        #dict in the format: query_id -> query. Stores all training queries
queries_filepath = os.path.join(data_folder, 'queries.train.tsv')
queries_file_is_missing = not os.path.exists(queries_filepath)
if queries_file_is_missing:
    tar_filepath = os.path.join(data_folder, 'queries.tar.gz')
    archive_is_missing = not os.path.exists(tar_filepath)
    if archive_is_missing:
        util.http_get('https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz', tar_filepath)

    with tarfile.open(tar_filepath, "r:gz") as archive:
        archive.extractall(path=data_folder)


with open(queries_filepath, 'r', encoding='utf8') as queries_file:
    for row in queries_file:
        # Each row must split into exactly two tab-separated fields
        # (query id, query text); anything else raises here.
        raw_qid, query = row.strip().split("\t")
        queries[int(raw_qid)] = query


In [None]:
# Load the CrossEncoder scores for (query-id, passage-id) pairs, computed by the
# cross-encoder/ms-marco-MiniLM-L-6-v2 model. The hard-negative mining cell
# indexes the result as ce_scores[qid][pid].

ce_scores_file = os.path.join(data_folder, 'cross-encoder-ms-marco-MiniLM-L-6-v2-scores.pkl.gz')
scores_already_present = os.path.exists(ce_scores_file)
if not scores_already_present:
    util.http_get(
        'https://huggingface.co/datasets/sentence-transformers/msmarco-hard-negatives/resolve/main/cross-encoder-ms-marco-MiniLM-L-6-v2-scores.pkl.gz',
        ce_scores_file,
    )

# NOTE(review): unpickling can execute arbitrary code contained in the file;
# only load this archive when it comes from the trusted source above.
with gzip.open(ce_scores_file, 'rb') as scores_stream:
    ce_scores = pickle.load(scores_stream)


In [None]:
# Use hard-negatives that have been mined: for every query keep its positive
# passages plus a filtered set of hard negatives, and collect them in train_queries.

# Settings the referenced command-line script reads from `args`. This notebook
# never parses arguments, so they are defined here explicitly.
negs_to_use_setting = None  # comma-separated system names, or None to use every system in the file
use_all_queries = False     # True also keeps queries for which no hard negative survived the filter
                            # (MSMARCODataset pops from the 'neg' list, so such queries cannot
                            # yield a triplet there)

hard_negatives_filepath = os.path.join(data_folder, 'msmarco-hard-negatives.jsonl.gz')
if not os.path.exists(hard_negatives_filepath):
    # Same download-if-missing pattern as the other data files
    util.http_get('https://huggingface.co/datasets/sentence-transformers/msmarco-hard-negatives/resolve/main/msmarco-hard-negatives.jsonl.gz', hard_negatives_filepath)

ce_score_margin = 3         # a negative must score at least this much below the weakest positive
num_negs_per_system = 5     # cap on negatives taken from each retrieval system

train_queries = {}
negs_to_use = None          # resolved once, from the first usable record
with gzip.open(hard_negatives_filepath, 'rt', encoding='utf8') as fIn:
    for line in tqdm.tqdm(fIn):
        data = json.loads(line)

        #Get the positive passage ids
        qid = data['qid']
        pos_pids = data['pos']

        if not pos_pids:  #Skip entries without positives passages
            continue

        query_scores = ce_scores[qid]
        pos_min_ce_score = min(query_scores[pid] for pid in pos_pids)
        ce_score_threshold = pos_min_ce_score - ce_score_margin

        #Decide which systems supply negatives (only on the first record)
        if negs_to_use is None:
            if negs_to_use_setting is not None:    #Use specific system for negatives
                negs_to_use = negs_to_use_setting.split(",")
            else:   #Use all systems
                negs_to_use = list(data['neg'].keys())

        #Get the hard negatives
        neg_pids = set()
        for system_name in negs_to_use:
            if system_name not in data['neg']:
                continue

            negs_added = 0
            for pid in data['neg'][system_name]:
                # Drop candidates scoring too close to (or above) the positives
                if query_scores[pid] > ce_score_threshold:
                    continue

                if pid not in neg_pids:
                    neg_pids.add(pid)
                    negs_added += 1
                    if negs_added >= num_negs_per_system:
                        break

        # Positives are guaranteed non-empty here (see the early `continue`)
        if use_all_queries or neg_pids:
            train_queries[qid] = {'qid': qid, 'query': queries[qid], 'pos': pos_pids, 'neg': neg_pids}

# The score table is no longer needed; drop the reference to free memory
del ce_scores


In [None]:
## Load the previously trained transformer model and wrap it as a sentence encoder

# Token-level encoder read from the saved checkpoint directory
roberta = models.Transformer("path_to_save_trained_model")

# Mean pooling over token embeddings gives one vector per input text
token_embedding_dim = roberta.get_word_embedding_dimension()
pooler = models.Pooling(token_embedding_dim, pooling_mode_mean_tokens=True)

model = SentenceTransformer(modules=[roberta, pooler])
model.max_seq_length = 300



In [None]:
# We create a custom MSMARCO dataset that returns triplets (query, positive, negative)
# on-the-fly based on the information from the mined-hard-negatives jsonl file.

class MSMARCODataset(Dataset):
    """Map-style dataset producing (query, positive, negative) text triplets.

    ``queries`` maps a query id to a dict with the keys 'query', 'pos' and
    'neg'; ``corpus`` maps a passage id to its text. Each access to an item
    advances that query's positive and negative id lists by one position, so
    repeated accesses cycle through the available passages.
    """

    def __init__(self, queries, corpus):
        self.queries = queries
        self.queries_ids = list(queries.keys())
        self.corpus = corpus

        # Convert the id containers to lists (in place, on the caller's dicts)
        # and randomise the order in which negatives will be served.
        for entry in self.queries.values():
            entry['pos'] = list(entry['pos'])
            negatives = list(entry['neg'])
            random.shuffle(negatives)
            entry['neg'] = negatives

    def _take_next_text(self, pid_queue):
        """Return the text for the id at the front of *pid_queue*, then move that id to the back."""
        current_pid = pid_queue.pop(0)
        passage_text = self.corpus[current_pid]
        pid_queue.append(current_pid)
        return passage_text

    def __getitem__(self, item):
        entry = self.queries[self.queries_ids[item]]
        triplet = [entry['query']]
        triplet.append(self._take_next_text(entry['pos']))
        triplet.append(self._take_next_text(entry['neg']))
        return InputExample(texts=triplet)

    def __len__(self):
        return len(self.queries)


In [None]:
# For training the SentenceTransformer model, we need a dataset, a dataloader, and a loss used for training.

# Triplet dataset built from the mined queries and the passage corpus
train_dataset = MSMARCODataset(queries=train_queries, corpus=corpus)
# Shuffled batches of 128 triplets
train_dataloader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)
# Ranking loss bound to the sentence-transformer model being trained
train_loss = losses.MultipleNegativesRankingLoss(model)


In [None]:
# Train the model

# Optimisation settings. The referenced command-line script reads these from
# `args`, which this notebook never defines; the values below are that
# script's defaults (adjust as needed).
warmup_steps = 1000
learning_rate = 2e-5

model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=5,
          warmup_steps=warmup_steps,
          use_amp=True,
          checkpoint_path="path_to_ckpt",
          checkpoint_save_steps=len(train_dataloader),  # one checkpoint per pass over the dataloader
          optimizer_params={'lr': learning_rate},
          )

# Save the model
model.save("path_to_save_trained_sentence_transformer_model")
