In [None]:
#Download your corpus for stage 1 (pre-training on selected data)

In [None]:
# Download target datasets (stage 2) from https://github.com/allenai/dont-stop-pretraining

In [None]:
# Alternatively, download a target dataset file directly with curl, for example:
# !curl -Lo reviews_amazon_test.jsonl https://allennlp.s3-us-west-2.amazonaws.com/dont_stop_pretraining/data/amazon/test.jsonl


# MLM training - stage 1: Pre-training on selected data

In [None]:
import os
# Expose only GPU 0 to this process. This is set before torch is imported
# in a later cell, so the restriction is already in place when CUDA starts.
os.environ.update(CUDA_VISIBLE_DEVICES="0")

In [None]:
import numpy as np
import pandas as pd 


In [None]:
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import pickle
import lzma

In [None]:
# Stage-1 (masked language modelling) configuration.

# Checkpoint used for both the tokenizer and the initial model weights.
model_name = "bert-base-uncased"
# Pickled corpus selected for pre-training.
data_path_mlm = 'imdb_150k.pkl'  # Update with your selected data's path for pre-training

# Optimisation settings handed to TrainingArguments in a later cell.
epochs_mlm = 1
batch_size = 64
learning_rate_mlm = 1e-4


In [None]:
#Load selected pre-training data

#an example of loading input data for pickled datasets

import gzip

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only unpickle corpora you created yourself or received from a trusted source.
# The result is passed straight to the tokenizer in the next cell --
# assumes the pickle holds a list of raw text strings; TODO confirm for your data.
with open(data_path_mlm, 'rb') as corpus_file:
    data_mlm = pickle.load(corpus_file)

# Fail fast with a readable message: an empty corpus cannot be tokenized or
# trained on, and the error raised further down would not point back here.
if len(data_mlm) == 0:
    raise ValueError(f"No pre-training texts found in {data_path_mlm!r}")


In [None]:
#tokenize the selected pre-training dataset. We choose a max length of 256

tokenizer_mlm = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)
# The whole corpus is encoded in one call: texts longer than 256 tokens are
# truncated, and padding=True pads every text to the longest one in the call.
inputs_mlm = tokenizer_mlm(data_mlm, return_tensors="pt", padding=True, truncation=True, max_length=256)

# The collator applies the random masking each time a batch is assembled.
data_collator_mlm = DataCollatorForLanguageModeling(
    tokenizer=tokenizer_mlm,
    mlm=True,
    mlm_probability=0.15,  # Mask 15% of tokens
    return_tensors="pt"
)

# Create a training configuration for MLM fine-tuning
# output_dir uses the same "150K" name as the corpus file and as the path the
# final model is saved to. Because overwrite_output_dir=True, a stale directory
# name here could overwrite the checkpoints of a different experiment.
training_args_mlm = TrainingArguments(
    output_dir="./bert_mlm_finetuned_150K_imdb",
    overwrite_output_dir=True,
    num_train_epochs=epochs_mlm,
    per_device_train_batch_size=batch_size,
    save_steps=5000,       # write a checkpoint every 5000 optimisation steps
    save_total_limit=2,    # keep only the two most recent checkpoints on disk
    learning_rate=learning_rate_mlm
)

# Load the pre-trained BERT model for MLM fine-tuning
model_mlm = AutoModelForMaskedLM.from_pretrained(model_name)


In [None]:
# Convert the tokenized stage-1 corpus into a Hugging Face `datasets.Dataset`
# (PyArrow-backed) so it can be handed to the Trainer as `train_dataset`.
from datasets import Dataset, DatasetDict
# Built directly from the tokenizer output held in `inputs_mlm`.
dataset = Dataset.from_dict(inputs_mlm)


In [None]:
# Build the stage-1 Trainer from the model, arguments, collator and dataset
# prepared in the cells above.
# NOTE(review): Trainer can use every GPU visible to the process, but the first
# code cell sets CUDA_VISIBLE_DEVICES to "0" -- confirm whether multi-GPU
# training is actually intended here.
trainer_mlm = Trainer(
    args=training_args_mlm,
    model=model_mlm,
    train_dataset=dataset,
    data_collator=data_collator_mlm,
)

# Run masked-language-model training; the returned training summary is the cell output.
trainer_mlm.train()

In [None]:
# Write the final stage-1 model to disk. Stage 2 initialises its classifier
# from this same path, so keep the two in sync if you rename it.
trainer_mlm.save_model("./bert_mlm_finetuned_150K_imdb")

# Fine-tuning on downstream classification tasks - Stage 2

In [None]:
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer

# Stage-2 (classification fine-tuning) configuration.

# Checkpoint name used to load the stage-2 tokenizer; set again here so this
# cell does not rely on the stage-1 configuration cell.
model_name = "bert-base-uncased"

# Optimisation settings handed to TrainingArguments in a later cell.
batch_size_classification = 64
epochs_classification = 10
learning_rate_classification = 1e-4

In [None]:
from datasets import load_dataset

# JSON-lines files of the downstream task (see the download cells at the top).
data_path_classification = 'reviews_imdb_train.jsonl'  # Update with your classification data path to the downloaded target datasets
test_data_path = 'reviews_imdb_test.jsonl'

# Each file is loaded on its own. Later cells read the rows of both results
# under the 'train' key, including those of the test file.
classification_data, classification_test_data = [
    load_dataset('json', data_files=jsonl_path)
    for jsonl_path in (data_path_classification, test_data_path)
]


In [None]:
# Tokenize the train and test texts of the downstream task.
tokenizer_classification = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Both splits are encoded with exactly the same options.
encode_options = dict(
    padding=True,
    truncation=True,
    max_length=256,  # Adjust max_length as needed
    return_tensors="pt",
)

train_texts = list(classification_data['train']['text'])
tokenized_data_classification = tokenizer_classification(train_texts, **encode_options)

test_texts = list(classification_test_data['train']['text'])
tokenized_test_data_classification = tokenizer_classification(test_texts, **encode_options)


In [None]:

# Give every distinct training label a consecutive integer id (in the order
# returned by np.unique) and build the lookup in both directions.
unique_labels = list(np.unique(classification_data['train']['label']))
id2label = dict(enumerate(unique_labels))
label2id = {label: index for index, label in enumerate(unique_labels)}


In [None]:
#store label information in a separate variable
def _encode_labels(raw_labels, mapping):
    """Return a new list in which every raw label is replaced by its integer id.

    Args:
        raw_labels: iterable of labels as stored in the dataset's 'label' column.
        mapping: dict from raw label to integer id (``label2id``).

    Returns:
        list of integer ids, in the same order as ``raw_labels``.

    Raises:
        KeyError: if a label does not occur in ``mapping`` (for example a test
            label that never appears in the training data).
    """
    return [mapping[raw_label] for raw_label in raw_labels]


# Only the train and test splits are loaded by this notebook, so these are the
# only label lists built here. New lists are created instead of assigning into
# the column in place, so the code does not depend on the column being a
# mutable Python list.
train_label = _encode_labels(classification_data['train']['label'], label2id)
test_label = _encode_labels(classification_test_data['train']['label'], label2id)

In [None]:
# Create a classification model from weights obtained after MLM pre-training our model
# The classification head is sized from the label mapping computed from the
# training data, so tasks with more than two classes work without edits.
# NOTE(review): id2label / label2id are not forwarded to the model config.
# They are built from np.unique and hold NumPy scalars -- confirm they are
# JSON-serialisable (or convert them to plain Python types) before passing them.
model_classification = AutoModelForSequenceClassification.from_pretrained(
    "./bert_mlm_finetuned_150K_imdb",
    num_labels=len(label2id),
)


In [None]:
# Convert the tokenized target datasets to Huggingface Datasets
from datasets import Dataset, DatasetDict


def _build_classification_dataset(encodings, labels):
    """Bundle tokenizer output and integer labels into one `datasets.Dataset`.

    Args:
        encodings: tokenizer output providing 'input_ids' and 'attention_mask'.
        labels: integer class ids, one per encoded example.

    Returns:
        Dataset with the columns 'input_ids', 'attention_mask' and 'labels'.
    """
    return Dataset.from_dict(
        {
            'input_ids': encodings['input_ids'],
            'attention_mask': encodings['attention_mask'],
            'labels': labels,
        }
    )


# Only the train and test splits are tokenized by this notebook, so these are
# the only datasets built here.
train_dataset_classification = _build_classification_dataset(
    tokenized_data_classification, train_label
)
test_dataset_classification = _build_classification_dataset(
    tokenized_test_data_classification, test_label
)



In [None]:
# set compute Metrics to F1-score. Note that Biomed Domain uses 'micro' F1 score, the rest use 'macro'
import evaluate

f1_metric = evaluate.load("f1")
# Former name of the metric object, kept so existing references keep working.
accuracy = f1_metric


def compute_metrics(eval_pred, average="macro"):
    """Compute the F1 score for one evaluation pass of the Trainer.

    Args:
        eval_pred: pair ``(predictions, labels)``; ``predictions`` holds one row
            of per-class scores per example, ``labels`` the reference class ids.
        average: F1 averaging mode forwarded to the metric. Defaults to
            "macro"; use "micro" for the biomedical tasks, for example via
            ``functools.partial(compute_metrics, average="micro")``.

    Returns:
        The result of the F1 metric's ``compute`` call.
    """
    predictions, labels = eval_pred
    # The predicted class is the index of the highest score in each row.
    predicted_ids = np.argmax(predictions, axis=1)
    return f1_metric.compute(predictions=predicted_ids, references=labels, average=average)

In [None]:
# Define data collator for classification
# It is built around the stage-2 tokenizer and returns PyTorch tensors ("pt").
data_collator_classification = DataCollatorWithPadding(tokenizer_classification, return_tensors="pt")


In [None]:
# Create a TrainingArguments object for classification fine-tuning
# Evaluate and checkpoint on the same schedule. load_best_model_at_end can only
# restore a state that was written to disk, so saving less often than
# evaluating leaves most (or, on short runs, all) evaluated states unrecoverable.
# save_total_limit still caps the number of checkpoints kept.
eval_and_save_steps = 1000

training_args_classification = TrainingArguments(
    output_dir="./bert_classification",
    overwrite_output_dir=True,
    num_train_epochs=epochs_classification,
    per_device_train_batch_size=batch_size_classification,
    save_steps=eval_and_save_steps,
    save_total_limit=2,
    learning_rate=learning_rate_classification,
    evaluation_strategy="steps",
    eval_steps=eval_and_save_steps,
    logging_steps=100,  # log the training loss every 100 steps
    seed=43,
    load_best_model_at_end=True
)


In [None]:
# Build the stage-2 Trainer from the classifier, arguments, collator, datasets
# and metric function prepared in the cells above.
# NOTE(review): the test split is also the evaluation set during training, and
# the training arguments enable load_best_model_at_end, so model selection sees
# the test data. Consider evaluating on a held-out dev split instead.
trainer_classification = Trainer(
    args=training_args_classification,
    model=model_classification,
    train_dataset=train_dataset_classification,
    eval_dataset=test_dataset_classification,  # Use the test dataset for evaluation
    data_collator=data_collator_classification,
    compute_metrics=compute_metrics,
)

# Run classification fine-tuning; the returned training summary is the cell output.
trainer_classification.train()


In [None]:
# Evaluate the model held by the trainer after training on the test set.
# The returned metrics are shown as the cell output.
trainer_classification.evaluate(test_dataset_classification)

In [None]:
# Save the model currently held by the trainer into a sub-directory of the
# stage-2 output directory.
trainer_classification.save_model("./bert_classification/reviews_imdb_150K")