# Step 1 - Installing the required dependencies 
Before we can begin, we need to make sure all the required dependencies are installed in our notebook kernel. You will also want to ensure that you have configured the correct runtime for the notebook (e.g. GPU or CPU).

In [None]:
# Install the libraries this notebook relies on. Every version is pinned so
# that a fresh runtime does not silently pick up a newer, incompatible release.
# Expect to revisit these pins as time goes by and new releases come out.
# NOTE(review): accelerate is never imported directly in this notebook;
# presumably it backs the device_map="auto" model loading - confirm before
# removing it.
!pip install transformers==4.25.1
!pip install datasets==2.8.0
!pip install evaluate==0.4.0
!pip install accelerate==0.15.0

# Step 2 - Persisting models and accessing training data
We need a way to persist our models and tokenizers, along with an easy way to pull in the training set without having to upload or download files every time we start a new runtime. This will save a lot of headaches and lets us run inference with the model from a separate notebook seamlessly.

In [None]:
# Bring in Colab's Google Drive helper.
from google.colab import drive

# Mounting will prompt for authorization. force_remount=True is passed so the
# mount is requested again even if this cell has already been run.
mount_point = '/content/drive/'
drive.mount(mount_point, force_remount=True)

# Make sure the two working directories on Drive exist:
#   new_paths[0] - used later as the output directory for the model/tokenizer
#   new_paths[1] - holds the training data file
import os
from os import path

new_paths = ['/content/drive/MyDrive/models','/content/drive/MyDrive/training_data']
for p in new_paths:
  # exist_ok=True makes re-running this cell a no-op for directories that are
  # already there, and replaces the separate "exists?" check followed by
  # mkdir (which could fail if the directory appeared in between).
  # Unlike os.mkdir, makedirs also creates any missing parent directories.
  os.makedirs(p, exist_ok=True)

# IMPORTANT: At this point you will need to upload a text file containing your
# training data to the /content/drive/MyDrive/training_data directory with the
# name training-set.txt.
# You only have to do this once unless you want to use new training data.

# Step 3 - The Setup - Loading our model and tokenizer
Here we will fetch our base foundation model and its associated tokenizer. Depending on which LLM you choose, the `Auto` classes will determine the right files to download and load them into the appropriate variables so that we can fine-tune and retrain in the next steps. It may take a while to download very large models, so have a cup of coffee in the meantime.

In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForCausalLM
import time
import torch
import os
import numpy as np
import sklearn

# Start a wall-clock timer. The training cell later prints the time elapsed
# since this point, so the figure includes model loading and tokenization.
start = time.time()

print("Loading model")
# We recommend one of the following: EleutherAI/gpt-neo-125M, EleutherAI/gpt-j-6B, EleutherAI/gpt-neo-1.3B, EleutherAI/gpt-neo-2.7B
# NOTE: For any model greater than 125M parameters you are going to need Premium GPU
model_name = "EleutherAI/gpt-neo-1.3B"
# "auto" leaves the choice of weight dtype and device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
)
# Use the end-of-sequence token id as the padding id on the model side.
model.config.pad_token_id = model.config.eos_token_id

print("Loading tokenizer")
# torch_dtype / device_map are model-loading options; a tokenizer holds no
# tensors to cast or place on a device, so they are not passed here.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Mirror the model config: pad with the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

# Step 4 - Preparing our Training Set
Now that we have our model and tokenizer, we can use the Hugging Face `datasets` library and the tokenizer to prepare the train and test sets used to train the base foundation model.

In [None]:
from datasets import load_dataset

print("Loading dataset")

training_set = "/content/drive/MyDrive/training_data/training-set.txt"

# TODO: Add a test set for eval
# Load the plain-text file, sampling it by paragraph. Until a separate
# held-out file exists, the "test" split points at the same file as "train".
current_dataset = load_dataset(
    "text",
    data_files={"train": training_set, "test": training_set},
    sample_by="paragraph",
)

# The paragraphs extracted above are raw text; this helper converts them into
# the token encodings the model expects.
def tokenize_function(examples):
    """Tokenize the "text" field of a batch with the notebook's tokenizer.

    The tokenizer is called with padding="max_length" and truncation=True,
    and its result is returned unchanged.
    """
    return tokenizer(examples["text"], truncation=True, padding="max_length")


print("Tokenizing dataset")
# Run the tokenizer over the dataset, handing it batches of examples.
tokenized_datasets = current_dataset.map(tokenize_function, batched=True)

# There is no held-out data yet, so training and evaluation share the same
# tokenized "train" split. For a quick trial run, a subset can be taken with
# .select(range(N)) on the right-hand side.
small_train_dataset = small_eval_dataset = tokenized_datasets["train"]


# Step 5 - Model Training
Excellent, we have everything we need as input for training the model in an unsupervised fashion. Now let's begin the training. Keep in mind this may take some time depending on your hardware setup and the chosen model.

In [None]:
import numpy as np
import evaluate
import sklearn
import torch, gc

# Training is memory hungry, so reclaim whatever we can just before it starts:
# first Python-level garbage, then (when a GPU is present) the CUDA cache.
gc.collect()

cuda_ready = torch.cuda.is_available()
if cuda_ready:
    torch.cuda.empty_cache()
    # Print the full (non-abbreviated) memory report for reference.
    memory_report = torch.cuda.memory_summary(device=None, abbreviated=False)
    print(memory_report)

print("Preparing training arguments")
# Checkpoints go to new_paths[0] and logs to ./logs. Training runs for a
# single epoch with one example per device per step.
# If you are running on CPU you can change no_cuda to True.
#
# label_names is deliberately not set: input_ids and attention_mask are model
# inputs, not labels. Declaring them as labels would make evaluation pass a
# tuple of inputs to compute_metrics in place of the real labels. Leaving it
# unset lets the Trainer use its default label column.
training_args = TrainingArguments(
    output_dir=new_paths[0],
    report_to='all',
    logging_dir='./logs',
    per_device_train_batch_size=1,
    num_train_epochs=1,
    no_cuda=False,
)

# Accuracy metric from the `evaluate` library, used by compute_metrics below.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute next-token accuracy for a causal language model.

    Args:
        eval_pred: A ``(logits, labels)`` pair. Logits are assumed to be
            shaped (batch, seq_len, vocab) and labels (batch, seq_len), with
            -100 marking label positions to ignore - TODO confirm against the
            data collator in use. If either element is a tuple, its first
            entry is used.

    Returns:
        The result of ``metric.compute`` on the flattened, aligned
        predictions and references, or ``{"accuracy": nan}`` when every
        label position is ignored.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    if isinstance(labels, tuple):
        labels = labels[0]

    predictions = np.argmax(np.asarray(logits), axis=-1)
    labels = np.asarray(labels)

    # The logits at position i score the token at position i + 1, so drop the
    # last prediction and the first label to line the two up. Flatten because
    # the metric compares two flat sequences of class ids.
    predictions = predictions[..., :-1].reshape(-1)
    references = labels[..., 1:].reshape(-1)

    # Ignored positions (label == -100) must not count towards accuracy.
    keep = references != -100
    if not keep.any():
        return {"accuracy": float("nan")}
    return metric.compute(predictions=predictions[keep], references=references[keep])


# Batch collator for language modelling; mlm=False turns off the masked-LM mode.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Gather everything the Trainer needs in one place, then build it.
trainer_options = dict(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_options)

print("Starting training")
trainer.train()

# `start` was set before the model was loaded, so this figure covers
# everything since then, not just the training loop.
elapsed = time.time() - start
print(f"Finished fine-tuning in {elapsed}")

In [None]:
# Persist the fine-tuned model and the tokenizer so they can be used for
# inference later. save_model() is called without a path; the training
# arguments were built with output_dir=new_paths[0], and the tokenizer is
# written to that same directory.
model_dir = new_paths[0]
trainer.save_model()
tokenizer.save_pretrained(model_dir)

# Next Steps - Inference 
Great job! You've trained your model, so now let's get to the fun part: inference. Head on over to `colab_expert_system_inference.ipynb` to test out your model.