In [1]:
# This Notebook is based on a clone of
# Joao Fernando Ferreira Goncalves
# original notebook that can be found here:
# https://github.com/Joaoffg/AISocIMP23/
#
# See original notebook here:
# https://colab.research.google.com/drive/1ya-NLUyfbs0lZJu1pQIkhoA4POtTsVkv?usp=sharing
#
# Changes to the original notebook was made, according to
# to talks with Joao, after Mads and I visited
# Rotterdam Univerity in January 2024.
#
# March 4, 2024
# sila

# Training a language model using Transformers

Training a language model is a more advanced form of machine learning,
therefore we need to install some libraries that are not default for Google Colab.
We also clone our GitHub repository as usual.

In [2]:
!pip install datasets
!pip install transformers
!pip install sentencepiece
!pip install accelerate
!git clone https://github.com/Joaoffg/AISocIMP23/

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.18.0 dill-0.3.8 multiprocess-0.70.16
Collecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Installing collected pa

After installing the libraries, we need to import them so we can use them in out code.

In [3]:
from datasets import load_dataset
import re
import os
import torch
from transformers import LlamaTokenizer, LlamaConfig, LlamaModel, LlamaForCausalLM, Trainer, TrainingArguments

In [5]:
# Now you must add your own documents to the folder Texts.
# Here I have added a text about Proshop

In [6]:
%ls "/content/AISocIMP23/Week 4/Texts"

 Proshop.txt  'ThesisDraft_MariaPalaciosBarea_622509 (1).txt'


This part loads all of the text data that you will use to train the language model.

In [7]:
dataset = load_dataset("text",
                       data_dir="/content/AISocIMP23/Week 4/Texts")


Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
# What you must do:
# 1. Try to run the complete notebook based om this text.
# 2. Then re-run notebook, but now with more texts added, and see what
#     happens as information from these new texts books are queried.
#
# Now we should be ready to the second part.

This part tokenizes the dataset, which means that it converts all of the words into numbers that can be processed by the neural network.

In [10]:
tokenizer = LlamaTokenizer.from_pretrained('/content/AISocIMP23/Week 4/Token')

def chunk_examples(examples,chunk_lenght=128, min_chunk_lenght = 25):
    chunks = []
    for text in examples["text"]:
        tokenized = tokenizer(text,add_special_tokens=False)
        if len(tokenized.input_ids) > min_chunk_lenght:
            input_ids = [tokenizer.bos_token_id] + tokenized.input_ids + [tokenizer.eos_token_id]
            attention_mask = [1] + tokenized.attention_mask + [1]
            for i in range(0, len(tokenized.input_ids), chunk_lenght):
                cunk_input_ids = input_ids[i:i + chunk_lenght]
                cunk_att_mask = attention_mask[i:i + chunk_lenght]
                cur_chunk_len = len(cunk_input_ids)

                if  cur_chunk_len < chunk_lenght:
                    cunk_input_ids = cunk_input_ids + [tokenizer.eos_token_id]*(chunk_lenght - cur_chunk_len)
                    cunk_att_mask = cunk_att_mask + [0]*(chunk_lenght - cur_chunk_len)

                chunks += [{"input_ids":torch.tensor(cunk_input_ids),
                        "attention_mask": torch.tensor(cunk_att_mask),
                        "labels": torch.tensor(cunk_input_ids)}
                        #"raw":tokenizer.decode(cunk_input_ids)}

                        ]


    return {"chunks": chunks}


chunked_dataset = dataset.map(chunk_examples, batched=True, remove_columns=['text'])

Here we define our model architecture, we are using a LlaMa based model for this exercise. You can change the complexity factor below to make the model more simple or more complex.

In [11]:
complexity_reduction=2

config = LlamaConfig(
    vocab_size = 32000,
    hidden_size= int(2048/complexity_reduction),
    intermediate_size = int(5120/complexity_reduction),
    num_hidden_layers = int(16/complexity_reduction),
    num_attention_heads = int(16/complexity_reduction),
    max_position_embeddings = 2048 ,
    rms_norm_eps = 1e-12
)

model = LlamaForCausalLM(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"LlaMa Model Size: {model_size/1000**2:.1f}M parameters")

tokenizer.pad_token = tokenizer.eos_token

LlaMa Model Size: 162.0M parameters


Here you define the training arguments. You can ignore most of them, they are default values, but you may want to tweak the batch_sizes and the learning rate.

In [12]:
args = TrainingArguments(
    output_dir="erasmian-lm/medium",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    evaluation_strategy="no",
    eval_steps=5_000,
    logging_steps=5_000,
    num_train_epochs=5,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=0.0001,
    save_steps=5_000,
    fp16=True,
    save_strategy = "epoch", #save only latest model at end of epoch instead of 5k steps.
    save_total_limit = 1 # save only latest 3 epochs
)


In [13]:
# Note: Try num_train_epochs=5, for improved performance.

This step trains the model, exciting!

In [14]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=chunked_dataset['train']['chunks']

)

model = torch.compile(model)
trainer.train()

Step,Training Loss


TrainOutput(global_step=2970, training_loss=3.0138303609006734, metrics={'train_runtime': 692.3717, 'train_samples_per_second': 4.29, 'train_steps_per_second': 4.29, 'total_flos': 294824116224000.0, 'train_loss': 3.0138303609006734, 'epoch': 5.0})

And here you can test how the model is actually performing by generating some text. You can write the start of the text between "" in the input_text field.

In [56]:
from transformers import GenerationConfig

input_text= "Proshop is"

generation_config = GenerationConfig(
    temperature=1,
    top_p=0.95, #0.95
    top_k=50,#50
    repetition_penalty=1,
    do_sample=True,
    num_return_sequences=1
)

inputs = tokenizer(input_text, return_tensors="pt")
inputs = inputs.to("cuda:0")
model = model.to("cuda:0")
outputs = model.generate(**inputs, num_beams=1, do_sample=True, max_length=128)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['Proshop is anshop a system’s sells products, the Netherlandsing between strength and household in the warehouse. This can be largely products as being more than other, but it is in the same, not require things than being considered a man.']


In [57]:
# Now try to change the various settings, parameters.
#
# E.g.
# Asking what 'Proshop is' now gives a somewhat better answer:
# ['Proshop is a/s sells products in eight countries: Denmark, Norway, Austria, Austria, where not be not by it.']
# 'Proshop a/s sells products' gives:
# ['Proshop a/s sells products in eight countries: Denmark, Norway, Austria, the results in a degree of skill and gender.']
# Changing temperature to 0.1, top_p = 0.99, top_k = 10
# makes the answer less imaginative, and is gives:
#['Proshop a/s sells products in eight countries: Denmark, Finland, Sweden, Poland, co and Germany. The company.']
# Raising the temperature to 10 gives:
# ['Proshop a/s sells products in higher countries: Denmark: Denmark, Finland, the person, Finland, aiming to find women who are “ay, co.']
# Changing temperature to 1, top_p = 0.99, top_k = 1 gives:
# I.e. By specifying a Top K of 50, we’re saying, -Only look at the best 50 tokens-
# Top K lets us limit how many options we consider while sampling.
# Top P says, - Only consider the possibilities that equal or exceed this value.- It gives:
# ['Proshop a/s sells products in eight countries: Denmark, Norway, though, Austria, Austria, the Netherlands and Germany.']

# With - Proshop is an online -
# temperature 0.1, top_p = 0.99, top_k = 1
# it gives: ['Proshop is an online shop that sellsells Aar ']
# Well, at least shop looked ok.
#
# Resetting to 'Proshop is"
# temperature 1, top_p = 0.95, top_k = 50
# we get:
# ['Proshop is anshop a system’s sells products, the Netherlandsing between strength and household in the warehouse.']
# At least there is something about shop, sell products and warehouse in it !

In [60]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [61]:
!nvidia-smi

Mon Mar  4 16:17:52 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   74C    P0              33W /  70W |   3555MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    