In [1]:
! pip install transformers datasets trl torch peft

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting trl
  Downloading trl-0.12.2-py3-none-any.whl.metadata (11 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.12.2-py3-none-any.whl (365 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.7/365.7 kB[0m [31m31.4 MB/s[0

In [2]:
# from datasets import load_dataset
# from trl import SFTConfig, SFTTrainer
# from peft import LoraConfig

# dataset = load_dataset("stanfordnlp/imdb", split="train")

# peft_config = LoraConfig(
#     r=16,
#     lora_alpha=32,
#     lora_dropout=0.05,
#     bias="none",
#     task_type="CAUSAL_LM",
# )

# trainer = SFTTrainer(
#     "EleutherAI/gpt-neo-125m",
#     train_dataset=dataset,
#     args=SFTConfig(output_dir="/tmp",report_to= 'none'),
#     peft_config=peft_config,
# )

# trainer.train()

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import Dataset
from peft import get_peft_model, LoraConfig, TaskType

In [4]:
# 1. Load Pre-trained Model and Tokenizer
model_name = "gpt2"  # Can be any Hugging Face causal language model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Set a padding token for GPT-2
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [6]:
# 2. Define PEFT Configuration (LoRA)
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,          # Rank of LoRA matrices
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["c_attn"],  # Adjusted for GPT-2
)
model = get_peft_model(model, peft_config)



In [7]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2SdpaAttention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
       

In [8]:
# 3. Prepare Custom Dataset
data_1 =  {
    "text": [
        "Artificial intelligence is revolutionizing many industries by automating tasks and improving decision-making.",
        "Machine learning models are used in applications like predictive analytics and natural language processing.",
        "AI can assist in various sectors such as healthcare, finance, and transportation by analyzing large datasets.",
        "Deep learning techniques are powering advancements in computer vision, enabling machines to interpret images and video.",
        "AI technologies like robotics are being deployed to perform tasks that were once considered too complex for machines."
    ]
}

dataset_1 = Dataset.from_dict(data_1)

In [9]:
data_2 = {
    "text": [
        "AI is transforming oncology by helping in the early detection of cancer through advanced imaging techniques.",
        "Machine learning algorithms are increasingly used to analyze tumor biopsies and predict cancer progression.",
        "In precision oncology, AI is utilized to create personalized treatment plans based on a patient's genetic profile.",
        "AI-driven systems are improving the accuracy of radiology by identifying tumors in medical scans like CT and MRI images.",
        "Artificial intelligence is accelerating drug discovery, identifying potential cancer therapies faster than traditional methods."
    ]
}

dataset_2 = Dataset.from_dict(data_2)

In [10]:
def tokenize_function(example):
    tokenized = tokenizer(example["text"], padding="max_length", truncation=True, max_length=32)
    tokenized["labels"] = tokenized["input_ids"].copy()  # Add labels for loss computation
    return tokenized

In [11]:
# 4. Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=3,
    num_train_epochs=3,
    logging_dir="./logs",
    save_strategy="no",  # Skip saving for simplicity
    learning_rate=2e-4,
    fp16=True,
    report_to= 'none',
)

In [12]:
tokenized_dataset1 = dataset_1.map(tokenize_function, batched=True)
tokenized_dataset2 = dataset_2.map(tokenize_function, batched=True)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [13]:
tokenized_dataset2

Dataset({
    features: ['text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 5
})

In [14]:
# 5. Fine-Tune the Model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset2,  # Ensure pad token is properly handled
)
trainer.train()

Step,Training Loss


TrainOutput(global_step=6, training_loss=6.141793568929036, metrics={'train_runtime': 2.4045, 'train_samples_per_second': 6.238, 'train_steps_per_second': 2.495, 'total_flos': 245810626560.0, 'train_loss': 6.141793568929036, 'epoch': 3.0})

In [15]:
# 6. Perform Inference
# model.eval()
# input_text = "AI is transforming the "
# inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
# output = model.generate(inputs["input_ids"],
#                         max_length=50,
#                         num_return_sequences=1,
#                         temperature=0.7,  # Controls randomness in output; lower = more deterministic
#                         top_p=0.9,  # Nucleus sampling: restricts the next token to a top-probability set
#                         no_repeat_ngram_size=2,  # Prevents repetition of n-grams
#                         do_sample=True  # Enables sampling instead of greedy decoding (which is deterministic)
#                         )
# print(tokenizer.decode(output[0], skip_special_tokens=True))

To resolve the warning, you need to explicitly provide an attention_mask along with the input IDs during inference. Additionally, you should set pad_token_id explicitly for text generation.

In [16]:
# 6. Perform Inference
model.eval()
input_text = "AI in oncology"
inputs = tokenizer(input_text, return_tensors="pt", padding=True).to("cuda")

# Explicitly set pad_token_id
# Set parameters to influence generation quality
output = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],  # Pass attention mask
    max_length=100,  # Limit the length to avoid overly verbose answers
    num_return_sequences=2,  # Generate multiple options for comparison
    pad_token_id=tokenizer.eos_token_id,  # Ensure pad token is properly handled
    temperature=0.7,  # Adjust temperature for controlled randomness (lower means less randomness)
    top_p=0.9,  # Increase top_p for more diverse but focused output
    top_k=50,  # Top-k sampling limits the pool of options for better quality
    no_repeat_ngram_size=2,  # Prevent repeating n-grams
    do_sample=True,  # Enable sampling for more creative responses
)

# Decode tokens and print the output
print("Generated Responses:")
for i, generated_sequence in enumerate(output):
    print(f"Option {i+1}: {tokenizer.decode(generated_sequence, skip_special_tokens=True)}\n")

Generated Responses:
Option 1: AI in oncology.

The following is a quick summary of the information we have gathered on the subject. We will not be making any statements about the efficacy of drugs. Instead, we will be providing you with information about what's in your system. As always, our goal is to provide a comprehensive overview of this topic, so you can make informed decisions. However, you may wish to consult with your healthcare provider before taking any medication.

Option 2: AI in oncology and neurophysiology.

"The neurobiological basis of Parkinson's disease is poorly understood and largely unknown. We are now able to investigate the mechanisms of neurodegeneration in the mouse model of the disease and to establish the role of dopamine and norepinephrine in controlling neuropathology," says Dr. John M. A. Bickley, PhD, from the Department of Neurobiology and Neurosciences at the University of Oxford and the lead author

