In [5]:
!pip install trl



In [6]:
!pip install -U bitsandbytes
!pip install -U transformers




###Importing Required Libraries

In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import Trainer, TrainingArguments
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset


###Importing Dataset



In [8]:
from google.colab import files

# Upload a file from local machine
uploaded = files.upload()

In [33]:
train_data = pd.read_csv("process_train.csv")
validation_data = pd.read_csv("process_validation.csv")

# Remove the 'id' column from train_data and validation_data
train_data = train_data.drop(columns=['id'], errors='ignore')
validation_data = validation_data.drop(columns=['id'], errors='ignore')
# Display a sample
validation_data.head()

Unnamed: 0,dialogue,summary,formatted
0,"edd: wow, did you hear that they're transferri...",rose and edd will be transferred to a new depa...,### Instruction:\nSummarize the following conv...
1,"tom: where is the ""sala del capitolo"" kevin: i...","""sala del capitolo"" tom is looking for is in t...",### Instruction:\nSummarize the following conv...
2,patricia: the rowing practice is cancelled! ka...,the rowing practice is cancelled. a few member...,### Instruction:\nSummarize the following conv...
3,"tom: u ok? alex: yeah, pretty good. u? tom: a...",tom and alex had fun last night. they drank a ...,### Instruction:\nSummarize the following conv...
4,"patricia: hello, here's the fair-trade brand i...",patricia recommends a fair-trade brand she tal...,### Instruction:\nSummarize the following conv...


In [34]:
# Convert DataFrames to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_data)
validation_dataset = Dataset.from_pandas(validation_data)

# Check that the conversion was successful
train_data.head()

Unnamed: 0,dialogue,summary,formatted
0,violet: hi! i came across this austin's articl...,violet sent claire austin's article.,### Instruction:\nSummarize the following conv...
1,pat: so does anyone know when the stream is go...,pat and lou are waiting for the stream but kev...,### Instruction:\nSummarize the following conv...
2,jane: jane: whaddya think? shona: this ur tin...,jane is updating her tinder profile tonight an...,### Instruction:\nSummarize the following conv...
3,"adam: do u have a map of paris? tom: yes, why?...",tom has a map of paris.,### Instruction:\nSummarize the following conv...
4,"frank: hi, how's the family? mike: great! sam'...","mike is happy, because sam's moved out. mike a...",### Instruction:\nSummarize the following conv...


### Setting Tokenizer

In [11]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [12]:


# Add a new padding token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Preprocessing function for tokenization
def preprocess_function(examples):
    # Tokenize the dialogue and summary
    inputs = tokenizer(examples["dialogue"], padding="max_length", truncation=True, max_length=512)
    targets = tokenizer(examples["summary"], padding="max_length", truncation=True, max_length=512)
    inputs["labels"] = targets["input_ids"]
    return inputs

# Apply the preprocessing
train_dataset = train_data.apply(preprocess_function, axis=1)
val_dataset = validation_data.apply(preprocess_function, axis=1)

In [35]:
# Define the model ID for GPT-2
model_id = "gpt2"

# Configure the quantization (if applicable)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the model with the quantization configuration and device mapping
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Set the padding token for the tokenizer
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Print model and tokenizer details for verification
print(model)
print(tokenizer)

model.resize_token_embeddings(len(tokenizer))

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Linear4bit(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear4bit(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Linear4bit(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear4bit(in_features=3072, out_features=768, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_a

Embedding(50257, 768)

###Summarizing before predicting

In [16]:
index = 0

dialogue = validation_dataset['dialogue'][index]
summary =  validation_dataset['summary'][index]

In [17]:
prompt = f"""
Summarize the following conversation.

### Input:
{dialogue}

### Summary:
"""

inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens=100,
    )[0],
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'Input Prompt:\n{prompt}')
print(dash_line)
print(f'Label Summar:\n{summary}\n')
print(dash_line)
print(f'Model Summary\n{output}')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


---------------------------------------------------------------------------------------------------
Input Prompt:

Summarize the following conversation.

### Input:
edd: wow, did you hear that they're transferring us to a different department? rose: whaaaaat :o rose: no! where'd you hear that? edd: well, it's quite official edd: anderson just told us rose: and do you know what it changes for us? edd: they won't change the professors edd: but i know the paperwork will get trickier rose: and i guess that is a move that is supposed to make everything easier edd: yeah, guess so edd: they have a funny way of understanding 'to make things easier'

### Summary:

---------------------------------------------------------------------------------------------------
Label Summar:
rose and edd will be transferred to a new department. their professors will not change but paperwork will become more difficult.

--------------------------------------------------------------------------------------------

In [18]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():

        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [36]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [20]:
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Linear4bit(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear4bit(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Linear4bit(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear4bit(in_features=3072, out_features=768, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_a

In [37]:
from peft import LoraConfig, get_peft_model
# Define Lora configuration for GPT2
lora_config = LoraConfig(
    r=16,
    lora_alpha=64,
    target_modules=["attn.c_attn", "attn.c_proj", "mlp.c_fc", "mlp.c_proj"],  # Adapted for GPT-2 architecture
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# Get the PEFT model using the configuration
model = get_peft_model(model, lora_config)

# Print the number of trainable parameters to verify
model.print_trainable_parameters()


trainable params: 2,359,296 || all params: 126,799,104 || trainable%: 1.8607


In [38]:
training_arguments = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    logging_dir="./logs",  # Log directory for TensorBoard
    logging_strategy="epoch",  # Log after each epoch
    learning_rate=1e-4,
    fp16=True,
    max_grad_norm=0.3,
    num_train_epochs=5,
    evaluation_strategy="epoch",  # Evaluate after each epoch
    warmup_ratio=0.05,
    save_strategy="epoch",  # Save checkpoint after each epoch
    group_by_length=True,
    output_dir=OUTPUT_DIR,
    report_to="tensorboard",
    save_safetensors=True,
    lr_scheduler_type="cosine",
    seed=42,
)

model.config.use_cache = False



### Training the Model

In [39]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    peft_config=lora_config,
    dataset_text_field="formatted",
    max_seq_length=1024,
    args=training_arguments,
)

trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,2.8974,2.507707
2,2.568,2.458498
3,2.5189,2.435549
4,2.4922,2.423244
5,2.4816,2.422737


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=1250, training_loss=2.591608740234375, metrics={'train_runtime': 975.179, 'train_samples_per_second': 20.509, 'train_steps_per_second': 1.282, 'total_flos': 2020040213889024.0, 'train_loss': 2.591608740234375, 'epoch': 5.0})

In [40]:
from pathlib import Path

# Define the directories for saving
model_dir = "/content/Trained Model/model"
tokenizer_dir = "/content/Trained Model/tokenizer"

# Create the directories if they don't exist
Path(model_dir).mkdir(parents=True, exist_ok=True)
Path(tokenizer_dir).mkdir(parents=True, exist_ok=True)

# Save the model
model.save_pretrained(model_dir)

# Save the tokenizer
tokenizer.save_pretrained(tokenizer_dir)

print(f"Model saved to: {model_dir}")
print(f"Tokenizer saved to: {tokenizer_dir}")


Model saved to: /content/Trained Model/model
Tokenizer saved to: /content/Trained Model/tokenizer


In [41]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [42]:
import shutil

# Source and destination paths
source = "/content/Trained Model"
destination = "/content/drive/MyDrive/GPT2Model"

# Copy the folder
shutil.copytree(source, destination, dirs_exist_ok=True)

print(f"Folder copied from {source} to {destination}")


Folder copied from /content/Trained Model to /content/drive/MyDrive/GPT2Model


In [43]:
index = 0

dialogue = validation_dataset['dialogue'][index]
summary =  validation_dataset['summary'][index]

prompt = f"""
Summarize the following conversation.

### Input:
{dialogue}

### Summary:
"""

inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens=100,
    )[0],
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'Input Prompt:\n{prompt}')
print(dash_line)
print(f'Label Summar:\n{summary}\n')
print(dash_line)
print(f'Model Summary\n{output}')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


---------------------------------------------------------------------------------------------------
Input Prompt:

Summarize the following conversation.

### Input:
edd: wow, did you hear that they're transferring us to a different department? rose: whaaaaat :o rose: no! where'd you hear that? edd: well, it's quite official edd: anderson just told us rose: and do you know what it changes for us? edd: they won't change the professors edd: but i know the paperwork will get trickier rose: and i guess that is a move that is supposed to make everything easier edd: yeah, guess so edd: they have a funny way of understanding 'to make things easier'

### Summary:

---------------------------------------------------------------------------------------------------
Label Summar:
rose and edd will be transferred to a new department. their professors will not change but paperwork will become more difficult.

--------------------------------------------------------------------------------------------