# Installing the Required Libraries

In [1]:
!pip install trl

Collecting trl
  Downloading trl-0.12.1-py3-none-any.whl.metadata (10 kB)
Collecting datasets>=2.21.0 (from trl)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=2.21.0->trl)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets>=2.21.0->trl)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets>=2.21.0->trl)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets>=2.21.0->trl)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.12.1-py3-none-any.whl (310 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.9/310.9 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━

In [2]:
!pip install -U bitsandbytes
!pip install -U transformers


Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1
Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.46.3-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.46.2
    Uninstalling transformers-4.46.2:
      Successfully uninstalled

### Importing Required Libraries

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import Trainer, TrainingArguments
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset


### Importing the Dataset



In [4]:
from google.colab import files

# Upload files from the local machine into the Colab runtime.
# The recorded run uploaded process_train.csv and process_validation.csv,
# which are the two files the next cell reads.
uploaded = files.upload()

Saving process_train.csv to process_train.csv
Saving process_validation.csv to process_validation.csv


In [5]:
# Load both splits; an 'id' column is discarded when a file has one
# (errors='ignore' makes the drop a no-op otherwise).
train_data = pd.read_csv("process_train.csv").drop(columns=['id'], errors='ignore')
validation_data = pd.read_csv("process_validation.csv").drop(columns=['id'], errors='ignore')

# Preview the first rows of the validation split
validation_data.head()

Unnamed: 0,dialogue,summary,formatted
0,"edd: wow, did you hear that they're transferri...",rose and edd will be transferred to a new depa...,### Instruction:\nSummarize the following conv...
1,"tom: where is the ""sala del capitolo"" kevin: i...","""sala del capitolo"" tom is looking for is in t...",### Instruction:\nSummarize the following conv...
2,patricia: the rowing practice is cancelled! ka...,the rowing practice is cancelled. a few member...,### Instruction:\nSummarize the following conv...
3,"tom: u ok? alex: yeah, pretty good. u? tom: a...",tom and alex had fun last night. they drank a ...,### Instruction:\nSummarize the following conv...
4,"patricia: hello, here's the fair-trade brand i...",patricia recommends a fair-trade brand she tal...,### Instruction:\nSummarize the following conv...


In [6]:
# Wrap each pandas DataFrame in a Hugging Face Dataset
train_dataset, validation_dataset = (
    Dataset.from_pandas(frame) for frame in (train_data, validation_data)
)

# Preview the source training DataFrame (this displays the pandas frame,
# not the converted Dataset object)
train_data.head()

Unnamed: 0,dialogue,summary,formatted
0,violet: hi! i came across this austin's articl...,violet sent claire austin's article.,### Instruction:\nSummarize the following conv...
1,pat: so does anyone know when the stream is go...,pat and lou are waiting for the stream but kev...,### Instruction:\nSummarize the following conv...
2,jane: jane: whaddya think? shona: this ur tin...,jane is updating her tinder profile tonight an...,### Instruction:\nSummarize the following conv...
3,"adam: do u have a map of paris? tom: yes, why?...",tom has a map of paris.,### Instruction:\nSummarize the following conv...
4,"frank: hi, how's the family? mike: great! sam'...","mike is happy, because sam's moved out. mike a...",### Instruction:\nSummarize the following conv...


### Setting Tokenizer

In [None]:
# GPT2Tokenizer is not among the names imported in the notebook's import cell,
# so it is imported here; without this the cell raises NameError.
from transformers import GPT2Tokenizer

# Load the pretrained "gpt2" tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [None]:


# Reuse the end-of-sequence token as the padding token (no new token is added)
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right-hand side of each sequence
tokenizer.padding_side = "right"

# Preprocessing function for tokenization
def preprocess_function(examples, max_length=512):
    """
    Tokenize dialogues as model inputs and summaries as labels.

    Works both for a single example (``examples["dialogue"]`` is a string, e.g. a
    pandas row) and for a batch (``examples["dialogue"]`` is a list of strings, e.g.
    ``Dataset.map(..., batched=True)``).

    Args:
        examples: Mapping with "dialogue" and "summary" entries.
        max_length: Length every sequence is padded/truncated to (default 512).

    Returns:
        The tokenizer output for the dialogue(s) with an added "labels" entry holding
        the summary token ids. Padded label positions are set to -100 so that they are
        ignored by the loss instead of being trained on.
    """
    # Relies on the module-level `tokenizer` defined in an earlier cell.
    inputs = tokenizer(examples["dialogue"], padding="max_length", truncation=True, max_length=max_length)
    targets = tokenizer(examples["summary"], padding="max_length", truncation=True, max_length=max_length)

    def _mask_padding(token_ids, attention_mask):
        # Use the attention mask rather than the pad id: pad and EOS share one id here,
        # so comparing against the pad id would also hide genuine EOS tokens.
        return [tok if keep else -100 for tok, keep in zip(token_ids, attention_mask)]

    label_ids = targets["input_ids"]
    label_mask = targets["attention_mask"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id list per example
        labels = [_mask_padding(ids, mask) for ids, mask in zip(label_ids, label_mask)]
    else:
        # Single example: a flat list of ids
        labels = _mask_padding(label_ids, label_mask)

    inputs["labels"] = labels
    return inputs

# Apply the preprocessing to the Hugging Face datasets in batches.
# The tokenized training split gets its own name: overwriting `train_dataset` here
# (as a row-wise pandas `apply` did before) would replace the Dataset that the
# SFT trainer cell later reads through its "formatted" text column.
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = validation_dataset.map(preprocess_function, batched=True)

In [7]:
# Hub identifier of the Llama-2 7B base checkpoint
model_id = "NousResearch/Llama-2-7b-hf"

# bitsandbytes settings: 4-bit NF4 weights, double quantization, bfloat16 compute
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the quantized model first, then the matching tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Pad on the right, reusing the end-of-sequence token as the padding token
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Dump the model architecture and tokenizer settings for a visual check
print(model, tokenizer, sep="\n")

# Size the embedding matrix to the tokenizer's vocabulary; the returned
# embedding module is shown as the cell output
model.resize_token_embeddings(len(tokenizer))

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNor

Embedding(32000, 4096, padding_idx=0)

### Summarizing Before Fine-Tuning

In [8]:
# Position of the validation example to summarize
index = 0

# Pull the dialogue and its reference summary for that example
dialogue, summary = (
    validation_dataset[column][index] for column in ('dialogue', 'summary')
)

In [9]:
# Prompt layout: instruction, the dialogue under "### Input:", then an empty
# "### Summary:" slot for the model to continue.
prompt = f"""
Summarize the following conversation.

### Input:
{dialogue}

### Summary:
"""

# Tokenize and move the tensors to the device the model lives on, so that
# generate() is not handed inputs on a different device than the weights.
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
output = tokenizer.decode(
    model.generate(
        **inputs,  # passes the attention mask along with the input ids
        max_new_tokens=100,
        pad_token_id=tokenizer.pad_token_id,  # explicit, since pad and EOS share a token
    )[0],
    skip_special_tokens=True
)

# 99 dashes, the same separator the original join expression produced
dash_line = '-' * 99
print(dash_line)
print(f'Input Prompt:\n{prompt}')
print(dash_line)
print(f'Label Summar:\n{summary}\n')
print(dash_line)
print(f'Model Summary\n{output}')



---------------------------------------------------------------------------------------------------
Input Prompt:

Summarize the following conversation.

### Input:
edd: wow, did you hear that they're transferring us to a different department? rose: whaaaaat :o rose: no! where'd you hear that? edd: well, it's quite official edd: anderson just told us rose: and do you know what it changes for us? edd: they won't change the professors edd: but i know the paperwork will get trickier rose: and i guess that is a move that is supposed to make everything easier edd: yeah, guess so edd: they have a funny way of understanding 'to make things easier'

### Summary:

---------------------------------------------------------------------------------------------------
Label Summar:
rose and edd will be transferred to a new department. their professors will not change but paperwork will become more difficult.

--------------------------------------------------------------------------------------------

In [10]:
def print_trainable_parameters(model):
    """
    Print the number of trainable parameters in the model.

    Iterates over ``model.named_parameters()`` and prints a single line of the form
    ``trainable params: T || all params: A || trainable%: P``.

    Parameters whose class is named ``Params4bit`` (bitsandbytes 4-bit weights) are
    stored packed, so their ``numel()`` is scaled by ``2 * element_size()`` to report
    the logical weight count; this mirrors the accounting in PEFT's own
    ``print_trainable_parameters``. A model without parameters reports 0.0 percent
    instead of raising ``ZeroDivisionError``.

    Args:
        model: Any object exposing ``named_parameters()`` (e.g. a ``torch.nn.Module``).
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        if param.__class__.__name__ == "Params4bit":
            # Packed 4-bit storage: two weights per stored byte
            num_params *= 2 * param.element_size()

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    # Guard against a parameterless model
    percentage = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percentage}"
    )

In [11]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing before handing the model to PEFT's k-bit helper.
model.gradient_checkpointing_enable()
# Rebind `model` to the result of PEFT's preparation step for the quantized model.
# NOTE(review): the helper may enable gradient checkpointing itself, which would make
# the explicit call above redundant — confirm against the PEFT docs.
model = prepare_model_for_kbit_training(model)

In [12]:
# Print the module tree again after the k-bit preparation step for inspection
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNor

In [14]:
from peft import LoraConfig, get_peft_model

# LoRA adapter hyper-parameters for Llama-2-7B
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    # The four attention projection layers listed in the printed model
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=16,
    lora_alpha=64,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the prepared model with the LoRA configuration
model = get_peft_model(model, lora_config)

# Report how many parameters are trainable after wrapping
model.print_trainable_parameters()


trainable params: 16,777,216 || all params: 6,755,192,832 || trainable%: 0.2484


In [16]:

# Define output directory for trainer checkpoints.
# Every other path in this notebook lives under Colab's /content directory; the
# previous value "/context" appears to have been a typo for it.
OUTPUT_DIR = "/content"

In [19]:
# Hyper-parameters for the fine-tuning run; passed to the trainer as `args`.
training_arguments = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # 4 x 4 = 16 sequences per optimizer step per device
    optim="paged_adamw_32bit",
    logging_dir="./logs",  # Log directory for TensorBoard
    logging_strategy="epoch",  # Log after each epoch
    learning_rate=1e-4,
    fp16=True,
    max_grad_norm=0.3,
    num_train_epochs=1,
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in recent transformers releases.
    eval_strategy="epoch",  # Evaluate after each epoch
    warmup_ratio=0.05,
    save_strategy="epoch",  # Save checkpoint after each epoch
    group_by_length=True,
    output_dir=OUTPUT_DIR,
    report_to="tensorboard",
    save_safetensors=True,
    lr_scheduler_type="cosine",
    seed=42,
)

# Turn off the model's generation cache flag for the training run
model.config.use_cache = False



### Training the Model

In [20]:
from trl import SFTTrainer

# Supervised fine-tuning on the ready-made prompt text in the "formatted" column,
# training on the train split and evaluating on the validation split.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    peft_config=lora_config,
    dataset_text_field="formatted",
    max_seq_length=1024,
)

# Run the training loop; the returned TrainOutput is shown as the cell result
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,1.6087,1.594861


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=250, training_loss=1.608736572265625, metrics={'train_runtime': 3942.6722, 'train_samples_per_second': 1.015, 'train_steps_per_second': 0.063, 'total_flos': 3.377729300280115e+16, 'train_loss': 1.608736572265625, 'epoch': 1.0})

# Saving the Model

In [21]:
from pathlib import Path

# Target folders for the fine-tuned model files and the tokenizer files
model_dir = "/content/Trained Model/model"
tokenizer_dir = "/content/Trained Model/tokenizer"

# Make sure both folders (and their parents) exist before writing to them
for target_dir in (model_dir, tokenizer_dir):
    Path(target_dir).mkdir(parents=True, exist_ok=True)

# Write the model and the tokenizer into their respective folders
model.save_pretrained(model_dir)
tokenizer.save_pretrained(tokenizer_dir)

print(f"Model saved to: {model_dir}")
print(f"Tokenizer saved to: {tokenizer_dir}")


Model saved to: /content/Trained Model/model
Tokenizer saved to: /content/Trained Model/tokenizer


In [22]:
from google.colab import drive
# Mount Google Drive at /content/drive so the next cell can copy the saved files there
drive.mount('/content/drive')

Mounted at /content/drive


In [23]:
import shutil

# Local export folder and the Google Drive folder it is backed up to
source = "/content/Trained Model"
destination = "/content/drive/MyDrive/models/text-summarization"

# Copy the whole tree; dirs_exist_ok lets the copy proceed when the destination already exists
shutil.copytree(src=source, dst=destination, dirs_exist_ok=True)

print(f"Folder copied from {source} to {destination}")


Folder copied from /content/Trained Model to /content/drive/MyDrive/models/text-summarization


# Testing

In [24]:
# Validation example to summarize with the fine-tuned model
index = 0

dialogue = validation_dataset['dialogue'][index]
summary =  validation_dataset['summary'][index]

# Same prompt layout as the pre-training check: instruction, dialogue under
# "### Input:", then an empty "### Summary:" slot for the model to continue.
prompt = f"""
Summarize the following conversation.

### Input:
{dialogue}

### Summary:
"""

# Make sure the model is in inference mode after training (dropout disabled)
model.eval()

# Tokenize and move the tensors to the device the model lives on, so that
# generate() is not handed inputs on a different device than the weights.
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
output = tokenizer.decode(
    model.generate(
        **inputs,  # passes the attention mask along with the input ids
        max_new_tokens=100,
        pad_token_id=tokenizer.pad_token_id,  # explicit, since pad and EOS share a token
    )[0],
    skip_special_tokens=True
)

# 99 dashes, the same separator the original join expression produced
dash_line = '-' * 99
print(dash_line)
print(f'Input Prompt:\n{prompt}')
print(dash_line)
print(f'Label Summar:\n{summary}\n')
print(dash_line)
print(f'Model Summary\n{output}')



---------------------------------------------------------------------------------------------------
Input Prompt:

Summarize the following conversation.

### Input:
edd: wow, did you hear that they're transferring us to a different department? rose: whaaaaat :o rose: no! where'd you hear that? edd: well, it's quite official edd: anderson just told us rose: and do you know what it changes for us? edd: they won't change the professors edd: but i know the paperwork will get trickier rose: and i guess that is a move that is supposed to make everything easier edd: yeah, guess so edd: they have a funny way of understanding 'to make things easier'

### Summary:

---------------------------------------------------------------------------------------------------
Label Summar:
rose and edd will be transferred to a new department. their professors will not change but paperwork will become more difficult.

--------------------------------------------------------------------------------------------