In [10]:
# !pip install accelerate transformers
# !pip install sentencepiece
# !pip install datasets
# !pip install evaluate
# !pip install pytorch_lightning
# !pip install scikit-learn

In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

There was a problem when trying to write in your cache folder (/home/jupyter/.cache/huggingface/hub). You should set the environment variable TRANSFORMERS_CACHE to a writable directory.


In [4]:
model_id= "google/flan-t5-xl"

tokenizer = T5Tokenizer.from_pretrained(model_id)
# model = T5ForConditionalGeneration.from_pretrained(model_id, device_map="auto",load_in_8bit=True)


Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [11]:
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration  
from transformers import AdamW
import pandas as pd
import torch
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.nn.utils.rnn import pad_sequence
pl.seed_everything(100)
import warnings
warnings.filterwarnings("ignore")

Global seed set to 100


In [12]:
from datasets import Dataset, DatasetDict
import pandas as pd
import requests
import json

url = "https://huggingface.co/datasets/ehartford/samantha-data/resolve/main/samantha-1.1.json"
response = requests.get(url)
data = response.json()

json_data = pd.json_normalize(data)  # Adjust depending on the structure of your JSON data

print(json_data.head())


  id                                      conversations
0  0  [{'from': 'human', 'value': 'Hey Samantha, I'v...
1  1  [{'from': 'human', 'value': 'Hey Samantha, I'v...
2  2  [{'from': 'human', 'value': 'Hello Samantha, I...
3  3  [{'from': 'human', 'value': 'Hey Samantha, I'v...
4  4  [{'from': 'human', 'value': 'Hello Samantha, a...


In [13]:
questions = []
answers = []
# Iterate through the DataFrame to reformat the data
for _, row in json_data.iterrows():
    conversations = row['conversations']
    for i in range(0, len(conversations) - 1):
        if i + 1 < len(conversations):
            questions.append(conversations[i]['value'])
            answers.append(conversations[i + 1]['value'])


new_data = pd.DataFrame({'question': questions, 'answer': answers})
new_data.to_csv('reformatted_data_reversed.csv', index=False)


In [15]:
size=round(len(new_data)*0.5)
print(size)

31420


In [16]:
reduced_data = new_data[:size]

In [17]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
INPUT_MAX_LEN = 128 #input length
OUTPUT_MAX_LEN = 128 # output length
TRAIN_BATCH_SIZE = 2 # batch size of training
VAL_BATCH_SIZE = 2 # batch size for validation
EPOCHS = 5 # number of epoch

In [18]:
text = "Hello, how are you today?" 
input_tokenize = tokenizer( 
             text,
            add_special_tokens=True,        #Add Special tokens like [CLS] and [SEP]
            max_length=128,
            padding = 'max_length',         #for padding to max_length for equal sequence length
            truncation = True,              #truncate the text if it is greater than max_length
            return_attention_mask=True,     #will return attention mask
            return_tensors="pt"             #return tensor formate
        )

In [19]:

class T5Dataset:
    def __init__(self,question,answer):   

        self.question = question
        self.answer = answer
        self.tokenizer = tokenizer
        self.input_max_len = INPUT_MAX_LEN
        self.output_max_len = OUTPUT_MAX_LEN
  
    def __len__(self):                      # This method retrives the number of item from the dataset
        return len(self.question)

    def __getitem__(self,item):             # This method retrieves the item at the specified index item. 

        question = str(self.question[item])
        question = ''.join(question.split())

        answer = str(self.answer[item])
        answer = ''.join(answer.split())

        input_tokenize = self.tokenizer(      
                question,
                add_special_tokens=True,
                max_length=self.input_max_len,
                padding = 'max_length',
                truncation = True,
                return_attention_mask=True,
                return_tensors="pt"
            )
        output_tokenize = self.tokenizer(
                answer,
                add_special_tokens=True,
                max_length=self.output_max_len,
                padding = 'max_length',
                truncation = True,
                return_attention_mask=True,
                return_tensors="pt"

            )


        input_ids = input_tokenize["input_ids"].flatten()
        attention_mask = input_tokenize["attention_mask"].flatten()
        labels = output_tokenize['input_ids'].flatten()

        out = {
                'question':question,      
                'answer':answer,
                'input_ids': input_ids,
                'attention_mask':attention_mask,
                'target':labels
            }

        return out      

In [20]:
class T5DataLoad(pl.LightningDataModule):
    
    def __init__(self,df_train,df_test):
        super().__init__()
        self.df_train = df_train
        self.df_test = df_test
        self.tokenizer = tokenizer
        self.input_max_len = INPUT_MAX_LEN
        self.out_max_len = OUTPUT_MAX_LEN
    
    def setup(self, stage=None):
        
        self.train_data = T5Dataset(
            question = self.df_train.question.values,
            answer = self.df_train.answer.values
        )
        
        self.valid_data = T5Dataset(
            question = self.df_test.question.values,
            answer = self.df_test.answer.values
        )
    def train_dataloader(self):
        return torch.utils.data.DataLoader(
         self.train_data,
         batch_size= TRAIN_BATCH_SIZE,
         shuffle=True, 
         num_workers=2
        )
    def val_dataloader(self):
        return torch.utils.data.DataLoader(
        self.valid_data,
        batch_size= VAL_BATCH_SIZE,
        num_workers = 2
        )

In [21]:
class T5Model(pl.LightningModule):
    
    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(model_id, device_map="auto",load_in_8bit=True, return_dict=True)

        
    def forward(self, input_ids, attention_mask, labels=None):
        
        output = self.model(
        input_ids=input_ids, 
        attention_mask=attention_mask, 
        labels=labels
        )
        return output.loss, output.logits
    
    def training_step(self, batch, batch_idx):

        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels= batch["target"]
        loss, logits = self(input_ids , attention_mask, labels)

        
        self.log("train_loss", loss, prog_bar=True, logger=True)

        return {'loss': loss}
    
    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels= batch["target"]
        loss, logits = self(input_ids, attention_mask, labels)

        self.log("val_loss", loss, prog_bar=True, logger=True)
        
        return {'val_loss': loss}

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=0.0001)

In [22]:
from huggingface_hub import notebook_login
notebook_login() 

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
def run():
    df_train, df_test = train_test_split(reduced_data,test_size = 0.2, random_state=100)
    dataload = T5DataLoad(df_train,df_test)
    dataload.setup()
    device = DEVICE
    model = T5Model()
    model.to(device)
    
    checkpoint = ModelCheckpoint(
        dirpath="/home/jupyter/notebooks/flan_t5_model/",
        filename='best-model',
        save_top_k=2,
        verbose=True,
        monitor="val_loss",
        mode="min"
    )
    trainer = pl.Trainer(
        callbacks = checkpoint,
        max_epochs= 1,
        accelerator="gpu"
    )
    trainer.fit(model, dataload)


In [18]:
run()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 2.8 B 
-----------------------------------------------------
635 M     Trainable params
2.2 B     Non-trainable params
2.8 B     Total params
11,399.029Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 0, global step 7541: 'val_loss' reached 0.80771 (best 0.80771), saving model to '/home/jupyter/notebooks/flan_t5_model/best-model.ckpt' as top 2
`Trainer.fit` stopped: `max_epochs=1` reached.


In [None]:
train_model = T5Model.load_from_checkpoint('/home/jupyter/notebooks/flan_t5_model/best-model.ckpt')
train_model.freeze()

In [20]:
def generate_question(question):

    inputs_encoding =  tokenizer(
        question,
        add_special_tokens=True,
        max_length= INPUT_MAX_LEN,
        padding = 'max_length',
        truncation='only_first',
        return_attention_mask=True,
        return_tensors="pt"
        ).to(DEVICE)

    
    generate_ids = train_model.model.generate(
        input_ids = inputs_encoding["input_ids"],
        attention_mask = inputs_encoding["attention_mask"],
        max_length = INPUT_MAX_LEN,
        num_beams = 4,
        num_return_sequences = 1,
        no_repeat_ngram_size=2,
        early_stopping=True,
        ).to(DEVICE)

    preds = [
        tokenizer.decode(gen_id,
        skip_special_tokens=True, 
        clean_up_tokenization_spaces=True)
        for gen_id in generate_ids
    ]

    return "".join(preds)

In [21]:
ques = "hi, how are you doing?"
print("Ques: ",ques)
print("BOT: ",generate_question(ques))

Ques:  hi, how are you doing?
BOT:  I'mgladtoheartyou!It'sbeenapleasingtalkingtoyou,andI'malwaysheretodiscussanythingyou'dliketosharewithme.Rememberthatmypurposeistoprovidecompanionshipandemotionalsupportwithintheboundariesofourfriendship.
