In [1]:
import pandas as pd

df = pd.read_json("hf://datasets/toughdata/quora-question-answer-dataset/Quora-QuAD.jsonl", lines=True) #pd.read_csv('qa_data.csv') #

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df.head()

Unnamed: 0,question,answer
0,Why whenever I get in the shower my girlfriend...,Isn’t it awful? You would swear that there was...
1,"What is a proxy, and how can I use one?",A proxy server is a system or router that prov...
2,"What song has the lyrics ""someone left the cak...",MacArthur's Park\n
3,I am the owner of an adult website called http...,Don't let apps that are liers put adds on your...
4,Does the Bible mention anything about a place ...,St. John in the book of Revelation mentions an...


In [3]:
# Data preprocessing

import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# init lemmatizer
lemmatizer = WordNetLemmatizer()
# Regular expression pattern for matching URLs
url_pattern = r'http\S+|www\S+|https\S+'

def clean_text(text):
    
    # Lemmatization
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    
    # Remove emojis, Hindi characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    
    # Remove URLs from text
    text = re.sub(url_pattern, '', text, flags=re.MULTILINE)

    return text

df['question'] = df['question'].apply(clean_text)
df['answer'] = df['answer'].apply(clean_text)



In [5]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration

class QADataset(Dataset):
    def __init__(self, questions, answers, tokenizer, max_length):
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        answer = self.answers[idx]

        # Tokenize inputs and labels
        input_encoding = self.tokenizer(
            question,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        target_encoding = self.tokenizer(
            answer,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        item = {
            'input_ids': torch.squeeze(input_encoding['input_ids']),
            'attention_mask': torch.squeeze(input_encoding['attention_mask']),
            'labels': torch.squeeze(target_encoding['input_ids'])
        }

        return item


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Split dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2)

# Initialize tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small', use_fast=True)
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Create datasets and dataloaders
max_length = 128
train_dataset = QADataset(
    train_df['question'].tolist(),
    train_df['answer'].tolist(),
    tokenizer,
    max_length
)
val_dataset = QADataset(
    val_df['question'].tolist(),
    val_df['answer'].tolist(),
    tokenizer,
    max_length
)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Set device
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-3)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
print("CUDA Available:", torch.cuda.is_available())

CUDA Available: True


In [8]:
# Training loop
num_epochs = 4
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        inputs = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device),
            'labels': batch['labels'].to(device)
        }
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{num_epochs} completed")

    # Validation loop
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            inputs = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device),
                'labels': batch['labels'].to(device)
            }
            
            outputs = model(**inputs)
            loss = outputs.loss
            val_loss += loss.item()
    
    print(f"Validation Loss after Epoch {epoch + 1}: {val_loss / len(val_loader)}")

Epoch 1/4 completed
Validation Loss after Epoch 1: 2.489588534865771
Epoch 2/4 completed
Validation Loss after Epoch 2: 2.4361423161144633
Epoch 3/4 completed
Validation Loss after Epoch 3: 2.4125878017279647
Epoch 4/4 completed
Validation Loss after Epoch 4: 2.396877258425056


In [20]:
model.save_pretrained("fine-tuned-t5")
tokenizer.save_pretrained("fine-tuned-t5")

('fine-tuned-t5\\tokenizer_config.json',
 'fine-tuned-t5\\special_tokens_map.json',
 'fine-tuned-t5\\spiece.model',
 'fine-tuned-t5\\added_tokens.json')

In [19]:
question = "what industry need lead generation and marketing"

# Encode the input
input_ids = tokenizer.encode(question, return_tensors='pt').to(device)
# Generate the answer
outputs = model.generate(input_ids, max_length=100, early_stopping=True)

# Decode the generated answer
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Answer:", answer)

Answer: lead generation and marketing are essential for various industries, a it involves a wide range of b2b (business-to-business) services, a it involves a combination of a wide range of b2b (business-to-business) services, a it involves a combination of a wide range of b2b (business-to-business) services, a it involves a combination of a wide range of b
