<a href="https://colab.research.google.com/github/Pratik-Nikam/MachineLearning/blob/main/HDFC_FAQ_T5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import tensorflow as tf
from transformers import T5Tokenizer, TFT5ForConditionalGeneration
from sklearn.model_selection import train_test_split
import time

# Report how many GPU devices TensorFlow can see in this runtime
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))


In [None]:

# Load data
# The file is parsed as JSON and iterated as a sequence of records, each of
# which must provide a 'question' and an 'answer' key.
# The encoding is given explicitly: JSON text is UTF-8, and relying on the
# locale default can mis-decode non-ASCII characters on some platforms.
with open('/content/sample_data/HDFC_Faq.txt', 'r', encoding='utf-8') as f:
    data = json.load(f)
questions = [entry['question'] for entry in data]
answers = [entry['answer'] for entry in data]

# Split data: hold out 20% of the question/answer pairs, then halve that
# hold-out into validation and test sets. Both splits use a fixed seed.
holdout_fraction = 0.2
train_questions, temp_questions, train_answers, temp_answers = train_test_split(
    questions,
    answers,
    test_size=holdout_fraction,
    random_state=42,
)
val_questions, test_questions, val_answers, test_answers = train_test_split(
    temp_questions,
    temp_answers,
    test_size=0.5,
    random_state=42,
)

# Choose a shared sequence length: the longest tokenized question or answer
# plus a margin of 10 tokens, capped at 512.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
tokenized_lengths = [len(tokenizer.encode(text)) for text in questions + answers]
longest_sequence = max(tokenized_lengths)
max_length = min(longest_sequence + 10, 512)
print(f"Using max_length: {max_length}")

# Tokenize questions (encoder inputs) and answers (decoder targets)
train_encodings = tokenizer(train_questions, padding="max_length", truncation=True, max_length=max_length, return_tensors="tf")
train_labels_encodings = tokenizer(train_answers, padding="max_length", truncation=True, max_length=max_length, return_tensors="tf")
val_encodings = tokenizer(val_questions, padding="max_length", truncation=True, max_length=max_length, return_tensors="tf")
val_labels_encodings = tokenizer(val_answers, padding="max_length", truncation=True, max_length=max_length, return_tensors="tf")


def _mask_padded_labels(label_encodings):
    """Return the target token ids with padding positions replaced by -100.

    The loss built into Hugging Face seq2seq models ignores positions whose
    label is -100, so padded positions stop contributing to the loss.

    Args:
        label_encodings: tokenizer output holding "input_ids" and
            "attention_mask" tensors of identical shape.

    Returns:
        A tensor shaped like "input_ids" with -100 wherever the attention
        mask is not 1.
    """
    target_ids = label_encodings["input_ids"]
    is_real_token = tf.equal(label_encodings["attention_mask"], 1)
    ignore_index = tf.constant(-100, dtype=target_ids.dtype)
    return tf.where(is_real_token, target_ids, ignore_index)


# Prepare labels
# Tokenized T5 targets do not begin with a start token, so slicing them into
# [:, :-1] decoder inputs and [:, 1:] labels would mean the first answer token
# is never predicted during training, and training would not match
# generation, which starts decoding from the decoder start token.
# Instead the full target sequence is passed as labels; when only labels are
# supplied, the model derives decoder_input_ids itself by shifting them right.
# No decoder attention mask is needed: the targets are right-padded, the
# decoder is causal, and padded positions are excluded from the loss.
train_labels = _mask_padded_labels(train_labels_encodings)
val_labels = _mask_padded_labels(val_labels_encodings)

# Create datasets
batch_size = 8
train_dataset = tf.data.Dataset.from_tensor_slices((
    {
        "input_ids": train_encodings["input_ids"],
        "attention_mask": train_encodings["attention_mask"],
    },
    train_labels
)).batch(batch_size).cache().prefetch(tf.data.AUTOTUNE)

val_dataset = tf.data.Dataset.from_tensor_slices((
    {
        "input_ids": val_encodings["input_ids"],
        "attention_mask": val_encodings["attention_mask"],
    },
    val_labels
)).batch(batch_size).cache().prefetch(tf.data.AUTOTUNE)


In [None]:

# Load the pretrained t5-small checkpoint and fine-tune it for one epoch.
# No loss is passed to compile(); presumably the model's built-in loss is
# used — confirm against the installed transformers version.
model = TFT5ForConditionalGeneration.from_pretrained("t5-small")
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
model.compile(optimizer=optimizer)

start_time = time.time()
model.fit(train_dataset, validation_data=val_dataset, epochs=1, verbose=1)
elapsed_minutes = (time.time() - start_time) / 60
print(f"Training time: {elapsed_minutes:.2f} minutes")


In [None]:

# Persist the fine-tuned model so it can be reloaded later
model_dir = "/content/fine_tuned_t5_model"
model.save_pretrained(model_dir)

# Generate answer
def generate_answer(question, model, tokenizer, max_input_length=512, max_output_length=100, num_beams=4):
    """Generate an answer for a single question using beam search.

    The function no longer reads the notebook-level ``max_length`` global, so
    it can be used on its own (for example right after reloading a saved
    model in a fresh session).

    Args:
        question: the question text to answer.
        model: object exposing ``generate(input_ids=..., attention_mask=...,
            max_length=..., num_beams=..., early_stopping=...)``.
        tokenizer: callable tokenizer that returns "input_ids" and
            "attention_mask" and provides ``decode``.
        max_input_length: questions longer than this many tokens are truncated.
        max_output_length: value passed to ``generate`` as ``max_length``.
        num_beams: number of beams passed to ``generate``.

    Returns:
        The first generated sequence decoded to text, special tokens removed.
    """
    # A single question needs no padding; only truncation is applied.
    inputs = tokenizer(
        question,
        truncation=True,
        max_length=max_input_length,
        return_tensors="tf",
    )
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_output_length,
        num_beams=num_beams,
        early_stopping=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Reload the saved model from disk and run one sample question through it
loaded_model = TFT5ForConditionalGeneration.from_pretrained("/content/fine_tuned_t5_model")
sample_question = 'How do I change my password?'
predicted_answer = generate_answer(sample_question, loaded_model, tokenizer)
print(f"Predicted Answer: {predicted_answer}")