In [1]:
#!pip install transformers datasets torch tensorflow scikit-learn gradio pandas numpy



# Load the data

In [3]:

import pandas as pd

# Load dataset
df = pd.read_csv("data/train.csv", names=["Question", "Answer"])




In [4]:
# Display dataset sample
print(df.head())
print(df.tail())

                                                          Question  \
qtype                                                     Question   
susceptibility   Who is at risk for Lymphocytic Choriomeningiti...   
symptoms         What are the symptoms of Lymphocytic Choriomen...   
susceptibility   Who is at risk for Lymphocytic Choriomeningiti...   
exams and tests  How to diagnose Lymphocytic Choriomeningitis (...   

                                                            Answer  
qtype                                                       Answer  
susceptibility   LCMV infections can occur after exposure to fr...  
symptoms         LCMV is most commonly recognized as causing ne...  
susceptibility   Individuals of all ages who come into contact ...  
exams and tests  During the first phase of the disease, the mos...  
                                                      Question  \
symptoms     What are the symptoms of Familial visceral myo...   
information              What is 

In [5]:
import numpy as np
import pandas as pd
import re
import random
import transformers
import matplotlib.pyplot as plt
import json
import pickle
import torch

import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Activation,Dropout
from tensorflow.keras.optimizers import SGD
from tensorflow.keras import metrics
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import load_model

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
print(df.columns)




Index(['Question', 'Answer'], dtype='object')


In [7]:
# Display the first few rows to understand the structure
print("First few rows:")
print(df.head())

# Check the structure of the DataFrame
print("\nDataFrame Info:")
print(df.info())

# Get descriptive statistics for a quick numerical overview
print("\nDescriptive Statistics:")
print(df.describe())

# Check for missing values in each column
print("\nMissing Values:")
print(df.isnull().sum())


First few rows:
                                                          Question  \
qtype                                                     Question   
susceptibility   Who is at risk for Lymphocytic Choriomeningiti...   
symptoms         What are the symptoms of Lymphocytic Choriomen...   
susceptibility   Who is at risk for Lymphocytic Choriomeningiti...   
exams and tests  How to diagnose Lymphocytic Choriomeningitis (...   

                                                            Answer  
qtype                                                       Answer  
susceptibility   LCMV infections can occur after exposure to fr...  
symptoms         LCMV is most commonly recognized as causing ne...  
susceptibility   Individuals of all ages who come into contact ...  
exams and tests  During the first phase of the disease, the mos...  

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 16408 entries, qtype to information
Data columns (total 2 columns):
 #   Column    Non-

# Preprocess the data 

In [9]:
from transformers import AutoTokenizer

# Define the model name
MODEL_NAME = "t5-small"  

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenization function for sequence-to-sequence training
def tokenize_function(row):
    # Tokenize the question (input)
    model_inputs = tokenizer(row["Question"],
                               padding="max_length",
                               truncation=True,
                               max_length=512)
    # Tokenize the answer (target) separately
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(row["Answer"],
                           padding="max_length",
                           truncation=True,
                           max_length=512)
    # Assign labels (target token ids) to the input dictionary
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply tokenization to each row of the DataFrame
# This produces a Series of dictionaries
tokenized_data = df.apply(tokenize_function, axis=1)
tokenized_examples = tokenized_data.tolist()  # Convert to a list if needed

#  Print one example to inspect
print(tokenized_examples[0])





{'input_ids': [11860, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [44]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Splitting into training and validation sets (80% train, 20% validation)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["Question"], df["Answer"], test_size=0.2, random_state=42
)

# Convert text to tokenized input IDs
train_encodings = tokenizer(list(train_texts), list(train_labels), truncation=True, padding=True, return_tensors="tf")
val_encodings = tokenizer(list(val_texts), list(val_labels), truncation=True, padding=True, return_tensors="tf")

# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((
    {"input_ids": train_encodings["input_ids"], "attention_mask": train_encodings["attention_mask"]},
    train_encodings["input_ids"]
))

val_dataset = tf.data.Dataset.from_tensor_slices((
    {"input_ids": val_encodings["input_ids"], "attention_mask": val_encodings["attention_mask"]},
    val_encodings["input_ids"]
))

# Batch and shuffle
BATCH_SIZE = 8
train_dataset = train_dataset.shuffle(len(train_texts)).batch(BATCH_SIZE)
val_dataset = val_dataset.batch(BATCH_SIZE)

In [46]:
import tensorflow as tf
import tensorflow.keras as keras
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

# Define optimizer and loss function for T5
model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

# Train the model
EPOCHS = 5
model.fit(train_dataset, epochs=EPOCHS)

print("Training completed successfully!")


Epoch 1/5


ValueError: in user code:

    File "C:\Users\HP\anaconda3\Lib\site-packages\tf_keras\src\engine\training.py", line 1398, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\HP\anaconda3\Lib\site-packages\tf_keras\src\engine\training.py", line 1381, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\HP\anaconda3\Lib\site-packages\tf_keras\src\engine\training.py", line 1370, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\HP\anaconda3\Lib\site-packages\transformers\modeling_tf_utils.py", line 1672, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\HP\anaconda3\Lib\site-packages\tf_keras\src\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\HP\AppData\Local\Temp\__autograph_generated_file9b0oo3yi.py", line 40, in tf__run_call_with_unpacked_inputs
        raise
    File "C:\Users\HP\AppData\Local\Temp\__autograph_generated_filetlosrciy.py", line 91, in tf__call
        decoder_outputs = ag__.converted_call(ag__.ld(self).decoder, (ag__.ld(decoder_input_ids),), dict(attention_mask=ag__.ld(decoder_attention_mask), encoder_hidden_states=ag__.ld(hidden_states), encoder_attention_mask=ag__.ld(attention_mask), inputs_embeds=ag__.ld(decoder_inputs_embeds), head_mask=ag__.ld(decoder_head_mask), past_key_values=ag__.ld(past_key_values), use_cache=ag__.ld(use_cache), output_attentions=ag__.ld(output_attentions), output_hidden_states=ag__.ld(output_hidden_states), return_dict=ag__.ld(return_dict), training=ag__.ld(training)), fscope)
    File "C:\Users\HP\AppData\Local\Temp\__autograph_generated_file9b0oo3yi.py", line 40, in tf__run_call_with_unpacked_inputs
        raise
    File "C:\Users\HP\AppData\Local\Temp\__autograph_generated_file_ym6xoym.py", line 65, in tf__call
        ag__.if_stmt(ag__.and_(lambda: ag__.ld(input_ids) is not None, lambda: ag__.ld(inputs_embeds) is not None), if_body_2, else_body_2, get_state_2, set_state_2, ('input_ids', 'input_shape'), 2)
    File "C:\Users\HP\AppData\Local\Temp\__autograph_generated_file_ym6xoym.py", line 62, in else_body_2
        ag__.if_stmt(ag__.ld(input_ids) is not None, if_body_1, else_body_1, get_state_1, set_state_1, ('input_ids', 'input_shape'), 2)
    File "C:\Users\HP\AppData\Local\Temp\__autograph_generated_file_ym6xoym.py", line 59, in else_body_1
        ag__.if_stmt(ag__.ld(inputs_embeds) is not None, if_body, else_body, get_state, set_state, ('input_shape',), 1)
    File "C:\Users\HP\AppData\Local\Temp\__autograph_generated_file_ym6xoym.py", line 56, in else_body
        raise ag__.converted_call(ag__.ld(ValueError), (f'You have to specify either {ag__.ld(err_msg_prefix)}input_ids or {ag__.ld(err_msg_prefix)}inputs_embeds',), None, fscope)

    ValueError: Exception encountered when calling layer 'tft5_for_conditional_generation_4' (type TFT5ForConditionalGeneration).
    
    in user code:
    
        File "C:\Users\HP\anaconda3\Lib\site-packages\transformers\modeling_tf_utils.py", line 1395, in run_call_with_unpacked_inputs  *
            return func(self, **unpacked_inputs)
        File "C:\Users\HP\anaconda3\Lib\site-packages\transformers\models\t5\modeling_tf_t5.py", line 1455, in call  *
            decoder_outputs = self.decoder(
        File "C:\Users\HP\anaconda3\Lib\site-packages\tf_keras\src\utils\traceback_utils.py", line 70, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
        File "C:\Users\HP\AppData\Local\Temp\__autograph_generated_file9b0oo3yi.py", line 40, in tf__run_call_with_unpacked_inputs
            raise
        File "C:\Users\HP\AppData\Local\Temp\__autograph_generated_file_ym6xoym.py", line 65, in tf__call
            ag__.if_stmt(ag__.and_(lambda: ag__.ld(input_ids) is not None, lambda: ag__.ld(inputs_embeds) is not None), if_body_2, else_body_2, get_state_2, set_state_2, ('input_ids', 'input_shape'), 2)
        File "C:\Users\HP\AppData\Local\Temp\__autograph_generated_file_ym6xoym.py", line 62, in else_body_2
            ag__.if_stmt(ag__.ld(input_ids) is not None, if_body_1, else_body_1, get_state_1, set_state_1, ('input_ids', 'input_shape'), 2)
        File "C:\Users\HP\AppData\Local\Temp\__autograph_generated_file_ym6xoym.py", line 59, in else_body_1
            ag__.if_stmt(ag__.ld(inputs_embeds) is not None, if_body, else_body, get_state, set_state, ('input_shape',), 1)
        File "C:\Users\HP\AppData\Local\Temp\__autograph_generated_file_ym6xoym.py", line 56, in else_body
            raise ag__.converted_call(ag__.ld(ValueError), (f'You have to specify either {ag__.ld(err_msg_prefix)}input_ids or {ag__.ld(err_msg_prefix)}inputs_embeds',), None, fscope)
    
        ValueError: Exception encountered when calling layer 'decoder' (type TFT5MainLayer).
        
        in user code:
        
            File "C:\Users\HP\anaconda3\Lib\site-packages\transformers\modeling_tf_utils.py", line 1395, in run_call_with_unpacked_inputs  *
                return func(self, **unpacked_inputs)
            File "C:\Users\HP\anaconda3\Lib\site-packages\transformers\models\t5\modeling_tf_t5.py", line 754, in call  *
                raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds")
        
            ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds
        
        
        Call arguments received by layer 'decoder' (type TFT5MainLayer):
          • input_ids=None
          • attention_mask=None
          • encoder_hidden_states=tf.Tensor(shape=(None, 512, 512), dtype=float32)
          • encoder_attention_mask=tf.Tensor(shape=(None, 512), dtype=int32)
          • inputs_embeds=None
          • head_mask=None
          • encoder_head_mask=None
          • past_key_values=None
          • use_cache=True
          • output_attentions=False
          • output_hidden_states=False
          • return_dict=True
          • training=True
    
    
    Call arguments received by layer 'tft5_for_conditional_generation_4' (type TFT5ForConditionalGeneration):
      • input_ids={'input_ids': 'tf.Tensor(shape=(None, 512), dtype=int32)', 'attention_mask': 'tf.Tensor(shape=(None, 512), dtype=int32)'}
      • attention_mask=None
      • decoder_input_ids=None
      • decoder_attention_mask=None
      • head_mask=None
      • decoder_head_mask=None
      • encoder_outputs=None
      • past_key_values=None
      • inputs_embeds=None
      • decoder_inputs_embeds=None
      • labels=None
      • use_cache=None
      • output_attentions=None
      • output_hidden_states=None
      • return_dict=None
      • training=True
