<a href="https://colab.research.google.com/github/SajjadRahati1/Index/blob/main/RQA_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Retrieval Question-Answering of NLP

# Load Data

## Load from JSON files

In [None]:
# NOTE(review): `test3` is not referenced by any other cell visible in this
# notebook; it looks like leftover scratch — confirm and remove.
test3 = 'x'

In [None]:
# connect to my drive for use dataset file
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# load files dataset
import json

# Read both SQuAD v2.0 splits from the mounted Drive folder.
# encoding='utf-8' is passed explicitly: the contexts contain non-ASCII text
# (e.g. "Beyoncé") and open() otherwise falls back to the platform locale.
with open('/content/drive/MyDrive/Dataset/RQA/SQuAD/dev-v2.0.json', 'r', encoding='utf-8') as file:
    data_dev = json.load(file)

with open('/content/drive/MyDrive/Dataset/RQA/SQuAD/train-v2.0.json', 'r', encoding='utf-8') as file:
    data_train = json.load(file)

## Extract Relevant Data

In [None]:
import pandas as pd
# a function for get data and convert to a DataFrame
def convert_to_Df(data:dict, include_impossible:bool=False):
  """Flatten a SQuAD-style JSON payload into one DataFrame row per answer.

  Args:
    data: Parsed SQuAD JSON. Must contain a top-level 'data' list whose items
      hold 'paragraphs'; each paragraph has a 'context' string and a 'qas' list.
    include_impossible: When True, a question whose 'answers' list is empty
      (an unanswerable SQuAD v2.0 question) is kept as a single row with
      answer_text '' and answer_start -1. The default (False) keeps the
      previous behaviour of emitting rows only for listed answers.

  Returns:
    DataFrame with the columns question, context, answer_text, answer_start
    and is_impossible (in that order), also when no row was produced.

  Raises:
    KeyError: if a required key ('data', 'paragraphs', 'context', 'qas',
      'question', 'answers', 'text', 'answer_start') is missing.
  """
  columns = ['question', 'context', 'answer_text', 'answer_start', 'is_impossible']
  rows = []

  # Walk article -> paragraph -> question -> answer
  for article in data['data']:
    for parag in article['paragraphs']:
      context = parag['context']

      for qa in parag['qas']:
        question = qa['question']
        # SQuAD v1.1 files carry no 'is_impossible' flag; treat them as answerable
        is_impossible = qa.get('is_impossible', False)
        answers = qa['answers']

        for answer in answers:
          rows.append((question, context, answer['text'], answer['answer_start'], is_impossible))

        if include_impossible and not answers:
          # Placeholder row so unanswerable questions are not silently lost
          rows.append((question, context, '', -1, is_impossible))

  # Build the frame once from the collected records
  return pd.DataFrame(rows, columns=columns)

In [None]:
df_dev = convert_to_Df(data_dev)
df_train = convert_to_Df(data_train)

In [None]:
df_train

Unnamed: 0,question,context,answer_text,answer_start,is_impossible
0,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,in the late 1990s,269,False
1,What areas did Beyonce compete in when she was...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,singing and dancing,207,False
2,When did Beyonce leave Destiny's Child and bec...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,2003,526,False
3,In what city and state did Beyonce grow up?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,"Houston, Texas",166,False
4,In which decade did Beyonce become famous?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,late 1990s,276,False
...,...,...,...,...,...
86816,In what US state did Kathmandu first establish...,"Kathmandu Metropolitan City (KMC), in order to...",Oregon,229,False
86817,What was Yangon previously known as?,"Kathmandu Metropolitan City (KMC), in order to...",Rangoon,414,False
86818,With what Belorussian city does Kathmandu have...,"Kathmandu Metropolitan City (KMC), in order to...",Minsk,476,False
86819,In what year did Kathmandu create its initial ...,"Kathmandu Metropolitan City (KMC), in order to...",1975,199,False


## Preprocess the Text

In [None]:
import nltk
from nltk.tokenize import word_tokenize
# word_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK releases
# load it from the 'punkt_tab' resource instead of the pickled 'punkt' one,
# so both are fetched to keep the notebook working on old and new versions.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Initialize the tokenizer
tokenizer = Tokenizer()

# Create function for tokenize dataFrames
def preprocess_df(df:pd.DataFrame):
  """Tokenise, lowercase and integer-encode the text columns of `df` in place.

  Adds `question_tokens`, `context_tokens` and `answer_tokens` (lowercased
  NLTK word tokens), fits the module-level Keras `tokenizer` on this frame's
  context + question tokens, then adds `context_seq` and `question_seq`
  (the token ids produced by that tokenizer).

  Returns:
    The same DataFrame object that was passed in.
  """
  def lowered_tokens(text):
    # word_tokenize first, lowercase afterwards
    return [tok.lower() for tok in word_tokenize(text)]

  token_columns = (
      ('question', 'question_tokens'),
      ('context', 'context_tokens'),
      ('answer_text', 'answer_tokens'),
  )
  for source, target in token_columns:
    df[target] = df[source].apply(lowered_tokens)

  # Series + Series joins the two token lists row by row
  tokenizer.fit_on_texts(df['context_tokens'] + df['question_tokens'])

  # Map tokens to vocabulary ids
  for prefix in ('context', 'question'):
    df[f'{prefix}_seq'] = tokenizer.texts_to_sequences(df[f'{prefix}_tokens'])

  return df

def check_context_length(df_train,df_test):
  """Pad the id sequences of both frames to common lengths (in place).

  The longest `context_seq` and the longest `question_seq` across the two
  frames set the target lengths. Each frame gets `context_seq_padded` and
  `question_seq_padded` columns, padded at the end of the sequence.

  Returns:
    The tuple (df_train, df_test), the same objects that were passed in.
  """
  frames = (df_train, df_test)
  seq_columns = ('context_seq', 'question_seq')

  # Longest sequence per column, taken over both frames
  target_len = {
      col: max(max(map(len, frame[col])) for frame in frames)
      for col in seq_columns
  }

  for col in seq_columns:
    for frame in frames:
      padded = pad_sequences(frame[col], maxlen=target_len[col], padding='post')
      frame[f'{col}_padded'] = padded.tolist()

  return df_train, df_test

In [None]:
df_dev = preprocess_df(df_dev)
df_train = preprocess_df(df_train)

# preprocess_df() fits the shared `tokenizer` on every call, and each fit
# rebuilds `word_index` from the accumulated word counts. The ids stored for
# the first frame were therefore produced with a vocabulary that changed
# once the second frame was fitted. Re-encode both frames with the final
# vocabulary so that train and dev share one consistent id mapping.
for frame in (df_train, df_dev):
  frame['context_seq'] = tokenizer.texts_to_sequences(frame['context_tokens'])
  frame['question_seq'] = tokenizer.texts_to_sequences(frame['question_tokens'])

df_train, df_dev = check_context_length(df_train, df_dev)

# Create model with LSTM

## Prepare the Data for the Model

In [None]:
import numpy as np
def prepare_data(df):
  """Turn the padded sequence columns and the answer start into NumPy arrays.

  Returns:
    Tuple (X_context, X_question, y_start) built from the columns
    `context_seq_padded`, `question_seq_padded` and `answer_start`.
  """
  def column_as_array(name):
    return np.array(df[name].tolist())

  return (
      column_as_array('context_seq_padded'),
      column_as_array('question_seq_padded'),
      column_as_array('answer_start'),
  )

In [None]:
X_context_train, X_question_train, y_start_train = prepare_data(df_train)
X_context_val, X_question_val, y_start_val = prepare_data(df_dev)

## Define the LSTM Model

In [None]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate
from tensorflow.keras.models import Model

In [None]:
# Hyperparameters
embedding_dim, lstm_units = 128, 64
# Every row was padded to a common length, so the padded length of the first
# row is the maximum context / question length.
max_context_length, max_question_length = (
    len(df_train[col][0]) for col in ('context_seq_padded', 'question_seq_padded')
)

In [None]:
max_context_length,max_question_length

(766, 60)

In [None]:
# Input layers
context_input = Input(shape=(max_context_length,), name='context_input')
question_input = Input(shape=(max_question_length,), name='question_input')

In [None]:
# Embedding layers
# A single Embedding instance is applied to both inputs, so the context and
# the question share the same embedding weights.
# The "+ 1" presumably reserves id 0 for the padding value written by
# pad_sequences — confirm.
# NOTE(review): mask_zero is not set, so padded positions are passed on to
# the downstream layers like real tokens.
embedding = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim)

context_embedding = embedding(context_input)
question_embedding = embedding(question_input)

In [None]:
# LSTM layers
# Two independent LSTM encoders (separate weights); return_sequences=False
# keeps only the last output of each as a fixed-size vector.
# NOTE(review): the inputs are post-padded and unmasked, so the last output
# is taken after the trailing padding has been processed — confirm intended.
context_lstm = LSTM(lstm_units, return_sequences=False)(context_embedding)
question_lstm = LSTM(lstm_units, return_sequences=False)(question_embedding)

In [None]:
# Concatenate the outputs of the LSTM layers
merged = Concatenate()([context_lstm, question_lstm])

In [None]:
# Dense layers for prediction
# The head ends in one linear unit, i.e. the answer start is regressed as a
# single number. The training target is the `answer_start` column, which
# appears to be a character offset into the raw context (not a token index)
# — confirm against the dataset.
dense = Dense(128, activation='relu')(merged)
output = Dense(1, activation='linear')(dense)  # Predicting the start index of the answer

In [None]:
# Define the model
# NOTE(review): the output is a single linear unit trained with MSE (a
# regression), so the 'accuracy' metric is not a meaningful measure here;
# a regression metric such as 'mae' would be more informative.
model = Model(inputs=[context_input, question_input], outputs=output)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

In [None]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 context_input (InputLayer)  [(None, 766)]                0         []                            
                                                                                                  
 question_input (InputLayer  [(None, 60)]                 0         []                            
 )                                                                                                
                                                                                                  
 embedding (Embedding)       multiple                     1360179   ['context_input[0][0]',       
                                                          2          'question_input[0][0]']      
                                                                                              

## Train the Model

In [None]:
# Train the model
# Fits on the training arrays and reports loss/metrics on the dev arrays
# after every epoch.
history = model.fit(
    [X_context_train, X_question_train],
    y_start_train,
    validation_data=([X_context_val, X_question_val], y_start_val),
    epochs=5,
    batch_size=32
)
# I ran this once; not training again for now, moving on to the next step.

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

## Evaluate the Model

In [None]:
lstm_model = model
lstm_history = history

In [None]:
# Evaluate the model
loss, accuracy = model.evaluate([X_context_val, X_question_val], y_start_val)
print(f'Validation Loss: {loss}')
print(f'Validation Accuracy: {accuracy}')

Validation Loss: 86175.734375
Validation Accuracy: 0.0


In [None]:
import pickle
base_address_file = '/content/drive/MyDrive/Dataset/RQA/SQuAD/'

# Persist the trained model (HDF5 file) next to the dataset
model.save(f'{base_address_file}qa_model.h5')

# Persist the per-epoch metrics dict collected by fit()
with open(f'{base_address_file}training_history.pkl', mode='wb') as file:
    pickle.dump(history.history, file)

  saving_api.save_model(


In [None]:
import pickle
base_address_file = '/content/drive/MyDrive/Dataset/RQA/SQuAD/'

In [None]:
from google.colab import files

# # Download the model file
# files.download('qa_model.h5')

# # Download the training history file
# files.download('training_history.pkl')

# Create model with Transformer

## Install Required Libraries

In [None]:
!pip install transformers
!pip install torch

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

## Prepare Data for Transformer

In [None]:
df_train['context'][0]

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'

## Define the Transformer Model

In [None]:
# Import necessary libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, MultiHeadAttention, Dense, LayerNormalization, Dropout, Concatenate
from tensorflow.keras.models import Model

In [None]:
embedding_dim = 128
num_heads = 8
ff_dim = 128
dropout_rate = 0.1

## Create Transformer block

In [None]:
# Transformer block
def transformer_block(inputs, num_heads, ff_dim, dropout_rate):
    """Apply one Transformer encoder block (self-attention + feed-forward).

    Each of the two sub-layers is wrapped as
    ``LayerNormalization(x + Dropout(sublayer(x)))``. The residual additions
    were missing before, so the block discarded its own input.

    Args:
        inputs: Symbolic Keras tensor; presumably (batch, sequence, features),
            which is what the Embedding outputs passed in by this notebook
            look like — confirm for other callers.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        dropout_rate: Dropout rate applied after each sub-layer.

    Returns:
        Symbolic tensor whose last dimension equals that of `inputs`.
    """
    # Take the feature width from the tensor itself instead of the global
    # `embedding_dim`, so the residual additions always line up.
    model_dim = inputs.shape[-1]

    # Multi-head self-attention (query and value are both `inputs`)
    attention_output = MultiHeadAttention(
        num_heads=num_heads, key_dim=model_dim
    )(inputs, inputs)
    attention_output = Dropout(dropout_rate)(attention_output)
    attention_output = LayerNormalization(epsilon=1e-6)(
        tf.keras.layers.Add()([inputs, attention_output])
    )

    # Position-wise feed-forward network, projected back to model_dim
    ff_output = Dense(ff_dim, activation='relu')(attention_output)
    ff_output = Dense(model_dim)(ff_output)
    ff_output = Dropout(dropout_rate)(ff_output)
    ff_output = LayerNormalization(epsilon=1e-6)(
        tf.keras.layers.Add()([attention_output, ff_output])
    )

    return ff_output

In [None]:
# Apply Transformer block to both context and question embeddings.
# A fresh Embedding layer is created for this model: feeding it
# `context_embedding` / `question_embedding` would attach it to the embedding
# layer of the LSTM model, and training the Transformer would then overwrite
# the weights the already-trained LSTM model relies on.
transformer_embedding = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim)

context_transformer = transformer_block(transformer_embedding(context_input), num_heads, ff_dim, dropout_rate)
question_transformer = transformer_block(transformer_embedding(question_input), num_heads, ff_dim, dropout_rate)

In [None]:
# Reduce the dimensionality by taking the mean of all tokens (pooling).
# A pooling layer is used instead of a raw tf.reduce_mean call so the step is
# a regular Keras layer of the functional model (it shows up in the summary
# and is saved with the model); it averages over the sequence axis.
context_pooled = tf.keras.layers.GlobalAveragePooling1D()(context_transformer)
question_pooled = tf.keras.layers.GlobalAveragePooling1D()(question_transformer)

In [None]:
# Concatenate the outputs of the Transformer blocks
merged = Concatenate()([context_pooled, question_pooled])

# Dense layers for prediction
# dense = Dense(128, activation='relu')(merged)
dense = Dense(64, activation='relu')(merged)
output = Dense(1, activation='linear')(dense)  # Predicting the start index of the answer


In [None]:
# Define the model
# Reuses the `context_input` / `question_input` Input layers created for the
# LSTM model.
# NOTE(review): the output is a single linear unit trained with MSE (a
# regression), so the 'accuracy' metric is not a meaningful measure here.
model_transformer = Model(inputs=[context_input, question_input], outputs=output)
model_transformer.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

In [None]:
model_transformer.summary()

## Train the Model

In [None]:

# Train the model
history_transformer = model_transformer.fit(
    [X_context_train, X_question_train],
    y_start_train,
    validation_data=([X_context_val, X_question_val], y_start_val),
    epochs=5,
    batch_size=32
)

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 context_input (InputLayer)  [(None, 766)]                0         []                            
                                                                                                  
 question_input (InputLayer  [(None, 60)]                 0         []                            
 )                                                                                                
                                                                                                  
 embedding (Embedding)       multiple                     1360179   ['context_input[0][0]',       
                                                          2          'question_input[0][0]']      
                                                                                            

## Save the Model

In [None]:
# Save the entire model to a file.
# `model_transformer` is a plain Keras Model, which is saved with .save();
# save_pretrained() belongs to Hugging Face models and does not exist here.
model_transformer.save(base_address_file + 'qa_transformer_model.h5')
# Save the history to a file
with open(base_address_file + 'training_transformer_history.pkl', 'wb') as file:
    pickle.dump(history_transformer.history, file)

# بررسی دو مدل با معیار های ارزیابی (Comparing the Two Models with Evaluation Metrics)

## Define Evaluation Functions

In [None]:
from sklearn.metrics import f1_score, accuracy_score
import numpy as np

# Function to compute the Exact Match (EM) score
def exact_match(pred, true):
    """Return 1 when the prediction equals the reference exactly, otherwise 0."""
    return 1 if pred == true else 0

# Function to compute the F1 score
def f1_score_metric(pred, true):
    """Token-level F1 between a predicted and a reference answer string.

    Both strings are lowercased and split on whitespace. The overlap is
    counted as a multiset, so a repeated token is only credited as often as
    it occurs in both answers.

    Args:
        pred: Predicted answer text.
        true: Reference answer text.

    Returns:
        F1 in [0.0, 1.0]. When either answer has no tokens, the score is 1.0
        if both are empty and 0.0 otherwise.
    """
    from collections import Counter

    # Plain string tokenisation: the module-level Keras Tokenizer has no
    # .tokenize() method, so the previous implementation failed when called.
    pred_tokens = pred.lower().split()
    true_tokens = true.lower().split()

    # Empty answers: agreement counts as a perfect match, disagreement as 0
    if not pred_tokens or not true_tokens:
        return float(pred_tokens == true_tokens)

    overlap = sum((Counter(pred_tokens) & Counter(true_tokens)).values())
    if overlap == 0:
        return 0.0

    precision = overlap / len(pred_tokens)
    recall = overlap / len(true_tokens)
    return 2 * (precision * recall) / (precision + recall)

# Function to evaluate the model
def evaluate_model(model, df_dev, tokenizer):
    """Score `model` on every row of `df_dev` with exact match and token F1.

    Args:
        model: Object whose `predict` accepts
            `[input_ids, attention_mask, token_type_ids]`.
        df_dev: DataFrame with `context`, `question` and `answer_text` columns.
        tokenizer: Callable tokenizer that returns `input_ids`,
            `attention_mask` and `token_type_ids` and offers `decode`.
            This matches a Hugging Face-style tokenizer, presumably not the
            module-level Keras Tokenizer — TODO confirm.

    Returns:
        Tuple (em_score, f1_score_avg): the means of the per-row scores.
    """
    exact_matches = []
    f1_scores = []

    # One tokenizer call and one predict() call per row
    for i, row in df_dev.iterrows():
        context = row['context']
        question = row['question']
        true_answer = row['answer_text']

        # Tokenize inputs
        # Context is passed first; 'only_first' truncates only the context
        inputs = tokenizer(
            context,
            question,
            truncation='only_first',
            padding='max_length',
            max_length=512,
            return_tensors='tf'
        )

        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        # Predict the start position of the answer
        # Only element [0] of the prediction is used, as the start scores
        start_logits = model.predict([input_ids, attention_mask, token_type_ids])[0]
        start_index = np.argmax(start_logits)

        # Decode the predicted answer
        # The span length is fixed at 10 token ids; no end position is predicted
        pred_answer = tokenizer.decode(input_ids[0][start_index:start_index+10], skip_special_tokens=True)

        # Calculate metrics
        exact_matches.append(exact_match(pred_answer, true_answer))
        f1_scores.append(f1_score_metric(pred_answer, true_answer))

    em_score = np.mean(exact_matches)
    f1_score_avg = np.mean(f1_scores)

    return em_score, f1_score_avg


## Evaluate the LSTM Model

In [None]:
# Convert dev data for LSTM model
X_context_dev = np.array(df_dev['context_seq_padded'].tolist())
X_question_dev = np.array(df_dev['question_seq_padded'].tolist())
y_start_dev = np.array(df_dev['answer_start'].tolist())

# Predict the start positions.
# The trained LSTM model is stored under the name `lstm_model`. It has one
# linear output unit, so predict() yields one number per row: flatten it
# (np.argmax over a single value would always be 0).
start_preds = np.nan_to_num(lstm_model.predict([X_context_dev, X_question_dev]).reshape(-1))

# Calculate evaluation metrics
exact_matches_lstm = []
f1_scores_lstm = []

for i, pred in enumerate(start_preds):
    row = df_dev.iloc[i]
    true_answer = row['answer_text']
    context = row['context']

    # The regression target is `answer_start`, a character offset into the raw
    # context, so the rounded prediction (clamped to a valid position) slices
    # the context string rather than the token list. As before, the span
    # length is borrowed from the reference answer.
    start_index_pred = int(np.clip(np.rint(pred), 0, len(context)))
    pred_answer = context[start_index_pred:start_index_pred + len(true_answer)]

    exact_matches_lstm.append(exact_match(pred_answer, true_answer))
    f1_scores_lstm.append(f1_score_metric(pred_answer, true_answer))

em_score_lstm = np.mean(exact_matches_lstm)
f1_score_avg_lstm = np.mean(f1_scores_lstm)

print(f'LSTM Model - Exact Match (EM) Score: {em_score_lstm}')
print(f'LSTM Model - F1 Score: {f1_score_avg_lstm}')


## Evaluate the Transformer Model

In [None]:
# Convert dev data for Transformer model
X_context_dev = np.array(df_dev['context_seq_padded'].tolist())
X_question_dev = np.array(df_dev['question_seq_padded'].tolist())
y_start_dev = np.array(df_dev['answer_start'].tolist())

# Predict the start positions.
# The model has one linear output unit, so predict() yields one number per
# row: flatten it (np.argmax over a single value would always be 0).
start_preds = np.nan_to_num(model_transformer.predict([X_context_dev, X_question_dev]).reshape(-1))

# Calculate evaluation metrics
exact_matches_transformer = []
f1_scores_transformer = []

for i, pred in enumerate(start_preds):
    row = df_dev.iloc[i]
    true_answer = row['answer_text']
    context = row['context']

    # The regression target is `answer_start`, a character offset into the raw
    # context, so the rounded prediction (clamped to a valid position) slices
    # the context string rather than the token list. As before, the span
    # length is borrowed from the reference answer.
    start_index_pred = int(np.clip(np.rint(pred), 0, len(context)))
    pred_answer = context[start_index_pred:start_index_pred + len(true_answer)]

    exact_matches_transformer.append(exact_match(pred_answer, true_answer))
    f1_scores_transformer.append(f1_score_metric(pred_answer, true_answer))

em_score_transformer = np.mean(exact_matches_transformer)
f1_score_avg_transformer = np.mean(f1_scores_transformer)

print(f'Transformer Model - Exact Match (EM) Score: {em_score_transformer}')
print(f'Transformer Model - F1 Score: {f1_score_avg_transformer}')

In [None]:
# # Evaluate BERT model
# em_score_bert, f1_score_avg_bert = evaluate_model(model_bert, df_dev, tokenizer)

# print(f'BERT Model - Exact Match (EM) Score: {em_score_bert}')
# print(f'BERT Model - F1 Score: {f1_score_avg_bert}')

## Plot the Results with Matplotlib

In [None]:
import matplotlib.pyplot as plt

# Scores to compare: one entry per metric, one list per model
metrics = ['Exact Match (EM)', 'F1 Score']
lstm_scores = [em_score_lstm, f1_score_avg_lstm]
transformer_scores = [em_score_transformer, f1_score_avg_transformer]

x = np.arange(len(metrics))  # centre of each bar group on the x-axis
width = 0.35                 # width of a single bar

fig, ax = plt.subplots()

# Two bars per metric around each tick: LSTM on the left, Transformer on the right
bars1 = ax.bar(x - width/2, lstm_scores, width, label='LSTM Model')
bars2 = ax.bar(x + width/2, transformer_scores, width, label='Transformer Model')

# Axis labels, title, tick labels and legend
ax.set(
    xlabel='Metrics',
    ylabel='Scores',
    title='Performance Comparison of LSTM and Transformer Models',
)
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()

def autolabel(bars):
    """Write each bar's height, rounded to two decimals, just above the bar."""
    for bar in bars:
        height = bar.get_height()
        centre_x = bar.get_x() + bar.get_width() / 2
        ax.annotate(
            f'{round(height, 2)}',
            xy=(centre_x, height),
            xytext=(0, 3),  # shift the text 3 points upwards
            textcoords="offset points",
            ha='center',
            va='bottom',
        )

autolabel(bars1)
autolabel(bars2)

fig.tight_layout()

# Display the plot
plt.show()
