# Installing necessary libraries

In [1]:
pip install simpletransformers

Defaulting to user installation because normal site-packages is not writeable
Collecting simpletransformers
  Obtaining dependency information for simpletransformers from https://files.pythonhosted.org/packages/e5/85/1c49e063939c70b70e615ee003ef09a8ac82030303a4f1397d0be6590b3d/simpletransformers-0.70.1-py3-none-any.whl.metadata
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
     ---------------------------------------- 0.0/42.4 kB ? eta -:--:--
     ------------------ ------------------- 20.5/42.4 kB 320.0 kB/s eta 0:00:01
     ------------------------------------ - 41.0/42.4 kB 388.9 kB/s eta 0:00:01
     -------------------------------------- 42.4/42.4 kB 342.2 kB/s eta 0:00:00
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
     ---------------------------------------- 0.0/43.6 kB ? eta -:--:--
     ---------------------------------------- 43.6/43.6 kB 1.1 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  P



# Importing all the necessary libraries

In [85]:
import pandas as pd
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
import re

# Dataset loading

In [86]:
def load_csv_dataset(file_path):
    
    #loading the whole dataset
    data = pd.read_csv(file_path)
    
    #extracting the dataset by column name
    passages = data['passage'].tolist()
    questions = data['question'].tolist()
    ans_texts = data['ans_txt'].tolist()
    start_indexes = data['strt_ind'].tolist();
    
    return passages,questions,ans_texts,start_indexes

In [87]:
file_path = "C:/Users/ASUS/OneDrive/Desktop/Thesis Model/dataset/RC_Dataset.csv"
print(os.path.exists(file_path))

True


In [88]:
passages,questions,ans_texts,start_indexes = load_csv_dataset(file_path)

In [89]:
print(passages[0])
print(questions[0])
print(ans_texts[0])
print(start_indexes[0])

বাংলাদেশ একটি ক্ষুদ্র আয়তনের জনবহুল দেশ এ দেশের আয়তন ১৪৭৫৭০ বর্গকিলোমিটার এ দেশের মোট আয়তনের ১৭ শতাংশ বনভূমি বাংলাদেশের রাজধানীর নাম ঢাকা বাংলাদেশে মোট আটটি বিভাগীয় শহর রয়েছে
বাংলাদেশের আয়তন কত বর্গকিলোমিটার
 ১৪৭৫৭০
53


# Split the dataset into training dataset and testing dataset

In [90]:
training_size = 3000

In [91]:
Training_Passages = passages[0:training_size]
Testing_Passages = passages[training_size:]

Training_Questions = questions[0:training_size]
Testing_Questions = questions[training_size:]

Training_Ans = ans_texts[0:training_size]
Testing_Ans = ans_texts[training_size:]

Training_StartIndexes = start_indexes[0:training_size]
Testing_StartIndexes = start_indexes[training_size:]

# Preprocessing data

In [92]:
def preProcessing(text_list):
    preprocessed_texts = []
    
    for text in text_list:
        # Ensure the text is a string; if not, convert to string or skip
        if isinstance(text, str):
            # Remove special characters except for word characters and whitespace
            clean_text = re.sub(r'[^\w\s০-৯]', '', text)
            preprocessed_texts.append(clean_text)
        else:
            # If the element is not a string, append an empty string or handle it
            preprocessed_texts.append('')  # Or you can choose to skip this item
    
    return preprocessed_texts

In [93]:
Training_Passages = preProcessing(Training_Passages)
Training_Questions = preProcessing(Training_Questions)
Training_Ans = preProcessing(Training_Ans)
Training_StartIndexes = preProcessing(Training_StartIndexes)

# Tokenization

In [94]:
tokenizer = Tokenizer(num_words = 10000, oov_token = "<OOV>")
combined_texts = Training_Passages + Training_Questions + Training_Ans + Training_StartIndexes
tokenizer.fit_on_texts(combined_texts)
word_index = tokenizer.word_index

In [95]:
print(word_index)

{'<OOV>': 1, 'কর': 2, 'তর': 3, 'তন': 4, 'ও': 5, 'থক': 6, 'হয': 7, 'এব': 8, 'ক': 9, 'একট': 10, 'এই': 11, 'সল': 12, 'য': 13, 'ছল': 14, 'বল': 15, 'মধয': 16, 'ছলন': 17, 'পর': 18, 'হয়': 19, 'এক': 20, 'কছ': 21, 'করন': 22, 'কন': 23, 'হসব': 24, 'হযছ': 25, 'পকসতনর': 26, 'পরব': 27, 'ব': 28, 'জনয': 29, 'রবনদরনথ': 30, 'নম': 31, 'দখ': 32, 'করছলন': 33, 'আর': 34, 'শর': 35, 'ন': 36, 'ত': 37, 'কনত': 38, 'সময': 39, 'বশ': 40, 'বছর': 41, 'তদর': 42, 'সখন': 43, 'ম': 44, 'যয': 45, 'কজ': 46, 'পরথম': 47, 'নয': 48, 'তখন': 49, 'সথ': 50, 'এ': 51, 'করত': 52, 'হল': 53, 'বড': 54, 'সঙগ': 55, 'ধর': 56, 'এট': 57, 'করর': 58, 'দয': 59, 'সময়': 60, 'ঋত': 61, 'এর': 62, 'যখন': 63, 'মনষর': 64, 'যন': 65, 'কব': 66, 'মস': 67, 'নজরল': 68, 'রজনতক': 69, 'ফল': 70, 'মত': 71, 'কমপউটর': 72, 'বলর': 73, 'নয়': 74, 'সরকর': 75, 'আমদর': 76, 'মধযম': 77, 'বলদশর': 78, 'পকসতন': 79, 'বযস': 80, 'ওপর': 81, 'তনট': 82, 'যর': 83, 'চল': 84, 'আনদলন': 85, 'কসর': 86, 'সমভব': 87, 'মন': 88, 'সহরওযরদ': 89, 'ভগ': 90, 'করছ': 91, 'সলর': 92, 'রগর': 93, 'হযছল': 9