<a href="https://colab.research.google.com/github/Praise-Atadja/EIICD_chatbox/blob/main/EIICD_chatbox.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



# **PROJECT NAME:**

## EARLY INTERVENTION INTELLIGENCE FOR COGNITIVE DEVELOPMENT (EIICD) CHATBOX


---

In [None]:
# Import necessary libraries
import os
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from transformers import BertTokenizer, TFBertForQuestionAnswering
from sklearn.model_selection import train_test_split
import pandas as pd
import re
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset CSV used by the cells
# below is read from /content/drive/MyDrive.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the dataset CSV from the mounted Drive into a DataFrame.
# The same path is read again by the training and semantic-search cells,
# which access its 'Questions', 'Answers' and 'Patterns' columns.
data = pd.read_csv('/content/drive/MyDrive/AutsimChatbox_datatset.csv')

# Function to clean and preprocess text
def preprocess_text(text):
    """Lowercase ``text`` and remove every character that is neither a word
    character nor whitespace.

    Missing values (``None`` or NaN, which is what pandas yields for blank CSV
    cells) are returned as an empty string, and any other non-string scalar is
    converted with ``str()`` first, so the function can be passed straight to
    ``Series.apply`` on columns that contain blanks or numbers.

    NOTE: a later cell of this notebook defines another ``preprocess_text``
    (lowercase + whitespace collapsing, no punctuation removal). Whichever
    definition was executed last is the one in effect.

    Args:
        text: the raw value to clean (normally a ``str``).

    Returns:
        The cleaned string.
    """
    if not isinstance(text, str):
        # Blank CSV cells arrive as float NaN / None rather than ''.
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ''
        text = str(text)
    return re.sub(r'[^\w\s]', '', text.lower())

In [None]:
# Load the pre-trained BERT question-answering model and its tokenizer.
# Both are created from one checkpoint name so they cannot drift apart.
BERT_QA_CHECKPOINT = 'bert-large-uncased-whole-word-masking-finetuned-squad'
model = TFBertForQuestionAnswering.from_pretrained(BERT_QA_CHECKPOINT)
tokenizer = BertTokenizer.from_pretrained(BERT_QA_CHECKPOINT)

# Function to preprocess text
def preprocess_text(text):
    """Lowercase ``text`` and collapse every run of whitespace to one space.

    Leading and trailing whitespace is dropped. Punctuation is kept.

    Missing values (``None`` or NaN, which is what pandas yields for blank CSV
    cells) are returned as an empty string, and any other non-string scalar is
    converted with ``str()`` first, so the function can be passed straight to
    ``Series.apply`` on columns that contain blanks or numbers.

    NOTE: this redefines the ``preprocess_text`` from the dataset-loading cell
    (which also stripped punctuation); once this cell has run, this version is
    the one used by the rest of the notebook.

    Args:
        text: the raw value to normalise (normally a ``str``).

    Returns:
        The normalised string.
    """
    if not isinstance(text, str):
        # Blank CSV cells arrive as float NaN / None rather than ''.
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ''
        text = str(text)
    # str.split() with no argument splits on any whitespace run and discards
    # empty pieces, so re-joining with ' ' normalises all spacing.
    return ' '.join(text.lower().split())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

All the weights of TFBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.


In [None]:
# Build the three BERT input tensors for one (question, reference) pair
def prepare_input_tensors(question, reference):
    """Tokenize a question/reference pair and return BERT input tensors.

    The token layout is ``[CLS] question [SEP] reference [SEP]``. No padding
    or truncation is applied, so the tensor length depends on the inputs.

    Args:
        question: question text.
        reference: reference (context) text.

    Returns:
        A tuple ``(input_ids, attention_mask, token_type_ids)`` of 1-D
        ``tf.int32`` tensors of equal length. The attention mask is all ones;
        token type ids are 0 for ``[CLS] question [SEP]`` and 1 for
        ``reference [SEP]``.
    """
    question_tokens = tokenizer.tokenize(question)
    reference_tokens = tokenizer.tokenize(reference)

    first_segment = ['[CLS]', *question_tokens, '[SEP]']
    second_segment = [*reference_tokens, '[SEP]']

    token_ids = tokenizer.convert_tokens_to_ids(first_segment + second_segment)
    attention = [1] * len(token_ids)
    segment_ids = [0] * len(first_segment) + [1] * len(second_segment)

    return tuple(
        tf.convert_to_tensor(values, dtype=tf.int32)
        for values in (token_ids, attention, segment_ids)
    )

In [None]:
# Function to create a dataset from CSV
def _pad_to_matrix(sequences, fill_value):
    """Right-pad 1-D integer sequences with ``fill_value`` into one int32 matrix."""
    width = max(len(seq) for seq in sequences)
    matrix = np.full((len(sequences), width), fill_value, dtype=np.int32)
    for row, seq in enumerate(sequences):
        matrix[row, :len(seq)] = seq
    return matrix


def create_dataset_from_csv(csv_file, batch_size=16):
    """Build a batched ``tf.data.Dataset`` of question/answer pairs for BERT QA.

    Each row's question and answer are cleaned with ``preprocess_text`` and
    encoded with ``prepare_input_tensors`` as
    ``[CLS] question [SEP] answer [SEP]``. The labels mark the answer span
    inside that sequence.

    Fixes over the previous version:
      * The span is taken from the token-type ids (the answer is segment 1)
        instead of searching for the first occurrence of the answer's first
        token id, which could match a token inside the question.
      * Sequences are right-padded to the longest one, so rows of different
        lengths can be combined into one tensor.
      * Rows with a missing question/answer, or whose answer produces no
        tokens, are skipped instead of raising.

    No truncation is applied here; sequence length is driven by the data.

    NOTE: a later cell of this notebook redefines ``create_dataset_from_csv``
    with a ``max_length`` parameter; whichever definition ran last is in effect.

    Args:
        csv_file: path to a CSV with ``Questions`` and ``Answers`` columns.
        batch_size: number of examples per batch.

    Returns:
        A batched dataset of ``(features, labels)`` where ``features`` has the
        keys ``input_ids``, ``attention_mask``, ``token_type_ids`` and
        ``labels`` has ``start_positions`` and ``end_positions`` (all int32).

    Raises:
        ValueError: if the CSV yields no usable question/answer rows.
    """
    data = pd.read_csv(csv_file).dropna(subset=['Questions', 'Answers'])
    questions = data['Questions'].apply(preprocess_text).tolist()
    answers = data['Answers'].apply(preprocess_text).tolist()

    input_ids, attention_masks, token_type_ids = [], [], []
    start_positions, end_positions = [], []

    for question, answer in zip(questions, answers):
        input_id, attention_mask, token_type_id = (
            tensor.numpy() for tensor in prepare_input_tensors(question, answer)
        )

        # Segment 1 is "answer tokens + trailing [SEP]"; drop the [SEP].
        answer_span = np.flatnonzero(token_type_id == 1)[:-1]
        if answer_span.size == 0:
            continue  # nothing to point at: the answer produced no tokens

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        start_positions.append(int(answer_span[0]))
        end_positions.append(int(answer_span[-1]))

    if not input_ids:
        raise ValueError(f"No usable question/answer rows found in {csv_file!r}")

    features = {
        'input_ids': _pad_to_matrix(input_ids, tokenizer.pad_token_id),
        # Padding positions are masked out (0) and assigned to segment 0.
        'attention_mask': _pad_to_matrix(attention_masks, 0),
        'token_type_ids': _pad_to_matrix(token_type_ids, 0),
    }
    labels = {
        'start_positions': tf.convert_to_tensor(start_positions, dtype=tf.int32),
        'end_positions': tf.convert_to_tensor(end_positions, dtype=tf.int32),
    }

    return tf.data.Dataset.from_tensor_slices((features, labels)).batch(batch_size)


In [None]:
def create_dataset_from_csv(csv_file, batch_size=16, max_length=128):
    """Build a batched, fixed-length ``tf.data.Dataset`` for BERT QA fine-tuning.

    Each row's question and answer are cleaned with ``preprocess_text`` and
    encoded as a sentence pair (question first, answer second), padded or
    truncated to ``max_length`` tokens. The labels mark where the answer
    tokens sit inside the encoded sequence.

    Fixes over the previous version:
      * The answer span is located through the token-type ids (second
        sequence == 1) instead of ``list.index`` on the answer's first token
        id, which returned the first match even when it was a question token.
      * The end position is the last answer token that survived truncation,
        so it can no longer be >= ``max_length`` (an out-of-range label for
        the sparse cross-entropy loss).
      * Rows with a missing question/answer are dropped, and an answer that
        yields no tokens falls back to position 0 instead of raising.
      * All rows are encoded in one batched tokenizer call.

    Args:
        csv_file: path to a CSV with ``Questions`` and ``Answers`` columns.
        batch_size: number of examples per batch.
        max_length: fixed token length of every encoded example.

    Returns:
        A batched dataset of ``(features, labels)`` where ``features`` has the
        keys ``input_ids``, ``attention_mask``, ``token_type_ids`` (each of
        shape ``(max_length,)`` per example) and ``labels`` has
        ``start_positions`` and ``end_positions`` (int32 scalars per example).

    Raises:
        ValueError: if the CSV yields no usable question/answer rows.
    """
    data = pd.read_csv(csv_file).dropna(subset=['Questions', 'Answers'])
    questions = data['Questions'].apply(preprocess_text).tolist()
    answers = data['Answers'].apply(preprocess_text).tolist()
    if not questions:
        raise ValueError(f"No usable question/answer rows found in {csv_file!r}")

    # Encode every (question, answer) pair at once; all rows share one shape.
    encoded = tokenizer(
        questions,
        answers,
        max_length=max_length,  # Truncate or pad sequences
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )

    # True where a real (non-padding) token belongs to the second sequence,
    # i.e. the surviving answer tokens followed by the closing [SEP].
    in_answer_segment = (
        (encoded['token_type_ids'].numpy() == 1)
        & (encoded['attention_mask'].numpy() == 1)
    )

    start_positions, end_positions = [], []
    for row in in_answer_segment:
        segment_idx = np.flatnonzero(row)
        if segment_idx.size >= 2:
            start_positions.append(int(segment_idx[0]))
            end_positions.append(int(segment_idx[-2]))  # skip the final [SEP]
        else:
            # No answer tokens present: point both labels at position 0,
            # matching the previous "answer not found" fallback.
            start_positions.append(0)
            end_positions.append(0)

    features = {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
        'token_type_ids': encoded['token_type_ids']
    }
    labels = {
        'start_positions': tf.convert_to_tensor(start_positions, dtype=tf.int32),
        'end_positions': tf.convert_to_tensor(end_positions, dtype=tf.int32)
    }

    return tf.data.Dataset.from_tensor_slices((features, labels)).batch(batch_size)

In [None]:
# Fine-tune the model
train_dataset = create_dataset_from_csv('/content/drive/MyDrive/AutsimChatbox_datatset.csv')

# Define the loss function (the model outputs raw logits, hence from_logits=True)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Learning rate for fine-tuning. The 'adam' string identifier builds the
# optimizer with Keras' default rate of 1e-3, which is far larger than the
# 2e-5..5e-5 range normally used to fine-tune BERT.
FINE_TUNE_LEARNING_RATE = 3e-5

# Pass the optimizer as a string identifier (instead of an optimizer object),
# then lower the learning rate on the optimizer instance that compile() created.
model.compile(optimizer='adam', loss=loss)
model.optimizer.learning_rate = FINE_TUNE_LEARNING_RATE
model.fit(train_dataset, epochs=3)

# Save the fine-tuned model
model.save('fine_tuned_bert_model')

Epoch 1/3


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported


In [None]:
# Semantic search using the Universal Sentence Encoder (USE)
# Load the USE "large" v5 model from TensorFlow Hub. `use_model` is called with
# a list of strings and its result is indexed per string (see semantic_search).
use_model = hub.load('https://tfhub.dev/google/universal-sentence-encoder-large/5')

# Function for semantic search using Universal Sentence Encoder (USE)
def semantic_search(corpus, query, corpus_embeddings=None):
    """Return the corpus entry whose embedding best matches the query.

    Similarity is the inner product between the query embedding and each
    corpus embedding; the entry with the largest value wins.

    Args:
        corpus: non-empty list of strings to search.
        query: the text to look up.
        corpus_embeddings: optional precomputed embeddings of ``corpus`` (one
            row per entry, in the same order). When omitted, the corpus is
            embedded on every call; pass ``use_model(corpus)`` computed once
            to avoid re-embedding an unchanged corpus for each query.

    Returns:
        The element of ``corpus`` most similar to ``query``.

    Raises:
        ValueError: if ``corpus`` is empty.
    """
    if len(corpus) == 0:
        raise ValueError("corpus must contain at least one text")

    if corpus_embeddings is None:
        corpus_embeddings = use_model(corpus)

    query_embedding = use_model([query])[0]
    similarity = np.inner(query_embedding, corpus_embeddings)
    closest = int(np.argmax(similarity))
    return corpus[closest]

# Example usage of semantic search
# Reload the CSV and use its preprocessed 'Patterns' column as the search corpus.
data = pd.read_csv('/content/drive/MyDrive/AutsimChatbox_datatset.csv')
corpus = data['Patterns'].apply(preprocess_text).tolist()
# NOTE(review): the corpus is passed through preprocess_text but the query is
# used as typed (original casing and punctuation) — confirm this is intended.
query = "How does autism affect sleep patterns?"

# Look up the corpus entry closest to the query and show it.
most_similar_text = semantic_search(corpus, query)
print("Most similar text:", most_similar_text)