<a href="https://colab.research.google.com/github/Praise-Atadja/EIICD_chatbox/blob/main/EIICD_chatbox.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



# **PROJECT NAME:**

##EARLY INTERVENTION INTELLIGENCE FOR COGNITIVE DEVELOPMENT (EIICD) CHATBOX


---

In [53]:
# Import necessary libraries
import os
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from transformers import BertTokenizer, TFBertForQuestionAnswering
from sklearn.model_selection import train_test_split
import pandas as pd
import re
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [54]:
# Load the dataset
data = pd.read_csv('/content/drive/MyDrive/AutsimChatbox_datatset.csv')

In [55]:
# Load pre-trained BERT model
model = TFBertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# Function to preprocess text
def preprocess_text(text):
    text = text.lower()  # Convert text to lowercase
    text = ' '.join(text.split())  # Remove extra whitespaces
    return text

All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

All the weights of TFBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.


In [56]:
# Function to prepare input tensors for BERT
def prepare_input_tensors(question, reference):
    quest_toks = tokenizer.tokenize(question)
    text_toks = tokenizer.tokenize(reference)
    tokens = ['[CLS]'] + quest_toks + ['[SEP]'] + text_toks + ['[SEP]']

    input_id = tokenizer.convert_tokens_to_ids(tokens)
    mask = [1] * len(input_id)
    input_type = [0] * (1 + len(quest_toks) + 1) + [1] * (len(text_toks) + 1)

    input_id, mask, input_type = map(lambda x: tf.convert_to_tensor(x, dtype=tf.int32), (input_id, mask, input_type))

    return input_id, mask, input_type

In [57]:
def create_dataset_from_csv(csv_path):
    df = pd.read_csv(csv_path)
    texts = df['Questions'].tolist()
    print("Unique labels:", df['Answers'].unique())
    num_labels = len(df['Answers'].unique())
    labels = pd.factorize(df['Answers'])[0].tolist()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    encodings = tokenizer(texts, truncation=True, padding=True, return_tensors='tf')

    # Splitting the data
    train_indices, val_indices = train_test_split(
        range(len(labels)), test_size=0.2, random_state=42
    )

    # Use the indices to extract the data
    train_encodings = {
        'input_ids': tf.gather(encodings['input_ids'], train_indices),
        'attention_mask': tf.gather(encodings['attention_mask'], train_indices),
        'token_type_ids': tf.gather(encodings['token_type_ids'], train_indices)
    }
    val_encodings = {
        'input_ids': tf.gather(encodings['input_ids'], val_indices),
        'attention_mask': tf.gather(encodings['attention_mask'], val_indices),
        'token_type_ids': tf.gather(encodings['token_type_ids'], val_indices)
    }
    train_labels = tf.gather(labels, train_indices)
    val_labels = tf.gather(labels, val_indices)

    # Create TensorFlow datasets
    train_dataset = tf.data.Dataset.from_tensor_slices((
        train_encodings,
        train_labels
    )).batch(8)

    val_dataset = tf.data.Dataset.from_tensor_slices((
        val_encodings,
        val_labels
    )).batch(8)

    return train_dataset, val_dataset, num_labels


def save_dataset(dataset, path):
    tf.data.experimental.save(dataset, path)
    print(f"Dataset saved to {path}")

def load_dataset(path):
    dataset = tf.data.experimental.load(path)
    print(f"Dataset loaded from {path}")
    return dataset

# Load the dataset and get the number of labels
train_dataset, val_dataset, num_labels = create_dataset_from_csv('/content/drive/MyDrive/AutsimChatbox_datatset.csv')

# Save the datasets
save_dataset(train_dataset, '/content/drive/MyDrive/train_dataset')
save_dataset(val_dataset, '/content/drive/MyDrive/val_dataset')

# Load the datasets
train_dataset = load_dataset('/content/drive/MyDrive/train_dataset')
val_dataset = load_dataset('/content/drive/MyDrive/val_dataset')

print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

# Define the model with the correct number of labels
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Define the loss function
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Compile the model using string identifier for optimizer
optimizer = Adam(learning_rate=2e-5)
model.compile(optimizer='adam', loss=loss)

# Print the model summary
model.summary()

# Train the model with detailed logging
history = model.fit(train_dataset, validation_data=val_dataset, epochs=3)

# Save the fine-tuned model
model.save('fine_tuned_bert_model')

# Print the training history
print(history.history)

Unique labels: ['Autism spectrum disorder (ASD) is a developmental disorder that affects communication, behavior, and social interaction. It includes a wide range of conditions characterized by challenges with social skills, repetitive behaviors, speech, and nonverbal communication.'
 'Early signs may include reduced eye contact, lack of response to name, delayed speech or language development, limited or no social smiling, and lack of gestures such as pointing or waving.'
 "Autism is diagnosed through a combination of behavioral assessments, developmental screenings, parental interviews, and observations of the child's behavior and communication skills. There is no single medical test for autism diagnosis."
 'The exact causes of autism are not fully understood, but research suggests a combination of genetic and environmental factors. Genetic predisposition, prenatal factors (such as advanced parental age or maternal illness), and certain genetic mutations may contribute to autism.'
 '

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_639 (Dropout)       multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  38450     
                                                                 
Total params: 109520690 (417.79 MB)
Trainable params: 109520690 (417.79 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/3
Epoch 2/3
Epoch 3/3
{'loss': [4.071023464202881, 4.16321325302124, 4.140284538269043], 'val_loss': [3.9890975952148438, 3.763047695159912, 6.094704627990723]}


In [58]:
# Semantic search function using Universal Sentence Encoder
# Load Universal Sentence Encoder (USE) model
use_model = hub.load('https://tfhub.dev/google/universal-sentence-encoder-large/5')

# Function for semantic search using Universal Sentence Encoder (USE)
def semantic_search(corpus, query):
    query_embedding = use_model([query])[0]
    corpus_embeddings = use_model(corpus)
    similarity = np.inner(query_embedding, corpus_embeddings)
    closest = np.argmax(similarity)
    return corpus[closest]

# Example usage of semantic search
data = pd.read_csv('/content/drive/MyDrive/AutsimChatbox_datatset.csv')
corpus = data['Patterns'].apply(preprocess_text).tolist()
query = "How does autism affect sleep patterns?"

most_similar_text = semantic_search(corpus, query)
print("Most similar text:", most_similar_text)

Most similar text: how does autism impact sleep?


In [64]:
def evaluate_model(model, val_dataset, df):
    predictions = []
    labels = []

    for batch in val_dataset:
        inputs, batch_labels = batch
        batch_predictions = model(inputs)
        batch_predictions = tf.argmax(batch_predictions.logits, axis=-1).numpy()

        predictions.extend(batch_predictions)
        labels.extend(batch_labels.numpy())

    # Get ALL unique labels from the DataFrame (in the original order)
    target_names = df['Answers'].unique()

    # Filter predictions and labels to include only classes present in the validation set
    unique_labels_in_val = np.unique(labels)  # Define unique_labels_in_val
    valid_indices = np.isin(predictions, unique_labels_in_val)
    predictions = np.array(predictions)[valid_indices]
    labels = np.array(labels)[valid_indices]

    # Only include target names for the unique labels present in the validation set
    target_names_in_val = target_names[unique_labels_in_val]

    report = classification_report(labels, predictions, target_names=target_names_in_val)
    print(report)

def predict(model, question, df): # Add df as argument
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    inputs = tokenizer(question, return_tensors='tf', truncation=True, padding=True)
    outputs = model(inputs)
    prediction = tf.argmax(outputs.logits, axis=-1).numpy()

    # Use df to get the answer
    answer = df['Answers'].unique()[prediction[0]]
    return answer

def chatbot_interface(model, df): # Add model and df as arguments
    print("Welcome to the Autism Support Chatbot!")
    print("You can ask me any questions related to autism support.")
    print("Type 'exit' to end the conversation.")

    while True:
        question = input("You: ")
        if question.lower() == 'exit':
            print("Goodbye!")
            break

        answer = predict(model, question, df) # Pass df to predict
        print("Chatbot:", answer)

# Example mock-up test dataset creation
val_dataset = load_dataset('/content/drive/MyDrive/val_dataset')
df = pd.read_csv('/content/drive/MyDrive/AutsimChatbox_datatset.csv') # Load the DataFrame

# Evaluate the model, passing the DataFrame
evaluate_model(model, val_dataset, df)

# Run the chatbot interface, passing the model and DataFrame
chatbot_interface(model, df)

Dataset loaded from /content/drive/MyDrive/val_dataset


ValueError: Number of classes, 0, does not match size of target_names, 10. Try specifying the labels parameter