<a href="https://colab.research.google.com/github/Praise-Atadja/EIICD_chatbox/blob/main/CognitiveQuest_chatbox_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



# **PROJECT NAME:**

## EARLY INTERVENTION INTELLIGENCE FOR COGNITIVE DEVELOPMENT (EIICD) CHATBOX


---

In [1]:
!pip install datasets



In [2]:
pip install transformers[torch]



In [3]:
!pip install accelerate -U



In [4]:
!pip install transformers



In [5]:
# Import necessary libraries
import os
import numpy as np
import tensorflow as tf
import nltk
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, TrainingArguments, Trainer,  DataCollatorWithPadding, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
import torch
import tensorflow_hub as hub
import pandas as pd
import re
from sklearn.metrics import classification_report
import joblib
from google.colab import drive
# Mount Google Drive (Colab only): the dataset CSV is read from, and the trained
# model is later saved to, paths under /content/drive/MyDrive.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
def stemSentence(sentence):
    """Return ``sentence`` with each token replaced by its Porter stem.

    The text is split with ``word_tokenize``; every stemmed token is followed
    by one space, so a non-empty result always ends with a trailing space.
    """
    stemmer = PorterStemmer()
    return "".join(stemmer.stem(token) + " " for token in word_tokenize(sentence))

## THE DATASET

In [7]:
from datasets import load_dataset

# Load the Q&A dataset from the CSV file on the mounted Drive.
# The result is a DatasetDict holding all rows under a single 'train' split.
dataset = load_dataset('csv', data_files='/content/drive/MyDrive/Autism Articles/Autsim_Q&A_datatset.csv')

# Display the DatasetDict summary (column names and row count).
dataset

DatasetDict({
    train: Dataset({
        features: ['Context', 'Questions', 'Answers', 'Patterns', 'Articles'],
        num_rows: 50
    })
})

In [8]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
# NOTE: `dataset` is rebound to the split result, so re-running this cell would
# split the (already reduced) training portion a second time.
dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)
train_dataset, val_dataset = dataset['train'], dataset['test']

In [9]:
# Report how many examples ended up in the training and validation splits.
print(f"Number of training examples: {len(train_dataset)}")
print(f"Number of validation examples: {len(val_dataset)}")

Number of training examples: 40
Number of validation examples: 10


In [10]:
# Peek at the first five rows of each split; slicing a Dataset yields a dict
# mapping each column name to a list of values.
print(train_dataset[:5])
print(val_dataset[:5])

{'Context': ['Parents can support by creating structured routines, using visual aids, and providing consistent, positive reinforcement.', 'ABA therapy uses behavior principles to improve social, communication, and learning skills.', 'Siblings can be supported through education about autism, encouragement of open communication, and ensuring they have their own outlets for support and attention.', 'Genetics play a significant role in autism, as evidenced by higher concordance rates among identical twins and the identification of specific genetic mutations associated with autism spectrum disorder. However, not all cases of autism can be explained by genetic factors alone, suggesting a complex interplay of genetic and environmental influences.', "Autism severity levels (mild, moderate, severe) are based on the amount of support an individual requires across different domains (social communication, repetitive behaviors, sensory issues). These levels guide intervention and support strategies

In [11]:
# Inspect the first training example as a single dict of column name -> value.
dataset["train"][0]

{'Context': 'Parents can support by creating structured routines, using visual aids, and providing consistent, positive reinforcement.',
 'Questions': 'How can parents support a child with autism at home?',
 'Answers': "Parents can support their child with autism by establishing routines, using visual supports, promoting communication, providing sensory-friendly environments, and participating in therapies and educational activities tailored to their child's needs.",
 'Patterns': 'How do parents help autistic children at home?',
 'Articles': 'https://www.autismspeaks.org/family-support-tool-kit'}

In [12]:
import json

# Convert Dataset objects to lists of dictionaries
train_data_dict = list(train_dataset)
val_data_dict = list(val_dataset)
# dataset.json holds every example (training followed by validation), so it is
# a complete export rather than a second copy of the training split.
dataset_dict = train_data_dict + val_data_dict

# Define file paths
train_file = 'train_dataset.json'
val_file = 'val_dataset.json'
dataset_file = 'dataset.json'

# Write each collection of records to its own pretty-printed JSON file.
for json_path, records in (
    (train_file, train_data_dict),
    (val_file, val_data_dict),
    (dataset_file, dataset_dict),
):
    with open(json_path, 'w') as f:
        json.dump(records, f, indent=4)

print(f"Training dataset saved to {train_file}")
print(f"Validation dataset saved to {val_file}")

Training dataset saved to train_dataset.json
Validation dataset saved to val_dataset.json


In [13]:
# Show the validation split's column names and row count.
print(val_dataset)

Dataset({
    features: ['Context', 'Questions', 'Answers', 'Patterns', 'Articles'],
    num_rows: 10
})


In [14]:
def preprocess_text(text):
    """Normalise ``text`` for matching.

    Lower-cases the input, drops every character that is not an ASCII
    lowercase letter or whitespace (digits and punctuation included), and
    collapses runs of whitespace into single spaces with no leading or
    trailing space.
    """
    lowered = text.lower()
    letters_and_spaces = re.sub(r'[^a-z\s]', '', lowered)
    return ' '.join(letters_and_spaces.split())

In [15]:
from torch.utils.data import Dataset

# Load the pretrained tokenizer for the "bert-base-uncased" checkpoint.
# This object is also captured as the default `tokenizer` argument of
# tokenize_dataset defined further down in this cell.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def get_unique_answers(dataset):
    """Collect the distinct non-``None`` 'Answers' values found in ``dataset``.

    Each sample must support ``.get``; samples without an 'Answers' entry
    (or with ``None``) contribute nothing. Returns a ``set``.
    """
    answers = (sample.get('Answers') for sample in dataset)
    return {answer for answer in answers if answer is not None}

# Get unique answers from your dataset
unique_answers = get_unique_answers(train_dataset)  # Assuming train_dataset has all possible answers

# Create label mapping dynamically.
# A set has no stable iteration order (string hashing is randomised per
# process), so the answers are sorted first. Without this the id given to each
# answer can change between sessions, and a model saved in one session would
# decode to the wrong answers when reloaded in another.
label_to_id = {answer: idx for idx, answer in enumerate(sorted(unique_answers))}

def tokenize_dataset(dataset, tokenizer=tokenizer, label_to_id=label_to_id, max_length=512):
    """Tokenize question/context pairs and attach integer answer labels.

    Args:
        dataset: Iterable of dict-like samples providing 'Questions',
            'Context' and 'Answers' entries.
        tokenizer: Tokenizer callable; defaults to the module-level tokenizer
            that exists when this function is defined.
        label_to_id: Mapping from answer text to integer class id.
        max_length: Length every encoded pair is padded or truncated to.

    Returns:
        dict with 'input_ids' and 'attention_mask' (tensors stacked over the
        kept samples), 'labels' (long tensor of class ids) and
        'skipped_samples' (int count of samples that were dropped).

    Raises:
        RuntimeError: if no sample could be tokenized.
    """
    input_ids = []
    attention_masks = []
    labels_list = []
    skipped_samples = 0  # samples dropped for a missing answer or unknown label

    for sample in dataset:
        answer = sample.get('Answers')
        if answer is None:
            # Skip samples with missing 'Answers'
            print(f"Warning: 'Answers' key not found or has None value in sample: {sample}")
            skipped_samples += 1
            continue

        label = label_to_id.get(answer)
        if label is None:
            print(f"Warning: Label not found for answer '{answer}' in sample: {sample}")
            skipped_samples += 1
            continue

        # `or ""` also covers keys that are present but hold None (e.g. empty
        # CSV cells), which the tokenizer would otherwise reject.
        tokenized_example = tokenizer(
            sample.get("Questions") or "",
            sample.get("Context") or "",
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

        # Drop only the leading batch dimension added by return_tensors="pt".
        input_ids.append(tokenized_example['input_ids'].squeeze(0))
        attention_masks.append(tokenized_example['attention_mask'].squeeze(0))
        labels_list.append(label)

    if not input_ids:
        raise RuntimeError("No valid samples found. Check your dataset and label mapping.")

    input_ids = torch.stack(input_ids)
    attention_masks = torch.stack(attention_masks)
    labels_tensor = torch.tensor(labels_list, dtype=torch.long)

    return {'input_ids': input_ids, 'attention_mask': attention_masks, 'labels': labels_tensor, 'skipped_samples': skipped_samples}

# Tokenize the training split (the validation split is not tokenized here).
tokenized_train_datasets = tokenize_dataset(train_dataset)


# Sanity check: the three tensors should report the same number of rows, and
# the skipped count shows how many samples were dropped during tokenization.
print(f"Number of input_ids: {tokenized_train_datasets['input_ids'].size(0)}")
print(f"Number of attention_masks: {tokenized_train_datasets['attention_mask'].size(0)}")
print(f"Number of labels: {tokenized_train_datasets['labels'].size(0)}")
print(f"Number of skipped samples: {tokenized_train_datasets['skipped_samples']}")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Number of input_ids: 40
Number of attention_masks: 40
Number of labels: 40
Number of skipped samples: 0


In [16]:
# Save the tokenized training tensors to disk with torch.save
# (written to the file 'tokenized_train_datasets' in the working directory).
save_path = 'tokenized_train_datasets'
torch.save(tokenized_train_datasets, save_path)


In [17]:
# Define MyDataset class
class MyDataset(Dataset):
    """Dataset adapter over a dict of pre-tokenized tensors.

    Exposes the 'input_ids', 'attention_mask' and 'labels' entries of
    ``tokenized_dataset`` as attributes and serves one example at a time as a
    dict with those same three keys.
    """

    # Fields copied from the source dict and returned for every example.
    _FIELDS = ('input_ids', 'attention_mask', 'labels')

    def __init__(self, tokenized_dataset):
        for field in self._FIELDS:
            setattr(self, field, tokenized_dataset[field])

    def __len__(self):
        # One label per example, so the label count is the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        return {field: getattr(self, field)[idx] for field in self._FIELDS}

# Prepare the datasets
# Reload the tokenized tensors from the file 'tokenized_train_datasets'.
# The file is read with torch.load (not pickle.load).
with open('tokenized_train_datasets', 'rb') as f:  # Use the correct filename
    tokenized_train_dataset = torch.load(f)  # Load with torch.load

# Wrap the tensor dict so it can be indexed one example at a time.
final_train_dataset = MyDataset(tokenized_train_dataset)

# After creating the datasets, print their lengths
print("Length of final_train_dataset:", len(final_train_dataset))


Length of final_train_dataset: 40


## TRAINING AND EVALUATING THE MODEL

In [18]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import pickle

# Round-trip the wrapped dataset through pickle. The file is produced by this
# notebook itself; never unpickle a file from an untrusted source.
with open('final_train_dataset', 'wb') as f:
    pickle.dump(final_train_dataset, f)

with open('final_train_dataset', 'rb') as f:
    final_train_dataset = pickle.load(f)

# Define model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# One output unit per distinct answer class. len(label_to_id) is the number of
# classes; the number of training rows only equals it when every answer is unique.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(label_to_id))

# Define Trainer
# No evaluation runs during training: that is the default strategy, so the
# keyword is omitted (its name differs between transformers releases:
# `evaluation_strategy` in older versions, `eval_strategy` in newer ones).
training_args = TrainingArguments(
    output_dir='./output', # Add the output directory
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=100,
    disable_tqdm=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_train_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Save the model and its tokenizer side by side in 'final_model'
model.save_pretrained('final_model')
tokenizer.save_pretrained('final_model')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 790.2442, 'train_samples_per_second': 0.152, 'train_steps_per_second': 0.038, 'train_loss': 3.743750254313151, 'epoch': 3.0}


('final_model/tokenizer_config.json',
 'final_model/special_tokens_map.json',
 'final_model/vocab.txt',
 'final_model/added_tokens.json')

In [22]:
# Persist the trained model and the tokenizer to Google Drive.
# Note that they go to two different directories, so both paths are needed
# to reload them later.
model_path = "/content/drive/MyDrive/EIICD_Chatbot/final_model"
tokenizer_path = "/content/drive/MyDrive/EIICD_Chatbot/final_model_tokenizer"

model.save_pretrained(model_path)
tokenizer.save_pretrained(tokenizer_path)

# Confirm where the artifacts were written.
print(f"Model saved to: {model_path}")
print(f"Tokenizer saved to: {tokenizer_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model saved to: /content/drive/MyDrive/EIICD_Chatbot/final_model
Tokenizer saved to: /content/drive/MyDrive/EIICD_Chatbot/final_model_tokenizer


In [24]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import numpy as np

def compute_metrics(predictions, labels):
    """Compute classification metrics from per-class scores.

    Args:
        predictions: 2-D array of scores; the predicted class of each row is
            the argmax along axis 1.
        labels: Array of true class ids, one per row of ``predictions``.

    Returns:
        dict with 'accuracy', support-weighted 'precision', 'recall' and 'f1',
        and the 'confusion_matrix' array.
    """
    preds = np.argmax(predictions, axis=1)
    # zero_division=0 scores classes that are never predicted (or never
    # present) as 0 explicitly, instead of emitting an undefined-metric
    # warning for each of them.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    cm = confusion_matrix(labels, preds)

    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': cm
    }


In [25]:
def evaluate_model(model, tokenizer, dataset):
    """Run the model over pre-tokenized examples and compute classification metrics.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Unused; kept so existing calls keep working.
        dataset: Mapping with 'input_ids', 'attention_mask' and 'labels'
            tensors, indexed per example along the first dimension.

    Returns:
        The metrics dict produced by ``compute_metrics``.

    Raises:
        ValueError: if ``dataset`` contains no examples.
    """
    model.eval()

    input_ids = dataset['input_ids']
    attention_mask = dataset['attention_mask']
    labels = dataset['labels']

    if len(input_ids) == 0:
        raise ValueError("dataset contains no examples to evaluate")

    # Run on whatever device holds the weights (e.g. a GPU after training);
    # feeding CPU tensors to a model on another device raises an error.
    device = next(model.parameters()).device

    predictions = []
    true_labels = []

    with torch.no_grad():
        for i in range(len(input_ids)):
            outputs = model(
                input_ids[i].unsqueeze(0).to(device),
                attention_mask=attention_mask[i].unsqueeze(0).to(device),
            )
            # Tensors must be on the CPU before they can be converted to numpy.
            predictions.append(outputs.logits.cpu().numpy())
            true_labels.append(labels[i].cpu().numpy())

    predictions = np.concatenate(predictions, axis=0)
    true_labels = np.array(true_labels)

    return compute_metrics(predictions, true_labels)


In [26]:
# Evaluate the model
# NOTE: the metrics are computed on `tokenized_train_datasets`, i.e. the same
# examples the model was trained on, so they describe fit to the training data
# rather than performance on held-out data.
metrics = evaluate_model(model, tokenizer, tokenized_train_datasets)

# Print out the metrics
print(f"Accuracy: {metrics['accuracy']}")
print(f"Precision: {metrics['precision']}")
print(f"Recall: {metrics['recall']}")
print(f"F1 Score: {metrics['f1']}")
print(f"Confusion Matrix: \n{metrics['confusion_matrix']}")

Accuracy: 0.075
Precision: 0.011388888888888888
Recall: 0.075
F1 Score: 0.018256578947368422
Confusion Matrix: 
[[0 0 0 ... 1 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]


  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import numpy as np

# Load the tokenizer and the fine-tuned model from the same local directory.
# The training cell saves both into 'final_model', so the vocabulary always
# matches the weights and no download is required.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained('final_model')
model = BertForSequenceClassification.from_pretrained('final_model')
model.eval()  # inference only: keep dropout disabled

# Function to get answer from model
def get_answer(input_text):
    """Return the stored answer whose class the model scores highest.

    Uses the module-level ``tokenizer``, ``model`` and ``label_to_id``.

    Args:
        input_text: The user's question as a single string.

    Returns:
        The answer string mapped to the highest-scoring class id.

    Raises:
        KeyError: if the predicted class id has no entry in ``label_to_id``.

    NOTE(review): training examples were encoded as (question, context) pairs,
    whereas only the question is encoded here; confirm this mismatch is
    intended, since it may hurt prediction quality.
    """
    # Tokenize input text
    inputs = tokenizer(input_text, padding=True, truncation=True, max_length=512, return_tensors='pt')
    # Keep the inputs on the same device as the model weights.
    inputs = inputs.to(model.device)

    # Perform inference
    with torch.no_grad():
        logits = model(**inputs).logits

    # Softmax preserves ordering, so the argmax of the raw logits is the same
    # class as the argmax of the probabilities.
    predicted_label_id = torch.argmax(logits, dim=-1).item()

    # Reverse label_to_id mapping to get answer string
    id_to_label = {v: k for k, v in label_to_id.items()}
    predicted_answer = id_to_label[predicted_label_id]

    return predicted_answer

# Example usage: ask one question and print the answer string chosen by the model.
input_text = "What is autism spectrum disorder?"
predicted_answer = get_answer(input_text)
print(f"Predicted answer: {predicted_answer}")


Predicted answer: Autism may impact family dynamics by requiring adjustments in parenting approaches, time management, financial resources, and social activities. Family members may experience stress but also opportunities for growth, understanding, and advocacy for their loved one with autism.


In [20]:
# Example usage (repeats the query already run in the cell that defines get_answer).
input_text = "What is autism spectrum disorder?"
predicted_answer = get_answer(input_text)
print(f"Predicted answer: {predicted_answer}")

Predicted answer: Autism may impact family dynamics by requiring adjustments in parenting approaches, time management, financial resources, and social activities. Family members may experience stress but also opportunities for growth, understanding, and advocacy for their loved one with autism.
