In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import string
import re
import joblib
import json
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pickle
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
import os

from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (Embedding, Dense, Flatten, Conv1D, MaxPooling1D, SimpleRNN, GRU, LSTM, Input,
                                      TimeDistributed, Dropout, Bidirectional)
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau


In [2]:
# Install dependencies
from google.colab import files

# Upload the requirements.txt file
print("Please upload the 'requirements.txt' file.")
uploaded = files.upload()

# Install dependencies from the uploaded requirements file
for file_name in uploaded.keys():
    if file_name.endswith('.txt'):
        !pip install -r "{file_name}"

# Ensure NLTK resources are available
import nltk
nltk.download('stopwords')
nltk.download('wordnet')


Please upload the 'requirements.txt' file.


Saving requirements.txt to requirements (1).txt
Collecting absl-py==2.1.0 (from -r requirements (1).txt (line 1))
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting cachetools==5.3.2 (from -r requirements (1).txt (line 3))
  Using cached cachetools-5.3.2-py3-none-any.whl.metadata (5.2 kB)
Collecting certifi==2023.11.17 (from -r requirements (1).txt (line 4))
  Using cached certifi-2023.11.17-py3-none-any.whl.metadata (2.2 kB)
Collecting charset-normalizer==3.3.2 (from -r requirements (1).txt (line 5))
  Using cached charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (33 kB)
Collecting contourpy==1.1.1 (from -r requirements (1).txt (line 7))
  Using cached contourpy-1.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.9 kB)
Collecting flatbuffers==23.5.26 (from -r requirements (1).txt (line 9))
  Using cached flatbuffers-23.5.26-py2.py3-none-any.whl.metadata (850 bytes)
Collecting fonttools==4.47.2

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Upload dataset
print("Please upload the dataset file. ")
uploaded = files.upload()

# Load the dataset into a DataFrame
for file_name in uploaded.keys():
    if file_name.endswith('.csv'):
        df = pd.read_csv(file_name)
    elif file_name.endswith('.json'):
        with open(file_name, 'r') as f:
            data = json.load(f)
            df = pd.DataFrame(data)
df.head()


Please upload the dataset file. 


Saving mentalhealth.csv to mentalhealth (1).csv


Unnamed: 0,Question_ID,Questions,Answers
0,1590140,What does it mean to have a mental illness?,Mental illnesses are health conditions that di...
1,2110618,Who does mental illness affect?,"Mental illness does can affect anyone, regardl..."
2,9434130,What are some of the warning signs of mental i...,Symptoms of mental health disorders vary depen...
3,7657263,Can people with mental illness recover?,"When healing from mental illness, early identi..."
4,1619387,What should I do if I know someone who appears...,We encourage those with symptoms to talk to th...


In [4]:
import re

# Define the clean_text function
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters, numbers, and extra spaces
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [5]:
# Dynamically check for the correct column name
if 'Questions' in df.columns:
    df['cleaned_questions'] = df['Questions'].apply(clean_text)
    print("Cleaned Questions Column:")
    print(df[['Questions', 'cleaned_questions']].head())
else:
    print("The dataset does not have a 'Questions' column. Available columns are:", df.columns)
    raise KeyError("Missing 'Questions' column in the dataset.")


Cleaned Questions Column:
                                           Questions  \
0        What does it mean to have a mental illness?   
1                    Who does mental illness affect?   
3            Can people with mental illness recover?   
4  What should I do if I know someone who appears...   

                                   cleaned_questions  
0         what does it mean to have a mental illness  
1                     who does mental illness affect  
3             can people with mental illness recover  
4  what should i do if i know someone who appears...  


In [6]:
import os
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd

# Define the tokenize_data function
def tokenize_data(df, column_name, num_words=10000):

    tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(df[column_name])
    sequences = tokenizer.texts_to_sequences(df[column_name])
    vocab_size = len(tokenizer.word_index) + 1  # Add 1 for padding token
    return sequences, vocab_size, tokenizer

# Tokenize the cleaned questions first
sequences, vocab_size, tokenizer = tokenize_data(df, 'cleaned_questions')

# Now you can save the tokenizer because it has been defined
output_dir = "./tokenizer_output"  # Directory to save the tokenizer
os.makedirs(output_dir, exist_ok=True)
tokenizer_path = os.path.join(output_dir, "tokenizer.pkl")
with open(tokenizer_path, 'wb') as f:
    pickle.dump(tokenizer, f)
print(f"Tokenizer saved to {tokenizer_path}")

# Pad the sequences
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')
print("Padded Sequences:")
print(padded_sequences)

Tokenizer saved to ./tokenizer_output/tokenizer.pkl
Padded Sequences:
[[  4  41  50 ...   0   0   0]
 [ 68  41   7 ...   0   0   0]
 [  4  28 104 ...   0   0   0]
 ...
 [  6   3   2 ...   0   0   0]
 [  4  13  11 ...   0   0   0]
 [  4  13  11 ...   0   0   0]]


In [7]:
# Encode the answers as labels
if 'Answers' in df.columns:
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(df['Answers'])
    num_classes = len(label_encoder.classes_)
    print("Encoded Labels:", encoded_labels[:5])
    print("Number of Classes:", num_classes)
else:
    print("The dataset does not have an 'Answers' column. Available columns are:", df.columns)
    raise KeyError("Missing 'Answers' column in the dataset.")


Encoded Labels: [46 45 69 90 86]
Number of Classes: 97


In [8]:
# Build a simple neural network model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(padded_sequences, encoded_labels, epochs=20, batch_size=32, validation_split=0.2)

# Save the model
model_path = os.path.join(output_dir, "model.h5")
model.save(model_path)
print(f"Model saved to {model_path}")


Epoch 1/20




[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 253ms/step - accuracy: 0.0104 - loss: 4.5765 - val_accuracy: 0.0000e+00 - val_loss: 4.5998
Epoch 2/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.0858 - loss: 4.5592 - val_accuracy: 0.0000e+00 - val_loss: 4.6308
Epoch 3/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.0000e+00 - loss: 4.5633 - val_accuracy: 0.0000e+00 - val_loss: 4.6692
Epoch 4/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.0247 - loss: 4.5455 - val_accuracy: 0.0000e+00 - val_loss: 4.7030
Epoch 5/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.0208 - loss: 4.5121 - val_accuracy: 0.0000e+00 - val_loss: 4.7559
Epoch 6/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.0572 - loss: 4.4966 - val_accuracy: 0.0000e+00 - val_loss: 4.8226
Epoch 7/20
[1m3/3[0m [32



Model saved to ./tokenizer_output/model.h5


In [9]:
# Evaluate the model
loss, accuracy = model.evaluate(padded_sequences, encoded_labels)
print(f"Model Loss: {loss}")
print(f"Model Accuracy: {accuracy}")


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.0809 - loss: 4.2199 
Model Loss: 4.343997955322266
Model Accuracy: 0.09278350323438644


In [10]:
# Save the tokenizer
with open(tokenizer_path, 'wb') as f:
    pickle.dump(tokenizer, f)
print(f"Tokenizer saved at {tokenizer_path}")

# Save the model
model_path = os.path.join(output_dir, "chatbot_model.h5")
model.save(model_path)
print(f"Model saved at {model_path}")




Tokenizer saved at ./tokenizer_output/tokenizer.pkl
Model saved at ./tokenizer_output/chatbot_model.h5


In [11]:
# Load the tokenizer
with open(tokenizer_path, 'rb') as f:
    loaded_tokenizer = pickle.load(f)

# Load the trained model
loaded_model = load_model(model_path)
print("Model and tokenizer loaded successfully!")




Model and tokenizer loaded successfully!


In [12]:
# Function to preprocess user input and predict the class
def predict_answer(question):
    # Clean and tokenize the input question
    cleaned_question = clean_text(question)
    sequence = loaded_tokenizer.texts_to_sequences([cleaned_question])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post')

    # Predict the response class
    prediction = loaded_model.predict(padded_sequence)
    predicted_label = np.argmax(prediction)

    # Map the predicted label back to the original answer
    response = label_encoder.inverse_transform([predicted_label])[0]
    return response


In [13]:
# Test the chatbot with example inputs
while True:
    user_input = input("You: ")
    if user_input.lower() in ['exit', 'quit']:
        print("Chatbot: Goodbye!")
        break

    response = predict_answer(user_input)
    print(f"Chatbot: {response}")


You: What does it mean to have a mental illness?	
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 185ms/step
Chatbot: Similar to a medical advance directive or a health care power of attorney, a psychiatric advance directive is a legal document completed in a time of wellness that provides instructions regarding treatment or services one wishes to have or not have during a mental health crisis, and may help influence his or her care.
You: Who does mental illness affect?
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Chatbot: There are a lot of things you can't control. You can't control what happens next, how governments respond, or how your neighbours react to the pandemic. What you can do is make a plan and decide how you'll manage the things you do control, like your ability to stay safe, follow public health measures, stay connected with loved ones, and take care of your mental and physical health. 
 Events like a pandemic change a lot over ti