In [92]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import string
import re
import joblib
import json
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pickle
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
import os

from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (Embedding, Dense, Flatten, Conv1D, MaxPooling1D, SimpleRNN, GRU, LSTM, Input,
                                      TimeDistributed, Dropout, Bidirectional)
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau


In [93]:
# Install dependencies
from google.colab import files

# Upload the requirements.txt file
print("Please upload the 'requirements.txt' file.")
uploaded = files.upload()

# Install dependencies from the uploaded requirements file
for file_name in uploaded.keys():
    if file_name.endswith('.txt'):
        !pip install -r "{file_name}"

# Ensure NLTK resources are available
import nltk
nltk.download('stopwords')
nltk.download('wordnet')


Please upload the 'requirements.txt' file.


Saving requirements.txt to requirements (4).txt
Collecting absl-py==2.1.0 (from -r requirements (4).txt (line 1))
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting cachetools==5.3.2 (from -r requirements (4).txt (line 3))
  Using cached cachetools-5.3.2-py3-none-any.whl.metadata (5.2 kB)
Collecting certifi==2023.11.17 (from -r requirements (4).txt (line 4))
  Using cached certifi-2023.11.17-py3-none-any.whl.metadata (2.2 kB)
Collecting charset-normalizer==3.3.2 (from -r requirements (4).txt (line 5))
  Using cached charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (33 kB)
Collecting contourpy==1.1.1 (from -r requirements (4).txt (line 7))
  Using cached contourpy-1.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.9 kB)
Collecting flatbuffers==23.5.26 (from -r requirements (4).txt (line 9))
  Using cached flatbuffers-23.5.26-py2.py3-none-any.whl.metadata (850 bytes)
Collecting fonttools==4.47.2

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [94]:
# Upload dataset
print("Please upload the dataset file. ")
uploaded = files.upload()

# Load the dataset into a DataFrame.
# `df` is reset first so that an upload without a usable file can never fall
# through to a stale DataFrame left in the kernel by an earlier run of this cell.
df = None
for file_name in uploaded.keys():
    # Compare extensions case-insensitively so "DATA.CSV" is accepted as well.
    lowered_name = file_name.lower()
    if lowered_name.endswith('.csv'):
        df = pd.read_csv(file_name)
    elif lowered_name.endswith('.json'):
        # Explicit encoding: do not depend on the platform's default locale.
        with open(file_name, 'r', encoding='utf-8') as f:
            data = json.load(f)
        df = pd.DataFrame(data)

# Fail with an actionable message instead of a NameError on `df.head()`.
if df is None:
    raise ValueError(
        "No supported dataset was uploaded; expected a .csv or .json file, "
        f"got: {list(uploaded.keys())}"
    )
df.head()


Please upload the dataset file. 


Saving mentalhealth.csv to mentalhealth (3).csv


Unnamed: 0,Question_ID,Questions,Answers
0,1590140,What does it mean to have a mental illness?,Mental illnesses are health conditions that di...
1,2110618,Who does mental illness affect?,"Mental illness does can affect anyone, regardl..."
2,9434130,What are some of the warning signs of mental i...,Symptoms of mental health disorders vary depen...
3,7657263,Can people with mental illness recover?,"When healing from mental illness, early identi..."
4,1619387,What should I do if I know someone who appears...,We encourage those with symptoms to talk to th...


In [95]:
# Check for the expected column up front and stop early when it is missing
if 'Questions' not in df.columns:
    print("The dataset does not have a 'Questions' column. Available columns are:", df.columns)
    raise KeyError("Missing 'Questions' column in the dataset.")

# Run every question through clean_text and store the result next to the raw text
df['cleaned_questions'] = df['Questions'].apply(clean_text)
print("Cleaned Questions Column:")
preview_columns = ['Questions', 'cleaned_questions']
print(df[preview_columns].head())


Cleaned Questions Column:
                                           Questions  \
0        What does it mean to have a mental illness?   
1                    Who does mental illness affect?   
3            Can people with mental illness recover?   
4  What should I do if I know someone who appears...   

                                   cleaned_questions  
0         what does it mean to have a mental illness  
1                     who does mental illness affect  
3             can people with mental illness recover  
4  what should i do if i know someone who appears...  


In [96]:
# Tokenize the cleaned questions
sequences, vocab_size, tokenizer = tokenize_data(df, 'cleaned_questions')

# Save the tokenizer.
# Create the output directory first: open(..., 'wb') raises FileNotFoundError
# when the parent directory does not exist, and exist_ok=True makes this a
# no-op when it is already there.
os.makedirs(output_dir, exist_ok=True)
tokenizer_path = os.path.join(output_dir, "tokenizer.pkl")
with open(tokenizer_path, 'wb') as f:
    pickle.dump(tokenizer, f)
print(f"Tokenizer saved to {tokenizer_path}")

# Pad the sequences.
# max_length is the longest tokenized question; shorter ones are filled with
# zeros at the end (padding='post'). It is reused later to pad user input.
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')
print("Padded Sequences:")
print(padded_sequences)


Tokenizer saved to ./output/tokenizer.pkl
Padded Sequences:
[[  3  40  49 ...   0   0   0]
 [ 67  40   6 ...   0   0   0]
 [  3  27 103 ...   0   0   0]
 ...
 [  5   2   1 ...   0   0   0]
 [  3  12  10 ...   0   0   0]
 [  3  12  10 ...   0   0   0]]


In [97]:
# Encode the answers as labels; bail out early when the column is absent
if 'Answers' not in df.columns:
    print("The dataset does not have an 'Answers' column. Available columns are:", df.columns)
    raise KeyError("Missing 'Answers' column in the dataset.")

# Each distinct answer string becomes one integer class id
label_encoder = LabelEncoder()
answers_column = df['Answers']
encoded_labels = label_encoder.fit_transform(answers_column)
num_classes = len(label_encoder.classes_)

print("Encoded Labels:", encoded_labels[:5])
print("Number of Classes:", num_classes)


Encoded Labels: [46 45 69 90 86]
Number of Classes: 97


In [98]:
# Build a simple neural network model:
# embedding -> bidirectional LSTM -> flatten -> dense softmax classifier
model = Sequential([
    # Declare the fixed sequence length with an explicit Input layer; the
    # `input_length` argument of Embedding is deprecated in Keras 3.
    Input(shape=(max_length,)),
    Embedding(input_dim=vocab_size, output_dim=64),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    # One output unit per encoded answer class
    Dense(num_classes, activation='softmax')
])

# Labels are integer class ids, hence the sparse categorical loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model.
# NOTE(review): val_accuracy stays at 0 in the logged epochs. If each answer is
# its own class (one sample per label), the held-out 20% only contains labels
# the model never trains on - confirm whether a validation split is meaningful.
history = model.fit(padded_sequences, encoded_labels, epochs=10, batch_size=32, validation_split=0.2)

# Save the model
model_path = os.path.join(output_dir, "model.h5")
model.save(model_path)
print(f"Model saved to {model_path}")


Epoch 1/10




[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 478ms/step - accuracy: 0.0000e+00 - loss: 4.5822 - val_accuracy: 0.0000e+00 - val_loss: 4.5886
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.0169 - loss: 4.5712 - val_accuracy: 0.0000e+00 - val_loss: 4.6015
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.0000e+00 - loss: 4.5591 - val_accuracy: 0.0000e+00 - val_loss: 4.6197
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.0247 - loss: 4.5492 - val_accuracy: 0.0000e+00 - val_loss: 4.6507
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.0286 - loss: 4.5420 - val_accuracy: 0.0000e+00 - val_loss: 4.7028
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.0286 - loss: 4.5356 - val_accuracy: 0.0000e+00 - val_loss: 4.7597
Epoch 7/10
[1m3/3[0m 



Model saved to ./output/model.h5


In [99]:
# Evaluate the model on the same arrays that were passed to model.fit
loss, accuracy = model.evaluate(x=padded_sequences, y=encoded_labels)
for metric_name, metric_value in (("Loss", loss), ("Accuracy", accuracy)):
    print(f"Model {metric_name}: {metric_value}")


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.0311 - loss: 4.5038
Model Loss: 4.536472320556641
Model Accuracy: 0.030927835032343864


In [100]:
# Persist the fitted tokenizer (pickle) at the path chosen when it was first saved
with open(tokenizer_path, mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)
print(f"Tokenizer saved at {tokenizer_path}")

# Persist the trained model; model_path is rebound here and is the path the
# loading cell reads back from
model_path = os.path.join(output_dir, "chatbot_model.h5")
model.save(model_path)
print(f"Model saved at {model_path}")




Tokenizer saved at ./output/tokenizer.pkl
Model saved at ./output/chatbot_model.h5


In [101]:
# Restore the tokenizer from its pickle file.
# pickle.load can execute arbitrary code, so only point tokenizer_path at a
# file this notebook wrote itself.
with open(tokenizer_path, mode='rb') as tokenizer_file:
    loaded_tokenizer = pickle.load(tokenizer_file)

# Restore the trained model from model_path
loaded_model = load_model(model_path)
print("Model and tokenizer loaded successfully!")




Model and tokenizer loaded successfully!


In [102]:
# Function to preprocess user input and predict the class
def predict_answer(question):
    """Return the stored answer whose class the model scores highest for ``question``.

    Relies on notebook-level state created by earlier cells: ``clean_text``,
    ``loaded_tokenizer``, ``loaded_model``, ``label_encoder`` and ``max_length``.

    Args:
        question: Raw user text.

    Returns:
        The answer mapped back from the predicted label, or a short prompt to
        rephrase when the tokenizer produces no tokens for the question.
    """
    # Clean and tokenize the input question
    cleaned_question = clean_text(question)
    sequence = loaded_tokenizer.texts_to_sequences([cleaned_question])

    # With no tokens the input would be padded to all zeros and the model
    # would still emit some answer; ask the user to rephrase instead.
    if not sequence or not sequence[0]:
        return "I'm sorry, I didn't understand that. Could you rephrase your question?"

    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post')

    # verbose=0 keeps the Keras per-call progress bar out of the chat transcript
    prediction = loaded_model.predict(padded_sequence, verbose=0)
    # The batch holds a single row; pick the best class from that row explicitly
    predicted_label = np.argmax(prediction[0])

    # Map the predicted label back to the original answer
    response = label_encoder.inverse_transform([predicted_label])[0]
    return response


In [103]:
# Test the chatbot with example inputs (type "exit" or "quit" to stop)
while True:
    try:
        user_input = input("You: ")
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the cell was interrupted: end the session
        # cleanly instead of leaving a traceback in the notebook.
        print("Chatbot: Goodbye!")
        break

    # Ignore surrounding whitespace so that e.g. "exit " still ends the chat
    command = user_input.strip().lower()
    if command in ('exit', 'quit'):
        print("Chatbot: Goodbye!")
        break
    if not command:
        # Nothing was typed; prompt again rather than querying the model
        continue

    response = predict_answer(user_input)
    print(f"Chatbot: {response}")


You: "I feel stressed all the time. What should I do?"
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 181ms/step
Chatbot: It may be tempting to try to block out the world altogether to avoid bad news, but it's important to keep yourself informed. We all have to step up during a pandemic because we all have a part to play in reducing the spread of the virus. It's important that you know what must be done and how you should do it. This is important for the health of your neighbours and your own mental health, and taking action can help counter difficult feelings like hopelessness and despair. 
 One study from people in China found that people who had reliable up-to-date information about the coronavirus and COVID-19 illness and accurate instructions on how they should act (such as instructions around hand-washing and wearing a mask) felt more resilient and felt better able to handle the virus. People who received good, accurate information reported lower levels of stress, 