In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split


In [None]:
# Read the question/answer CSV into a DataFrame
data_path = "/Kiran-deppression.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first five rows
df.head(n=5)

Unnamed: 0,Questions,Answers
0,There are issues from my past that have me ver...,Have you explained to your prospective partner...
1,I just took a job that requires me to travel f...,"Congratulations on your new job. Chances are, ..."
2,cheerful,Did something happen which made you feel this ...
3,I feel that I need to end my present relations...,It sounds like you have some insight into the ...
4,"I keep being mean to my best friend, and I don...","Hi Morristown,There's a saying that goes ""hurt..."


In [None]:
# Report missing values per column. The counts are printed explicitly because a
# bare expression in the middle of a cell is evaluated and then discarded.
print(df.isnull().sum())

# Drop rows with any missing value and lowercase both text columns.
# A non-mutating chain is used instead of inplace=True so that re-running the
# cell never modifies a frame object that other names may still reference.
df = df.dropna().assign(
    Questions=lambda frame: frame['Questions'].str.lower(),
    Answers=lambda frame: frame['Answers'].str.lower(),
)

# Display the first few rows after preprocessing
df.head()

Unnamed: 0,Questions,Answers
0,there are issues from my past that have me ver...,have you explained to your prospective partner...
1,i just took a job that requires me to travel f...,"congratulations on your new job. chances are, ..."
2,cheerful,did something happen which made you feel this ...
3,i feel that i need to end my present relations...,it sounds like you have some insight into the ...
4,"i keep being mean to my best friend, and i don...","hi morristown,there's a saying that goes ""hurt..."


In [None]:
# Build the vocabulary from BOTH text columns. No oov_token is configured, so
# texts_to_sequences skips any word that is missing from the fitted vocabulary;
# fitting on the questions alone would silently delete every answer-only word
# from the training targets.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(df['Questions']) + list(df['Answers']))

# Convert text to integer sequences
questions_sequences = tokenizer.texts_to_sequences(df['Questions'])
answers_sequences = tokenizer.texts_to_sequences(df['Answers'])

# Pad every sequence (zeros appended at the end) to the longest sequence found
# in either column, so inputs and targets share one length.
max_sequence_length = max(
    len(seq) for seq in questions_sequences + answers_sequences
)
questions_padded = pad_sequences(questions_sequences, maxlen=max_sequence_length, padding='post')
answers_padded = pad_sequences(answers_sequences, maxlen=max_sequence_length, padding='post')

# Display the shape of the padded sequences
print("Questions padded shape:", questions_padded.shape)
print("Answers padded shape:", answers_padded.shape)

Questions padded shape: (4262, 1321)
Answers padded shape: (4262, 1321)


In [None]:
# Hold out 20% of the question/answer pairs; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    questions_padded,
    answers_padded,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each partition
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (3409, 1321)
X_test shape: (853, 1321)
y_train shape: (3409, 1321)
y_test shape: (853, 1321)


In [None]:
# Model hyperparameters
vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is used for padding
embedding_dim = 256
lstm_units = 512

# The input shape is declared with an explicit Input layer rather than through
# Embedding's `input_length` argument, which newer Keras releases deprecate and
# ignore. With the Input layer the model is built immediately, so summary()
# reports real output shapes and parameter counts.
#
# Both LSTM layers return full sequences, so the final Dense layer emits one
# softmax over the vocabulary per timestep:
# (batch, max_sequence_length, vocab_size).
model = Sequential([
    tf.keras.Input(shape=(max_sequence_length,), dtype="int32"),
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    LSTM(lstm_units, return_sequences=True),
    Dropout(0.2),
    LSTM(lstm_units, return_sequences=True),
    Dropout(0.2),
    Dense(vocab_size, activation='softmax')
])

# Integer targets (not one-hot) are used, hence the sparse cross-entropy loss.
# NOTE(review): padding positions are not masked, so the accuracy metric also
# counts padded timesteps -- interpret it with care.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()



In [None]:
# Training hyperparameters
epochs = 10
batch_size = 32

# Fit on the training split; the held-out split is scored after every epoch
# and the per-epoch metrics are kept in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, y_test),
)

Epoch 1/10
[1m 15/107[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m1:42:49[0m 67s/step - accuracy: 0.6937 - loss: 4.6677

Training in progress: currently at step 11/107 of epoch 1/10, as of 3:15 am.

In [None]:
# Score the trained model on the held-out split and report both metrics
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

In [None]:
def generate_text(seed_text, next_words, max_sequence_len):
    """Extend ``seed_text`` with up to ``next_words`` model-predicted words.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``pad_sequences``.

    Args:
        seed_text: Text to start from; it is re-tokenized on every iteration.
        next_words: Maximum number of words to append.
        max_sequence_len: Length the token sequence is padded/truncated to.
            This should equal the sequence length the model was trained on.

    Returns:
        ``seed_text`` followed by the predicted words, separated by spaces.
        Generation stops early when the predicted index has no word (for
        example the padding index 0), because the input would not change and
        every later iteration would repeat the same prediction.
    """
    for _ in range(next_words):
        token_ids = tokenizer.texts_to_sequences([seed_text])[0]
        # Pad to the full training length (not length - 1) so the input matches
        # the shape the model was fitted on.
        padded = pad_sequences([token_ids], maxlen=max_sequence_len, padding='post')

        # The model emits one distribution per timestep, so the prediction is a
        # 3-D array. Comparing an index against the argmax of the whole array
        # is ambiguous and raises, so a single timestep is selected first.
        probabilities = model.predict(padded, verbose=0)

        # Read the distribution at the last non-padding position. Sequences
        # longer than max_sequence_len are truncated by pad_sequences, hence
        # the clamp; an empty sequence falls back to position 0.
        # NOTE(review): assumes the prediction at the last real token is the
        # intended "next word" -- confirm against how the model was trained.
        last_position = min(max(len(token_ids), 1), max_sequence_len) - 1
        predicted_index = int(np.argmax(probabilities[0, last_position]))

        # index_word is the tokenizer's reverse mapping; it replaces a linear
        # scan over word_index.
        output_word = tokenizer.index_word.get(predicted_index, "")
        if not output_word:
            break
        seed_text += " " + output_word
    return seed_text

# Demo: extend a short prompt by ten predicted words and show the result
seed_text = "I feel so alone"
next_words = 10
generated_text = generate_text(
    seed_text=seed_text,
    next_words=next_words,
    max_sequence_len=max_sequence_length,
)
print(generated_text)

In [None]:
import pickle

# Persist the trained network to an HDF5 file
model.save('mental_health_consultant_model.h5')

# Persist the fitted tokenizer alongside it, using the newest pickle protocol
with open('tokenizer.pkl', 'wb') as handle:
    pickle.Pickler(handle, protocol=pickle.HIGHEST_PROTOCOL).dump(tokenizer)

In [None]:
# Both imports live in this cell so it also works in a fresh kernel, where the
# cell that saved the artifacts (and imported pickle) has not been run.
import pickle
from tensorflow.keras.models import load_model

# Load the model
loaded_model = load_model('mental_health_consultant_model.h5')

# Load the tokenizer
# NOTE: pickle.load can execute arbitrary code while deserializing; only open
# tokenizer files that you created yourself or otherwise trust.
with open('tokenizer.pkl', 'rb') as handle:
    loaded_tokenizer = pickle.load(handle)