In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional, Dropout
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Load the dataset from a CSV file (path is relative to the working directory)
# and preview its first rows.
df = pd.read_csv("Combined_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,statement,status
0,0,oh my gosh,Anxiety
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,3,I've shifted my focus to something else but I'...,Anxiety
4,4,"I'm restless and restless, it's been a month n...",Anxiety


In [None]:
# Show how many rows carry each `status` value (the class distribution).
df["status"].value_counts()

Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
Normal,16351
Depression,15404
Suicidal,10653
Anxiety,3888
Bipolar,2877
Stress,2669
Personality disorder,1201


In [None]:
# Drop every row that has a missing value in any column and rebind `df`,
# so all later cells work on the cleaned frame; then show the remaining row count.
df = df.dropna()
len(df)

52681

In [None]:
# ---------------------------------------------------------------------------
# Preprocessing: encode labels, split the data, tokenize and pad the texts
# ---------------------------------------------------------------------------

# Tokenization settings
max_words = 20000  # vocabulary cap; words are kept by frequency
max_len = 1000     # common length every sequence is brought to

# Raw inputs and integer-encoded targets
texts = df['statement'].values
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['status'].values)

# Hold out 20% of the samples for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Build the vocabulary from the training texts only
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Turn each text into a list of integer word indices
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad the index lists so the model receives inputs of uniform size
X_train_padded = pad_sequences(X_train_seq, maxlen=max_len)
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len)

In [None]:
# Build the BiLSTM model
# model parameters
embedding_dim = 128  # embedding layer output dimension
# Size the output layer from the fitted label encoder rather than a hard-coded
# count, so the network always matches the labels actually present in the data.
num_classes = len(label_encoder.classes_)

# Build the model: embedding -> two stacked bidirectional LSTMs (each followed
# by dropout) -> small dense layer -> softmax over the classes.
# NOTE(review): `input_length` is reported as deprecated by Keras 3 — confirm the
# installed Keras version before removing it.
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_len))
# return_sequences=True so the second LSTM receives the full sequence of outputs
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.5))
model.add(Dense(16, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))   # one output unit per class



In [None]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss,
# and accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# train the model
from keras.callbacks import EarlyStopping

# Stop once the validation loss has not improved for 3 consecutive epochs and
# roll back to the best weights seen. `epochs=20` becomes an upper bound rather
# than a fixed schedule, which avoids training deep into overfitting.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# 20% of the training data is set aside by Keras as the validation set.
history = model.fit(
    X_train_padded,
    y_train,
    epochs=20,
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping],
)

Epoch 1/20
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 176ms/step - accuracy: 0.4626 - loss: 1.3713 - val_accuracy: 0.5543 - val_loss: 1.0229
Epoch 2/20
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 168ms/step - accuracy: 0.6232 - loss: 0.8497 - val_accuracy: 0.6608 - val_loss: 0.7987
Epoch 3/20
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 169ms/step - accuracy: 0.6998 - loss: 0.7068 - val_accuracy: 0.6943 - val_loss: 0.7777
Epoch 4/20
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 169ms/step - accuracy: 0.7537 - loss: 0.6142 - val_accuracy: 0.7197 - val_loss: 0.7375
Epoch 5/20
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 169ms/step - accuracy: 0.7947 - loss: 0.5508 - val_accuracy: 0.7173 - val_loss: 0.7826
Epoch 6/20
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 171ms/step - accuracy: 0.8259 - loss: 0.4923 - val_accuracy: 0.7431 - val_loss: 0.7468
Epoch 7/20

In [None]:
# Evaluate the model
# Score the model on the held-out test split; the result is unpacked as the
# loss followed by the accuracy metric requested at compile time.
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 50ms/step - accuracy: 0.7466 - loss: 1.2210
Test Accuracy: 0.7458


In [None]:
# save the model
# Writes the model to a `.h5` file in the working directory; the "Test model"
# section reloads it from the same file name.
# NOTE(review): newer Keras releases recommend the `.keras` format — confirm before switching.
model.save('colab_model_sentiment.h5')



In [None]:
def predict_sentiment(text):
    """Return the predicted status label for a single piece of text.

    The text is converted to word indices with the notebook-level ``tokenizer``,
    padded to ``max_len``, and scored by ``model``. The index of the
    highest-scoring class is then mapped back to its original label through
    ``label_encoder``.
    """
    encoded = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=max_len)
    class_scores = model.predict(encoded)
    best_class = np.argmax(class_scores, axis=1)
    return label_encoder.inverse_transform(best_class)[0]

In [None]:
# Try the classifier on one sample sentence and print the predicted label.
input_text = "I feel like nervous"
predicted_sentiment = predict_sentiment(input_text)
print(f"Predicted Sentiment: {predicted_sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
Predicted Sentiment: Anxiety


# Test model

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from sklearn.preprocessing import LabelEncoder

In [None]:
# Take the `status` column of the cleaned frame as an array of raw label values.
labels = df['status'].values

In [None]:
# Tokenize and Pad the sequences

# parameters for tokenization
max_words = 20000 # max num of words to keep, based on word frequency
max_len = 1000 # max length of all sequences

# Tokenizer
# A freshly constructed Tokenizer has an empty vocabulary, so without fitting,
# every input text maps to an empty sequence and the model only ever sees
# all-zero padding. Fit it on the same training texts used to train the model
# so the word indices match the ones the embedding layer learned.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Encode straight from the dataframe column so the cell can be re-run safely
# (re-fitting on already-encoded integers would lose the original class names).
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['status'].values)


In [None]:
# Display the labels after integer encoding.
labels

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Reload the model from the file written by the earlier `model.save` call,
# replacing the in-memory `model`.
model = load_model('colab_model_sentiment.h5')



In [None]:
def predict_sentiment(text):
    """Return the predicted status label for a single piece of text.

    The text is converted to word indices with the notebook-level ``tokenizer``,
    padded to ``max_len``, and scored by ``model``. The index of the
    highest-scoring class is then mapped back to its original label through
    ``label_encoder``.
    """
    encoded = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=max_len)
    class_scores = model.predict(encoded)
    best_class = np.argmax(class_scores, axis=1)
    return label_encoder.inverse_transform(best_class)[0]

In [None]:
# Run the reloaded model on one sample sentence and print the predicted label.
input_text = "I feel happy "
predicted_sentiment = predict_sentiment(input_text)
print(f"Predicted Sentiment: {predicted_sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
Predicted Sentiment: Normal


# Metrics

In [None]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Tune a Bernoulli Naive Bayes baseline: grid-search the smoothing parameter `alpha`
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline

bnb = BernoulliNB()

# BernoulliNB models binary features. Fed the padded word-index sequences it
# would binarize each position at 0 and effectively learn only "is this
# position padding?". Feed it binary word-presence vectors built from the raw
# training texts instead. Vectorizing inside the pipeline means the vocabulary
# is re-fit on the training portion of every cross-validation fold.
bnb_pipeline = Pipeline([
    ("vectorizer", CountVectorizer(binary=True, max_features=max_words)),
    ("bnb", bnb),
])

# Define the parameter grid (`<step>__<param>` addresses a pipeline step's parameter)
param_grid = {
    'bnb__alpha': [0.1, 0.5, 1.0, 5.0, 10.0]
}

# 5-fold cross-validation, selecting the candidate with the best accuracy
grid_search = GridSearchCV(estimator=bnb_pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, y_train)

In [None]:
from sklearn.metrics import f1_score, classification_report
import numpy as np

# Hand-written example labels used to demonstrate the metric calls.
# These are NOT the trained model's predictions.
class_names = ["Normal", "Suicidal", "Stress"]
y_true = np.array([0, 1, 2, 2, 0, 1, 2, 1, 0, 2, 1])  # ground truth
y_pred = np.array([0, 1, 2, 0, 0, 1, 1, 1, 0, 2, 2])  # predictions

# F1 averaged across classes, each class weighted by its support
weighted_f1 = f1_score(y_true, y_pred, average='weighted')
print(f"Weighted-Averaged F1-Score: {weighted_f1:.4f}")

# Per-class precision / recall / F1 breakdown
report = classification_report(y_true, y_pred, target_names=class_names)
print("\nClassification Report:")
print(report)


Weighted-Averaged F1-Score: 0.7143

Classification Report:
              precision    recall  f1-score   support

      Normal       0.75      1.00      0.86         3
    Suicidal       0.75      0.75      0.75         4
      Stress       0.67      0.50      0.57         4

    accuracy                           0.73        11
   macro avg       0.72      0.75      0.73        11
weighted avg       0.72      0.73      0.71        11

