In [1]:
import pandas as pd
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import nltk#for text wrangling
import spacy#for text wrangling
import re
import contractions
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim


In [2]:
age = 'age'
s = 'statement'
status = 'status'

In [3]:
df_old = pd.read_csv("sentiments.csv")
df_old.isnull().sum()

Unnamed: 0      0
statement     362
status          0
dtype: int64

In [4]:
#so we have 362 entries where the statements are null. Now as these are text values their imputation is not possible
df_new = df_old.copy()
df_new.dropna(inplace=True)

In [5]:
df_new['status'].value_counts()

status
Normal                  16343
Depression              15404
Suicidal                10652
Anxiety                  3841
Bipolar                  2777
Stress                   2587
Personality disorder     1077
Name: count, dtype: int64

In [6]:
#now as we have to train the model on the textual data ie the statements but well we can not input words directly to our model
#for that firstly we will do some preprocessing on the text and then convert the words into some vectors.


In [12]:
X = df_new['statement']#this will be the input to the model
Y = df_new['status']#this is the labels for all our inputs
type(X)

pandas.core.series.Series

In [13]:
#text wrangling
#step 1 lowering all the words to ensure same words with different cases are not considered different
X = X.str.lower()#converted the data to string and then applied lower case conversion method
X

0                                               oh my gosh
1        trouble sleeping, confused mind, restless hear...
2        all wrong, back off dear, forward doubt. stay ...
3        i've shifted my focus to something else but i'...
4        i'm restless and restless, it's been a month n...
                               ...                        
53038    nobody takes me seriously i’ve (24m) dealt wit...
53039    selfishness  "i don't feel very good, it's lik...
53040    is there any way to sleep better? i can't slee...
53041    public speaking tips? hi, all. i have to give ...
53042    i have really bad door anxiety! it's not about...
Name: statement, Length: 52681, dtype: object

In [14]:
Y = Y.str.lower()
Y

0        anxiety
1        anxiety
2        anxiety
3        anxiety
4        anxiety
          ...   
53038    anxiety
53039    anxiety
53040    anxiety
53041    anxiety
53042    anxiety
Name: status, Length: 52681, dtype: object

In [8]:
import unicodedata
#step 2
def remove_accented_chars(text):
    #with this function we will remove the accented characters and replace them with their english letter
    #here we are considering that all this would have happened by error and not meant for dealing with other
    #languages.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    #NFKD stand for Normalization Form Compatibility Decomposition which is a unicode normalization method.
    #what this line does is it decomposes the accented characters lets say é to e and "´"
    #then we encode this using ascii which ignores non ascii characters like "´"
    #finally we decode the ascii encodings to strings using utf-8
    return text

In [None]:
X = X.apply(remove_accented_chars)

In [None]:
X

In [10]:
#now we generally remove numbers. As we can see in our data the numbers are used for various purposes
#rarely for ages sometimes for a times period like say 20's
#we can see that removing them is good for our model
#but as this is a simpler model we are removing all numbers well in further devs it is important to use some
#context for example in GPT-4 4 should not be removed but for our purposes it is well and okay
#step 3
def remove_special_characters(text):
    pattern = r'[^a-zA-Z\s]' #so anything that is not a letter or a space is removed
    text = re.sub(pattern, '', text)
    return text

In [None]:
X=X.apply(remove_special_characters)

In [None]:
#next we want to expand contractions
#don't will be do not and so on for the entire dataset
contractions.contractions_dict.items()
#so what contractions do is it has a dictionary contractions_dict where the key is the contraction
#and the value is expansion of that contraction so nothing really fancy about how it works
#now using this dictionary one word by another and stuff that's boring right. well!
#contractions provide a function fix for that same purpose

In [None]:
#step 4
X = X.apply(contractions.fix)

In [None]:
#here in our case we are not doing things like pos tagging or chunking
#well we are really not interested in the grammar
#there might be a reason to do this if we might reach to a conclusion that depressed or other people might
#write smaller text use more adjectives etc. Well not really. We will use a neural network that will deal
#with the semantics rather than these syntax things which might not add much to predictions

In [None]:
#step 5 stemming or lemmatization
#well the words depressed depression and depressing has the same root word that we care about
#in our dataset they might be treated differently
#and we want to preserve the relations inspite of the morphological differences
#we can also perform stemming here but stemming is not as accurate as lemmatization
#lemmatization can be slower than stemming


In [None]:
# Load spaCy's large English model
nlp = spacy.load("en_core_web_lg")
def lemmatize_with_spacy(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if token.lemma_.isalpha()])

In [None]:
X = X.apply(lemmatize_with_spacy)

In [None]:
STOPWORDS = set(stopwords.words('english'))
NEGATION_WORDS = {"not", "no", "never", "none", "nor", "nothing", "nowhere", "hardly", "scarcely", "barely"}

# Remove stopwords but keep negation words
def remove_stopwords(text):
    """Remove stopwords except for important negation words"""
    return " ".join([word for word in str(text).split() if word.lower() not in STOPWORDS or word.lower() in NEGATION_WORDS])

In [None]:
X = X.apply(remove_stopwords)
X

In [None]:

#step 6 that is tokenization
# Apply tokenization only
X = X.astype(str).apply(lambda x: [token.text for token in nlp.tokenizer(x)])

In [None]:
X

In [15]:
import re
import contractions
import spacy

nlp = spacy.load("en_core_web_sm")

def clean_text(text):
    text = text.lower()
    text = contractions.fix(text)
    text = re.sub(r"[^a-zA-Z\s]", " ", text)  # remove special chars
    text = re.sub(r"\s+", " ", text)  # normalize whitespace
    doc = nlp(text)

    # Use token.text instead of token.lemma_ to avoid lemmatization
    tokens = [token.text for token in doc if not token.is_punct and not token.is_space]

    return tokens

X = X.apply(clean_text)


In [None]:
'''
from collections import Counter

def vocabulary_indices(tokens, min_freq=1):
    # Count word frequencies
    word_counts = Counter(tokens)

    # Filter words that meet the minimum frequency threshold
    vocabulary = {word for word, count in word_counts.items() if count >= min_freq}

    # Sort for consistent indexing
    sorted_vocab = sorted(vocabulary)  # Alphabetical order (can also sort by frequency)

    # Create mappings
    word2index = {word: i for i, word in enumerate(sorted_vocab)}
    index2word = {i: word for i, word in enumerate(sorted_vocab)}

    return vocabulary, word2index, index2word
'''

In [None]:
#hyperparameters:
#window_size = 4

In [None]:
'''
# Generate context pairs for each sentence (flattened list)
all_context_pairs = X.apply(lambda tokens: generate_context_pairs(tokens, window_size))

# Ensure we remove any None or empty lists
context_pairs = [pair for sublist in all_context_pairs.dropna() for pair in sublist]

# Ensure X is fully tokenized before building vocab
if isinstance(X.iloc[0], str):  # If X contains raw text instead of tokenized words
    raise ValueError("X contains raw text; it must be tokenized into lists of words first.")

# Collect all words for vocabulary (with repetitions)
all_vocab = [word for tokens in X for word in tokens]

# Build vocabulary
vocab, w2i, i2w = vocabulary_indices(all_vocab)
'''

In [None]:
'''unknown_idx = w2i.get("<UNK>", 0)  # Fallback index for unknown words
X_training = [(w2i.get(target, unknown_idx), w2i.get(context, unknown_idx)) for target, context in context_pairs]'''

In [None]:
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, learning_rate, epochs, batch_size, neg_samples=5):
        super(SkipGramModel, self).__init__()
        self.V = vocab_size  # Vocabulary size
        self.N = embedding_dim  # Embedding dimension
        self.lr = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.neg_samples = neg_samples

        # Word embeddings
        self.in_embed = nn.Embedding(vocab_size, embedding_dim)
        self.out_embed = nn.Embedding(vocab_size, embedding_dim)

        # Initialize embeddings using uniform distribution
        self.in_embed.weight.data.uniform_(-0.5 / embedding_dim, 0.5 / embedding_dim)
        self.out_embed.weight.data.uniform_(-0.5 / embedding_dim, 0.5 / embedding_dim)

        # Optimizer & learning rate scheduler
        self.optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        self.scheduler = ReduceLROnPlateau(self.optimizer, 'min', patience=2, factor=0.5)

    def forward(self, target, context, neg_context):
        """
        Compute the SkipGram loss using:
        - Positive samples (target, context)
        - Negative samples (target, neg_context)
        """
        # Positive pairs
        target_embeds = self.in_embed(target)  # (batch_size, embedding_dim)
        context_embeds = self.out_embed(context)  # (batch_size, embedding_dim)
        pos_score = torch.mul(target_embeds, context_embeds).sum(dim=1)
        pos_loss = F.logsigmoid(pos_score).squeeze()

        # Negative pairs
        neg_context_embeds = self.out_embed(neg_context)  # (batch_size, neg_samples, embedding_dim)
        neg_score = torch.bmm(neg_context_embeds, target_embeds.unsqueeze(2)).squeeze()
        neg_loss = F.logsigmoid(-neg_score).sum(dim=1)  # Sum over negative samples

        return -(pos_loss + neg_loss).mean()  # Negative log likelihood

    def train_model(self, dataset):
        """
        Train the SkipGram model using the given dataset.
        """
        data_loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)
        device = self.get_best_device()
        self.to(device)
        print(f"🚀 Model initialized on: {device}")

        for epoch in range(self.epochs):
            total_loss = 0
            print(f"Epoch {epoch+1}/{self.epochs} on {device}")

            for batch_idx, (target_batch, context_batch, neg_batch) in enumerate(data_loader):
                target_batch = target_batch.to(device)
                context_batch = context_batch.to(device)
                neg_batch = neg_batch.to(device)  # Negative samples already computed

                self.optimizer.zero_grad()
                loss = self.forward(target_batch, context_batch, neg_batch)
                loss.backward()
                self.optimizer.step()

                total_loss += loss.item()

                if batch_idx % 100 == 0:
                    current_lr = self.optimizer.param_groups[0]['lr']
                if batch_idx%10000==0:
                    print(f"  Batch {batch_idx}: Loss = {loss.item():.4f}, LR = {current_lr:.6f}")

            avg_loss = total_loss / len(data_loader)
            self.scheduler.step(avg_loss)
            print(f"✅ Epoch {epoch+1}/{self.epochs}, Avg Loss: {avg_loss:.4f}")

    @staticmethod
    def get_best_device():
        """
        Automatically picks the best available device (CUDA → MPS → CPU).
        """
        if torch.cuda.is_available():
            return torch.device("cuda")
        elif torch.backends.mps.is_available():
            return torch.device("mps")
        else:
            return torch.device("cpu")
'''

In [None]:
'''
def prepare_dataset(X_training, vocab_size, neg_samples=5):
    print("🔹 Preparing dataset with negative sampling...")
    print(f"  - Total training pairs: {len(X_training)}")
    print(f"  - Vocabulary size: {vocab_size}")
    print(f"  - Negative samples per word: {neg_samples}")

    targets, contexts = zip(*X_training)
    targets = torch.tensor(targets, dtype=torch.long)
    contexts = torch.tensor(contexts, dtype=torch.long)

    all_vocab = list(range(vocab_size))  # List of all words in vocabulary

    neg_contexts = []
    for i in range(len(targets)):
        negative_samples = []
        while len(negative_samples) < neg_samples:
            neg_word = random.choice(all_vocab)
            if neg_word != contexts[i].item():  # Ensure it's not the actual context
                negative_samples.append(neg_word)
        
        neg_contexts.append(torch.tensor(negative_samples, dtype=torch.long))
        
        

    neg_contexts = torch.stack(neg_contexts)

    print("✅ Dataset preparation complete!\n")
    return TensorDataset(targets, contexts, neg_contexts)
dataset = prepare_dataset(X_training, vocab_size, neg_samples=10)
# Example usage:
# X_training = [(target_word, context_word), ...]
# vocab_size = total number of unique words
# dataset = prepare_dataset(X_training, vocab_size, neg_samples=5)
'''

In [None]:
'''
embedding_dim = 150  # You can reduce this to 50-300 typically
learning_rate = 0.001  # Reduced from 0.01
epochs = 6
batch_size = 64
neg_samples = 10  # Number of negative samples per positive sample

model = SkipGramModel(vocab_size, embedding_dim, learning_rate, epochs, batch_size, neg_samples)
'''


In [None]:
'''
model.train_model(dataset)

In [None]:
'''def get_similar_words(model, word, word_to_index, index_to_word, top_n=5):
    """ Get top N most similar words based on cosine similarity """
    word_index = word_to_index.get(word, None)
    if word_index is None:
        print(f"Word '{word}' not in vocabulary.")
        return []

    word_embedding = model.in_embed.weight[word_index].detach().cpu()
    embeddings = model.in_embed.weight.detach().cpu()

    # Compute cosine similarity
    similarities = torch.cosine_similarity(word_embedding.unsqueeze(0), embeddings, dim=1)
    closest_indices = similarities.argsort(descending=True)[1:top_n+1]  # Ignore itself

    similar_words = [index_to_word[idx.item()] for idx in closest_indices]
    return similar_words'''

In [16]:
import logging
from gensim.models import Word2Vec

# Enable logging
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)
tokenized_sentences = X  # This should be a list of lists of tokens

# Train Word2Vec without bigrams
w2v_model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=100,
    window=5,
    sg=1,
    min_count=3,
    workers=8,
    epochs=30
)



2025-04-09 16:07:48,380 : INFO : collecting all words and their counts
2025-04-09 16:07:48,381 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-04-09 16:07:48,437 : INFO : PROGRESS: at sentence #10000, processed 604345 words, keeping 20360 word types
2025-04-09 16:07:48,580 : INFO : PROGRESS: at sentence #20000, processed 2299761 words, keeping 33109 word types
2025-04-09 16:07:48,697 : INFO : PROGRESS: at sentence #30000, processed 3755918 words, keeping 40969 word types
2025-04-09 16:07:48,767 : INFO : PROGRESS: at sentence #40000, processed 4612728 words, keeping 46690 word types
2025-04-09 16:07:48,846 : INFO : PROGRESS: at sentence #50000, processed 5596126 words, keeping 55172 word types
2025-04-09 16:07:48,883 : INFO : collected 56239 word types from a corpus of 6067715 raw words and 52681 sentences
2025-04-09 16:07:48,883 : INFO : Creating a fresh vocabulary
2025-04-09 16:07:48,909 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=3 r

In [17]:
print(w2v_model.wv.most_similar("lonely"))
print(w2v_model.wv.most_similar(positive=["anxiety", "support"], negative=["pain"]))
print(w2v_model.wv.similarity("therapy", "meditation"))
print(w2v_model.wv.doesnt_match(["depression", "anxiety", "happiness", "ocd"]))


[('alone', 0.7591335773468018), ('sad', 0.707220733165741), ('depressed', 0.7047163248062134), ('miserable', 0.6860513687133789), ('empty', 0.6572108268737793), ('hopeless', 0.6511791348457336), ('isolated', 0.6439523100852966), ('deppresed', 0.6405414938926697), ('unloved', 0.6365382671356201), ('downtrodden', 0.6357808709144592)]
[('community', 0.5609440207481384), ('expanded', 0.5491969585418701), ('dss', 0.5194636583328247), ('health', 0.5176531076431274), ('realtime', 0.5124025344848633), ('partnered', 0.5076718926429749), ('network', 0.5042071342468262), ('services', 0.5002152323722839), ('selectivity', 0.4987241327762604), ('professional', 0.4979346990585327)]
0.5070648
happiness


In [18]:
# Reserve 0 for <PAD>, 1 for <UNK>
word2idx = {"<PAD>": 0, "<UNK>": 1}

# Add all words from Word2Vec vocab starting from index 2
word2idx.update({word: idx + 2 for idx, word in enumerate(w2v_model.wv.index_to_key)})


In [20]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(Y)


In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [55]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import torch
device = ('mps' if torch.backends.mps.is_available else 'cpu')
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(y_train),
                                     y=y_train)

weights = torch.tensor(class_weights, dtype=torch.float).to(device)

In [23]:
device

NameError: name 'device' is not defined

In [56]:
embedding_dim = w2v_model.vector_size

embedding_matrix = np.zeros((len(word2idx), embedding_dim))
for word, idx in word2idx.items():
     
     if word in w2v_model.wv:
          embedding_matrix[idx] = w2v_model.wv[word]

def encode_sentence(tokens, word2idx):
    return [word2idx.get(token, 1) for token in tokens]


In [57]:
from torch.nn.utils.rnn import pad_sequence
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

MAX_LEN = 50

# --- Modify Dataset ---
class MentalHealthDataset(Dataset):
    def __init__(self, X, y, word2idx): 
        self.X = [torch.tensor(encode_sentence(x, word2idx),dtype=torch.long) for x in X]
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# --- Define collate_fn ---
def collate_batch(batch):
    label_list, text_list = [], []
    for (_text, _label) in batch:
        label_list.append(_label)
        processed_text = _text[:MAX_LEN]
        text_list.append(processed_text)
    padded_texts = pad_sequence(text_list, batch_first=True, padding_value=0).long()

    labels = torch.tensor(label_list, dtype=torch.long)
    return padded_texts, labels


train_dataset = MentalHealthDataset(X_train.tolist(), y_train, word2idx)
val_dataset = MentalHealthDataset(X_test.tolist(), y_test, word2idx)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_dataset, batch_size=64, collate_fn=collate_batch)

In [None]:
import torch.nn as nn

class BiLSTMClassifier(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim):
        super(BiLSTMClassifier, self).__init__()
        embedding_dim = embedding_matrix.shape[1]
        
        self.embedding = nn.Embedding.from_pretrained(
            torch.FloatTensor(embedding_matrix),
            freeze=False 
        )
        
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
            dropout=0.3
        )
        
        self.fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(hidden_dim * 2, output_dim)
        )

    def forward(self, x):
            embedded = self.embedding(x)
            lstm_out, (hn, cn) = self.lstm(embedded)
           
            final_hidden = torch.cat((hn[-2,:,:], hn[-1,:,:]), dim=1)
            return self.fc(final_hidden) 



In [48]:
import torch.optim as optim
from sklearn.metrics import accuracy_score

device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")

model = BiLSTMClassifier(embedding_matrix, hidden_dim=128, output_dim=len(le.classes_)).to(device)
criterion = nn.CrossEntropyLoss(weight=weights)
optimizer = optim.Adam(model.parameters(), lr=3e-4)

EPOCHS = 20
patience = 3
best_val_loss = float('inf')
patience_counter = 0

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    model.eval()
    val_loss, all_preds, all_targets = 0, [], []
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, targets)
            val_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    val_loss /= len(val_loader)
    val_acc = accuracy_score(all_targets, all_preds)

    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {total_loss/len(train_loader):.4f}")
    print(f"Epoch {epoch+1}/{EPOCHS} - Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), "best_bilstm_model.pt")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("⛔ Early stopping triggered.")
            break


Epoch 1/20 - Loss: 1.3568
Epoch 1/20 - Val Loss: 1.0837 | Val Acc: 0.5885
Epoch 2/20 - Loss: 0.9862
Epoch 2/20 - Val Loss: 0.9575 | Val Acc: 0.6197
Epoch 3/20 - Loss: 0.8505
Epoch 3/20 - Val Loss: 0.8721 | Val Acc: 0.6524
Epoch 4/20 - Loss: 0.7491
Epoch 4/20 - Val Loss: 0.8943 | Val Acc: 0.6349
Epoch 5/20 - Loss: 0.6760
Epoch 5/20 - Val Loss: 0.8387 | Val Acc: 0.7128
Epoch 6/20 - Loss: 0.6087
Epoch 6/20 - Val Loss: 0.8334 | Val Acc: 0.7106
Epoch 7/20 - Loss: 0.5466
Epoch 7/20 - Val Loss: 0.8366 | Val Acc: 0.7212
Epoch 8/20 - Loss: 0.5040
Epoch 8/20 - Val Loss: 0.8807 | Val Acc: 0.7101
Epoch 9/20 - Loss: 0.4526
Epoch 9/20 - Val Loss: 0.8664 | Val Acc: 0.7325
⛔ Early stopping triggered.


In [49]:
from sklearn.metrics import classification_report

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in val_loader:
        inputs = inputs.to(device)
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.numpy())

print(classification_report(all_labels, all_preds, target_names=le.classes_))


                      precision    recall  f1-score   support

             anxiety       0.76      0.82      0.79       755
             bipolar       0.75      0.77      0.76       527
          depression       0.73      0.56      0.64      3016
              normal       0.96      0.89      0.92      3308
personality disorder       0.38      0.69      0.49       237
              stress       0.50      0.55      0.52       536
            suicidal       0.58      0.74      0.65      2158

            accuracy                           0.73     10537
           macro avg       0.66      0.72      0.68     10537
        weighted avg       0.75      0.73      0.74     10537



In [None]:
original_index = X_train.index[0]
print("Original tokens:", X_bigrams[original_index])
print("Encoded tokens:", sample)



In [None]:
print(type(X))                      # List? Series? DataFrame?
print(type(X[1]))                   # List of tokens or something else?
print(X[1][:10])                    # First 10 tokens of the first sentence
print(encode_sentence(X[1], word2idx))

In [35]:
def text_to_sequence(tokens, max_len=50):
    seq = [word2idx.get(token, word2idx["<UNK>"]) for token in tokens]
    if len(seq) < max_len:
        seq += [word2idx["<PAD>"]] * (max_len - len(seq))
    else:
        seq = seq[:max_len]
    return seq


In [58]:
text = "Nope. No, I can't go any further. I am tired every day is a war with myself"
tokens = clean_text(text)
sequence = text_to_sequence(tokens)
input_tensor = torch.tensor(sequence).unsqueeze(0).to(device)

model.eval()
with torch.no_grad():
    output = model(input_tensor)
    prediction = torch.argmax(output, dim=1).item()

predicted_label = le.inverse_transform([prediction])[0]
print("Predicted mental health condition:", predicted_label)


Predicted mental health condition: suicidal
