In [24]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
from openpyxl.styles.builtins import output
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from torchvision import datasets
from torchvision.transforms import ToTensor
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import torchvision
import torchvision.transforms as transforms
import torchvision.models as models

In [49]:
# Directory probed in the next cell for a previously saved fine-tuned model.
path = "../model_bert_mental2"


In [51]:
import os

# Report whether something already exists at `path` (the saved-model directory).
print("Model already exists" if os.path.exists(path) else "Model does not exist")


Model already exists


In [25]:
# Load the raw dataset from the CSV in the current working directory.
df = pd.read_csv('Combined Data.csv')
print(df.shape)  # (rows, columns) of the raw frame
df.head()

(53043, 3)


Unnamed: 0.1,Unnamed: 0,statement,status
0,0,oh my gosh,Anxiety
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,3,I've shifted my focus to something else but I'...,Anxiety
4,4,"I'm restless and restless, it's been a month n...",Anxiety


In [26]:
# Keep only the text ('statement') and label ('status') columns; every other column is dropped.
df = df[['statement', 'status']]

In [27]:
df.head()

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety


In [28]:
df['status'].unique()

array(['Anxiety', 'Normal', 'Depression', 'Suicidal', 'Stress', 'Bipolar',
       'Personality disorder'], dtype=object)

In [29]:
df.isnull().sum()

statement    362
status         0
dtype: int64

In [30]:
# Discard every row that has a missing value in any remaining column.
df = df.dropna()

In [31]:
df.isnull().sum()

statement    0
status       0
dtype: int64

In [32]:
df.duplicated().sum()

1588

In [33]:
# Remove fully duplicated rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

In [34]:
df.duplicated().sum()

0

In [35]:
# Draw a 6000-row random subset of the cleaned frame and renumber its index.
# random_state pins the draw: without it every kernel restart yields a
# different subset, so the class counts, the resampling and the train/test
# split computed from df2 would all change from run to run.
df2 = df.sample(6000, random_state=42).reset_index(drop=True)
df2.shape

(6000, 2)

In [36]:
df2['status'].value_counts()

status
Normal                  1889
Depression              1805
Suicidal                1232
Anxiety                  420
Stress                   272
Bipolar                  271
Personality disorder     111
Name: count, dtype: int64

In [37]:
from imblearn.over_sampling import RandomOverSampler

# Balance the classes with a seeded RandomOverSampler. The features are passed
# as a one-column DataFrame (hence the double brackets around 'statement') and
# the labels as a Series.
# NOTE(review): this runs before the train/test split further down, so copies
# of one statement can land on both sides of that split unless the split keeps
# identical statements together -- confirm the evaluation accounts for this.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(df2[['statement']], df2['status'])



In [38]:
# Re-join the resampled text column and labels side by side (column-wise concat).
df_resambled = pd.concat([X_resampled, y_resampled], axis=1)
df_resambled.head()

Unnamed: 0,statement,status
0,"I am complete failure ,I am ugly as hell I am ...",Suicidal
1,Website blockers help a lot I block all search...,Anxiety
2,Being in a relationship as a bipolar person is...,Bipolar
3,the car isn't here.,Normal
4,I'm not gay I'm just GAY :D,Normal


In [39]:
df_resambled['status'].value_counts()

status
Suicidal                1889
Anxiety                 1889
Bipolar                 1889
Normal                  1889
Depression              1889
Personality disorder    1889
Stress                  1889
Name: count, dtype: int64

In [40]:
from sklearn.preprocessing import LabelEncoder

# Replace the string statuses with integer ids, overwriting the 'status' column.
# The fitted encoder is kept because later cells map ids back to names through
# label_encoder.classes_ and label_encoder.inverse_transform.
# NOTE(review): the cell is not idempotent -- running it a second time would fit
# the encoder on the already-encoded integers instead of the status names.
label_encoder = LabelEncoder()
df_resambled['status'] = label_encoder.fit_transform(df_resambled['status'])
df_resambled.head()

Unnamed: 0,statement,status
0,"I am complete failure ,I am ugly as hell I am ...",6
1,Website blockers help a lot I block all search...,0
2,Being in a relationship as a bipolar person is...,1
3,the car isn't here.,3
4,I'm not gay I'm just GAY :D,3


In [41]:
df_resambled['status'].value_counts()

status
6    1889
0    1889
1    1889
3    1889
2    1889
4    1889
5    1889
Name: count, dtype: int64

In [42]:
from joblib import dump
# Persist the fitted encoder to 'label_encoder.joblib' in the working directory.
dump(label_encoder, 'label_encoder.joblib')

['label_encoder.joblib']

In [43]:
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize
import string

# Fetch the NLTK resources this cell relies on. 'stopwords' backs the
# stopwords.words('english') call below; 'punkt_tab' is presumably the
# tokenizer data needed by word_tokenize -- TODO confirm.
nltk.download('punkt_tab')
nltk.download('stopwords')

def cleaned_text(text):
    """Lower-case ``text`` and keep only alphanumeric, non-stop-word tokens.

    The text is lower-cased and tokenised with NLTK's ``word_tokenize``. A
    token survives only if it is not an English stop word and consists
    entirely of alphanumeric characters, so punctuation tokens are dropped.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        nothing survives.
    """
    # Build the stop-word set once and cache it on the function object, so a
    # DataFrame-wide .apply does not re-read the corpus for every row.
    stop_words = getattr(cleaned_text, "_stop_words", None)
    if stop_words is None:
        stop_words = cleaned_text._stop_words = frozenset(stopwords.words('english'))

    tokens = word_tokenize(text.lower())
    # str.isalnum() already rejects punctuation-only tokens. A bare
    # `string.punctuation` operand in the filter is a non-empty string and
    # therefore always true, so it is not part of the condition.
    kept = [word for word in tokens if word not in stop_words and word.isalnum()]
    return " ".join(kept)

# Smoke-test the cleaner on a noisy sample; the result is displayed as the cell output.
text = "I am learning NLP ### AA @@@ !!! , any one can HEPL ME OUT ??? "
cleaned_text(text)

[nltk_data] Downloading package punkt_tab to /Users/ricky/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ricky/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'learning nlp aa one hepl'

In [44]:
# Targets are the integer-encoded statuses; features are the statements after
# each one has been passed through cleaned_text.
y = df_resambled['status']
X = df_resambled['statement'].map(cleaned_text)

In [45]:
# The frame was randomly oversampled earlier in the notebook, so the same
# statement occurs many times. A plain row-wise split would place copies of one
# text in both the training and the test set and inflate every test metric.
# Grouping by the (cleaned) statement keeps all copies of a text on one side.
# Note that test_size=0.2 now refers to 20% of the distinct statements, so the
# share of test rows is only approximately 20%.
from sklearn.model_selection import GroupShuffleSplit

splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(splitter.split(X, y, groups=X))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

In [46]:
from transformers import BertTokenizer, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Encode both splits with the 'bert-base-uncased' tokenizer. Padding and
# truncation are switched on and sequences are capped at 128 tokens.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_tokenizer = tokenizer(
    X_train.tolist(),
    max_length=128,
    truncation=True,
    padding=True,
)
test_tokenizer = tokenizer(
    X_test.tolist(),
    max_length=128,
    truncation=True,
    padding=True,
)




In [47]:
from datasets import Dataset

# Wrap each split's token ids, attention masks and integer labels in a Dataset.
train_dataset = Dataset.from_dict(
    {
        'input_ids': train_tokenizer['input_ids'],
        'attention_mask': train_tokenizer['attention_mask'],
        'labels': y_train.tolist(),
    }
)
test_dataset = Dataset.from_dict(
    {
        'input_ids': test_tokenizer['input_ids'],
        'attention_mask': test_tokenizer['attention_mask'],
        'labels': y_test.tolist(),
    }
)

In [48]:
# Use a CUDA GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cpu')

In [None]:
# One output class per distinct encoded status.
num_labels = df_resambled['status'].nunique()
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
)

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,                    # log every 10 steps
    save_total_limit=3,                  # keep at most 3 checkpoints on disk
    # Evaluate and checkpoint on the same per-epoch schedule so the checkpoint
    # with the best eval_loss can be reloaded when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # Optimisation schedule: linear scheduler with 500 warmup steps.
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_steps=500,
    weight_decay=0.01,
    num_train_epochs=5,                  # number of training epochs
    # Batch sizing: 16 examples per device step, with gradients accumulated
    # over 2 steps to simulate a larger batch if memory is limited.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
)

# The model is moved to `device` as it is handed to the Trainer; the test split
# is used as the evaluation set.
trainer = Trainer(
    model=model.to(device),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

In [54]:
from sklearn.metrics import classification_report, confusion_matrix

# Run the trainer's model over the test set; the third returned value is unused.
predictions, labels, _ = trainer.predict(test_dataset)
# The highest-scoring column in each row becomes the predicted class id
# (assumes `predictions` is a 2-D score array of shape (rows, classes) -- TODO confirm).
predictions_label = np.argmax(predictions, axis=1)
# Print the per-class report, labelled with the original status names.
print(classification_report(labels, predictions_label, target_names= label_encoder.classes_))


In [None]:
# Confusion matrix of true vs. predicted class ids, drawn as an annotated
# heatmap whose ticks carry the original status names.
cm = confusion_matrix(labels, predictions_label)
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

In [None]:
# trainer.save_model('/content/drive/MyDrive/Mental project/model_bert_mental2')
# tokenizer.save_pretrained('/content/drive/MyDrive/Mental project/model_bert_mental')

In [55]:
# Load the fine-tuned classifier from the saved directory (the same location
# stored in `path` at the top of the notebook) and move it to `device`.
# This rebinds `model`, replacing the instance that was created for training.
model = AutoModelForSequenceClassification.from_pretrained("../model_bert_mental2").to(device)
# tokenizer = AutoTokenizer.from_pretrained('/content/drive/MyDrive/Mental project/model_bert_mental')
# The stock 'bert-base-uncased' tokenizer is loaded rather than a saved copy;
# it is the same checkpoint name that was used to tokenise the training data.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [None]:
model

In [56]:
def predict_sentiment(text):
    """Classify one piece of text and return its predicted status name.

    The text goes through the same ``cleaned_text`` preprocessing that was
    applied to the training data, is tokenised (truncated to 128 tokens), and
    is scored by the module-level ``model``. The highest-scoring class id is
    mapped back to its label with ``label_encoder``.

    Despite the function name, the returned value is one of the classes the
    label encoder was fitted on (the ``status`` column), not a sentiment.

    Args:
        text: Raw input string.

    Returns:
        The predicted class label as produced by
        ``label_encoder.inverse_transform``.
    """
    text_cleaned = cleaned_text(text)
    inputs = tokenizer(text_cleaned, padding=True, truncation=True, max_length=128, return_tensors="pt")
    # Move input tensors to the same device as the model.
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Inference only: force eval mode so dropout cannot randomise the result,
    # and skip autograd bookkeeping, which is wasted work and memory here.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_class = torch.argmax(logits, dim=1).item()
    return label_encoder.inverse_transform([predicted_class])[0]


# Hand-written probe sentences used to eyeball the classifier's behaviour.
sample_texts = [
    "I feel perfectly fine today, nothing to worry about.",
    "I can't stop thinking about what will happen if things go wrong.",
    "Lately, I've been on a high, feeling like I can do anything!",
    "I'm so sad, I just can't seem to get out of bed anymore.",
    "I'm constantly thinking about how much better everyone else is doing than me.",
    "I don't think I can keep going, everything feels so hopeless.",
    "I had a really good day, spent some time with my friends.",
    "I'm overwhelmed by the idea that I might lose everything.",
    "I feel like nothing matters anymore, I just want to give up.",
    "I'm okay today, but sometimes I get really anxious for no reason."
]

# Print each probe next to the label returned by predict_sentiment.
for text in sample_texts:
    predicted_sentiment = predict_sentiment(text)
    print(f"Text: {text}\nPredicted Sentiment: {predicted_sentiment}\n")

Text: I feel perfectly fine today, nothing to worry about.
Predicted Sentiment: Anxiety

Text: I can't stop thinking about what will happen if things go wrong.
Predicted Sentiment: Anxiety

Text: Lately, I've been on a high, feeling like I can do anything!
Predicted Sentiment: Depression

Text: I'm so sad, I just can't seem to get out of bed anymore.
Predicted Sentiment: Normal

Text: I'm constantly thinking about how much better everyone else is doing than me.
Predicted Sentiment: Normal

Text: I don't think I can keep going, everything feels so hopeless.
Predicted Sentiment: Suicidal

Text: I had a really good day, spent some time with my friends.
Predicted Sentiment: Normal

Text: I'm overwhelmed by the idea that I might lose everything.
Predicted Sentiment: Depression

Text: I feel like nothing matters anymore, I just want to give up.
Predicted Sentiment: Suicidal

Text: I'm okay today, but sometimes I get really anxious for no reason.
Predicted Sentiment: Anxiety

