#Importing

In [2]:
!pip install datasets




In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import DataLoader
from datasets import Dataset    #dataset need csv to transformers form so that neededd DataLoader,Dataset

from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np



#load dataset

In [None]:
data = pd.read_csv('/content/Combined Data.csv')
data.dropna(inplace=True)
data.drop(columns=['Unnamed: 0'],axis=1, inplace=True)

In [None]:
data

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety
...,...,...
53038,Nobody takes me seriously I’ve (24M) dealt wit...,Anxiety
53039,"selfishness ""I don't feel very good, it's lik...",Anxiety
53040,Is there any way to sleep better? I can't slee...,Anxiety
53041,"Public speaking tips? Hi, all. I have to give ...",Anxiety


In [None]:
data = data.sample(n=5000, random_state=42).reset_index(drop=True)

In [None]:
data.shape

(5000, 2)

#Data Preprocessing

In [None]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def clean_statement(text):
  text = text.lower()
  text = re.sub(r"[^a-zA-Z\s]",'',text)

  words = text.split()
  words = [word for word in words if word not in stop_words]

  return " ".join(words)

data['clean'] = data['statement'].apply(clean_statement)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
data.head()

Unnamed: 0,statement,status,clean
0,I'm lazy to complain about it ba ihh,Normal,im lazy complain ba ihh
1,i think the wifi on my iphone is broken it wil...,Normal,think wifi iphone broken quot connect quot act...
2,Good tracking apps? I've been trying to find a...,Bipolar,good tracking apps ive trying find apps track ...
3,I have recently looked into reddit and found t...,Depression,recently looked reddit found place actually qu...
4,that's your favorite thing to do?,Normal,thats favorite thing


In [None]:
data.drop(columns=['statement'],inplace=True)

In [None]:
data['status'].value_counts()

Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
Normal,1576
Depression,1437
Suicidal,1029
Anxiety,349
Bipolar,255
Stress,241
Personality disorder,113


# Balance Dataset

In [None]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(sampling_strategy='auto', random_state=42)

x = data.drop(columns=['status'])
y = data['status']

x_resampled, y_resampled = ros.fit_resample(x, y)

data = pd.concat([x_resampled, y_resampled], axis=1)


In [None]:
data.head()

Unnamed: 0,clean,status
0,im lazy complain ba ihh,Normal
1,think wifi iphone broken quot connect quot act...,Normal
2,good tracking apps ive trying find apps track ...,Bipolar
3,recently looked reddit found place actually qu...,Depression
4,thats favorite thing,Normal


In [None]:
data['status'].value_counts()

Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
Normal,1576
Bipolar,1576
Depression,1576
Suicidal,1576
Stress,1576
Personality disorder,1576
Anxiety,1576


#Encoding

In [None]:
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['status'])
data

Unnamed: 0,clean,status,label
0,im lazy complain ba ihh,Normal,3
1,think wifi iphone broken quot connect quot act...,Normal,3
2,good tracking apps ive trying find apps track ...,Bipolar,1
3,recently looked reddit found place actually qu...,Depression,2
4,thats favorite thing,Normal,3
...,...,...,...
11027,graduated highschool worry exams times graduat...,Suicidal,6
11028,wanting meet someone always lonely anyways liv...,Suicidal,6
11029,closer ever keep imagining parents feel sister...,Suicidal,6
11030,hey person asking advice last night painless s...,Suicidal,6


#Train Test Spilt

In [None]:
train_texts, test_texts, train_labels, test_labels = train_test_split(data['clean'],data['label'], test_size=0.2)

#Tokenization

In [None]:
max([len(text) for text in data['clean']])

5893

In [None]:
# Tokenization using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True,max_length=200)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True,max_length=200)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
train_encodings

In [None]:
from datasets import Dataset

# Convert tokenized data into Hugging Face Dataset format
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels
})

test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'labels': test_labels
})


In [None]:
train_dataset[0]

{'input_ids': [101,
  2342,
  14901,
  2342,
  6040,
  7631,
  2203,
  2975,
  23655,
  2123,
  2102,
  2482,
  2180,
  2102,
  2113,
  3902,
  4060,
  7713,
  2036,
  3061,
  24385,
  6098,
  2296,
  6928,
  3204,
  5223,
  2175,
  2188,
  8980,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
 

#  Fine-Tuning BERT

In [None]:
len(label_encoder.classes_)


7

In [None]:
from transformers import TrainingArguments

#Set training arguments
training_args = TrainingArguments(
    output_dir='./results',                # output directory for results
    evaluation_strategy="epoch",            # evaluate every epoch
    save_strategy="epoch",                  # save every epoch
    learning_rate=2e-5,                     # learning rate
    per_device_train_batch_size=16,         # batch size for training
    per_device_eval_batch_size=16,          # batch size for evaluation
    num_train_epochs=3,                     # number of training epochs
    weight_decay=0.01,                      # weight decay for regularization
    logging_dir='./logs',                   # directory for logs
    logging_steps=10,                       # log every 10 steps
    lr_scheduler_type='linear',             # linear learning rate scheduler
    warmup_steps=500,                       # number of warmup steps
    load_best_model_at_end=True,            # load the best model at the end of training
    metric_for_best_model="accuracy",      # metric for choosing the best model
    save_total_limit=3,                     # limit the number of saved models
    gradient_accumulation_steps=2,          # gradient accumulation steps


)




In [None]:
from sklearn.metrics import accuracy_score

# Define a function to compute accuracy
def compute_metrics(p):
    predictions, labels = p
    preds = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy_score(labels, preds)}

In [None]:
# Trainer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=7)
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset =  train_dataset,
    eval_dataset = test_dataset,
    compute_metrics=compute_metrics
)

# Fine-tuning the model
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2147,1.139797,0.623018
2,0.4509,0.371369,0.873131
3,0.2029,0.28312,0.897599


TrainOutput(global_step=828, training_loss=0.8664802681420736, metrics={'train_runtime': 930.9686, 'train_samples_per_second': 28.438, 'train_steps_per_second': 0.889, 'total_flos': 2721163245750000.0, 'train_loss': 0.8664802681420736, 'epoch': 3.0})

# Evaluation

In [None]:
predictions, labels = trainer.predict(test_dataset)

predicted_labels = np.argmax(predictions, axis=1)

print(classification_report(test_labels, predicted_labels, target_names=label_encoder.classes_))

cm = confusion_matrix(test_labels, predicted_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

ValueError: too many values to unpack (expected 2)

In [None]:
outputs = trainer.predict(test_dataset)
predictions = outputs.predictions
predicted_labels = np.argmax(predictions, axis=1)

print(classification_report(test_labels, predicted_labels, target_names=label_encoder.classes_))

cm = confusion_matrix(test_labels, predicted_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()


#Save model and Load Model

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# save the trained model and tokenizer
trainer.save_model("/content/drive/MyDrive/GenAI Project(1-99)/Mental Health with BERT | Fine-Tuning BERT/mental_trainer")
tokenizer.save_pretrained("/content/drive/MyDrive/GenAI Project(1-99)/Mental Health with BERT | Fine-Tuning BERT/mental_token")



from transformers import AutoModelForSequenceClassification, AutoTokenizer

# load the trained model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/GenAI Project(1-99)/Mental Health with BERT | Fine-Tuning BERT/mental_trainer")
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/GenAI Project(1-99)/Mental Health with BERT | Fine-Tuning BERT/mental_token")



#Detection System

In [None]:
import torch

def detection(text):
  text = clean_statement(text)
  inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=200)
  outputs = model(**inputs)
  logits = outputs.logits
  prediction_label = torch.argmax(logits, dim=1).item()  # Use torch.argmax instead of torch.armax

  return label_encoder.inverse_transform([prediction_label])[0]  # 1= normal=1(inverse)

text = "I am feeling so anxious and overwhelmed with work today."
detection(text)



'Anxiety'