In [None]:
import pandas as pd
import numpy as np  
from transformers import AutoTokenizer, BertForSequenceClassification, BertTokenizerFast
import re
from torch.utils.data import Dataset
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [2]:
# Load the raw dataset from the sibling Dataset directory. Lines pandas cannot parse are
# skipped instead of raising, and a backslash acts as the escape character inside fields.
data = pd.read_csv(
    '../Dataset/data.csv',
    sep=',',
    header=0,
    escapechar='\\',
    encoding='utf-8',
    on_bad_lines='skip',
)

In [3]:
# Remove the "Unnamed: 0" column (presumably an index column saved by an earlier export).
# Reassigning instead of mutating in place, and ignoring a missing column, keeps this cell
# safe to re-run: a second execution no longer raises KeyError.
data = data.drop(columns="Unnamed: 0", errors="ignore")

In [4]:
# Keep only rows whose statement is present and not blank.
# The explicit copy makes `data` an independent frame, so the column assignments in later
# cells neither trigger SettingWithCopyWarning nor write into a view of the unfiltered frame.
has_text = data['statement'].notna() & (data['statement'].str.strip() != '')
data = data.loc[has_text].copy()

In [5]:
def remove_patterns(text):
    """Strip links, mentions and punctuation from a single statement.

    The removal order matters: Markdown links ``[label](target)`` are removed first,
    while their closing parenthesis is still in place. If bare URLs were removed first,
    the URL pattern (``\\S+``) would swallow that ``)`` and the Markdown pattern could
    never match a link whose target is an http(s) URL.

    Args:
        text: The raw statement string.

    Returns:
        The text without Markdown links, http(s) URLs, ``@mentions`` and any character
        that is neither a word character nor whitespace, with leading and trailing
        whitespace trimmed. Whitespace inside the text is left untouched.
    """
    text = re.sub(r'\[.*?\]\(.*?\)', '', text)  # Markdown links, label included
    text = re.sub(r'https?://\S+', '', text)     # bare http/https URLs
    text = re.sub(r'@\w+', '', text)             # @mentions
    text = re.sub(r'[^\w\s]', '', text)          # remaining punctuation and symbols
    return text.strip()

In [6]:
# Clean every statement, then drop rows that the cleaning reduced to an empty string
# (for example a statement consisting only of a URL or punctuation). Blank statements were
# already excluded before cleaning; this keeps that guarantee true afterwards as well.
cleaned_statements = data['statement'].apply(remove_patterns)
data = data.assign(statement=cleaned_statements).loc[cleaned_statements != ''].copy()

In [7]:
# Distinct class names in first-seen order. Whitespace is stripped *before* de-duplicating,
# so variants such as 'Normal' and 'Normal ' count as a single class; this matches the
# stripped lookup used when the integer labels are assigned from `status`.
lables = data['status'].str.strip().unique().tolist()
# Same stripped list under the second name this cell has always defined.
labeles = list(lables)

In [8]:
# Number of target classes: one per entry of the status list.
NUM_LABELS = len(lables)

# Two-way lookup between integer class ids and class names; ids follow list order.
id2labels = dict(enumerate(lables))
labels2id = {name: idx for idx, name in id2labels.items()}

In [9]:
# Fixed class-name -> integer-id mapping; each id is the position of the name in this list.
labels2id = {
    name: idx
    for idx, name in enumerate([
        'Anxiety',
        'Normal',
        'Depression',
        'Stress',
        'Bipolar',
        'Personality disorder',
    ])
}

In [10]:
# Fixed integer-id -> class-name mapping; each id is the position of the name in this list.
id2labels = dict(enumerate([
    'Anxiety',
    'Normal',
    'Depression',
    'Stress',
    'Bipolar',
    'Personality disorder',
]))

In [11]:
# Encode each status as its integer class id. The status string is whitespace-trimmed
# before the lookup; a status missing from labels2id raises KeyError.
data['labels'] = data['status'].apply(lambda raw_status: labels2id[raw_status.strip()])

In [None]:
# Show the (rows, columns) shape of the frame after cleaning and label encoding.
data.shape

In [13]:
# Write the cleaned, label-encoded frame to data.csv in the working directory,
# leaving the index out of the file.
data.to_csv(path_or_buf='data.csv', index=False)

In [None]:
# Preview the first rows of the processed frame.
data.head()

In [None]:
# Count how many rows each status value has (class balance of the full dataset).
data['status'].value_counts()

In [None]:
# Preview the first rows of the processed frame again.
data.head()

In [None]:
# Display all rows in random order (frac=1 samples every row).
# NOTE(review): the shuffled frame is not assigned, so `data` itself keeps its original
# order — confirm that a persistent shuffle was not intended here.
data.sample(frac=1)

In [18]:
# Upper bound on how many rows each class may contribute to the training set.
max_samples_per_class = 2000
# Fixed seed so the sampled training set (and the CSV splits derived from it) is the same
# on every run instead of changing with each kernel restart.
SPLIT_SEED = 42

# For every status, draw up to max_samples_per_class rows; classes with fewer rows
# contribute all of them. The index is rebuilt as 0..n-1 afterwards.
train = (
    data.groupby('status')
    .apply(
        lambda group: group.sample(
            n=min(len(group), max_samples_per_class),
            random_state=SPLIT_SEED,
        )
    )
    .reset_index(drop=True)
)

In [None]:
# Show the (rows, columns) shape of the sampled training frame.
train.shape

In [None]:
# Count rows per status in the training frame to check the per-class cap took effect.
train['status'].value_counts()

In [22]:
# Hold out every row of `data` whose column values do not occur in `train`.
# A left merge against the distinct training rows tags each data row as matched ('both')
# or unmatched ('left_only'). Unlike de-duplicating the concatenated frames with
# keep=False, this does not also discard rows that merely repeat inside `data` without
# having been sampled into `train`. Row order of `data` is preserved by the left merge.
split_columns = list(data.columns)
train_rows = train[split_columns].drop_duplicates()
flagged = data.merge(train_rows, on=split_columns, how='left', indicator=True)
test = flagged.loc[flagged['_merge'] == 'left_only', split_columns]

In [23]:
# Select every row of the full dataset whose status is "Personality disorder".
is_personality_disorder = data['status'].eq("Personality disorder")
personality_disorder_df = data.loc[is_personality_disorder]

# Append those rows to the held-out frame and rebuild its index as 0..n-1.
# NOTE(review): rows of this class that were sampled into `train` end up in both splits,
# and rows already present in `test` are repeated — confirm this overlap is intended.
test = pd.concat([test, personality_disorder_df]).reset_index(drop=True)

In [None]:
# Show the (rows, columns) shape of the held-out frame.
test.shape

In [25]:
# Persist both splits as CSV files (index column omitted) under the Dataset directory
# relative to the working directory.
# NOTE(review): the raw data was read from '../Dataset/'; confirm 'Dataset/' is the intended target.
for split_frame, split_path in ((train, 'Dataset/train.csv'), (test, 'Dataset/test.csv')):
    split_frame.to_csv(split_path, index=False)

In [26]:
# One constant for the checkpoint so the tokenizer and the model cannot drift apart.
BASE_CHECKPOINT = "textattack/bert-base-uncased-yelp-polarity"

tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)

# Store readable class names in the model config so the saved model reports them instead
# of generic LABEL_<n> entries. The maps are only passed when their size agrees with
# NUM_LABELS, so the size of the classification head is never changed by this.
label_maps = (
    {"id2label": id2labels, "label2id": labels2id}
    if len(id2labels) == NUM_LABELS
    else {}
)

# ignore_mismatched_sizes allows the checkpoint's existing classification head
# (presumably 2-class, given the polarity task) to be replaced by a NUM_LABELS-sized one.
model = BertForSequenceClassification.from_pretrained(
    BASE_CHECKPOINT,
    num_labels=NUM_LABELS,
    ignore_mismatched_sizes=True,
    **label_maps,
)

In [None]:
import torch

# Pick the first CUDA device when one is available, otherwise fall back to the CPU,
# and print the choice.
if torch.cuda.is_available():
    gpu = torch.device("cuda:0")
else:
    gpu = torch.device("cpu")
print(gpu)

In [30]:
# Limit this process to 80% of the GPU's memory. The call needs an initialised CUDA
# runtime and raises on CPU-only hosts, so it is skipped when no GPU is available —
# consistent with the CPU fallback used when the device is selected.
if torch.cuda.is_available():
    torch.cuda.set_per_process_memory_fraction(0.8)

In [None]:
# Move the model's parameters to the selected device; the returned model is displayed
# as the cell output.
model.to(gpu)

In [32]:
# Single-column frames holding the training texts and their integer labels.
X_train = train.loc[:, ['statement']]
Y_train = train['labels'].to_frame()

In [None]:
# Count rows per status in the held-out frame before drawing the validation sample.
test['status'].value_counts()

In [34]:
# Validation set: up to 200 rows per status drawn from the held-out frame.
# Capping the request at the group size keeps a class with fewer than 200 held-out rows
# from raising ValueError (same guard as the training sampler), and the fixed seed makes
# the validation set reproducible.
# NOTE(review): these rows stay in `test` as well, so validation is a subset of test.
VAL_SAMPLES_PER_CLASS = 200
val = (
    test.groupby('status')
    .apply(
        lambda group: group.sample(
            n=min(len(group), VAL_SAMPLES_PER_CLASS),
            random_state=42,
        )
    )
    .reset_index(drop=True)
)

In [35]:
# Single-column frames holding the validation texts and their integer labels.
X_valid = val.loc[:, ['statement']]
Y_valid = val['labels'].to_frame()

In [36]:
# Single-column frames holding the held-out texts and their integer labels.
X_test = test['statement'].to_frame()
Y_test = test['labels'].to_frame()

In [37]:
# Replace each text frame by a plain Python list of its statements, the input form
# handed to the tokenizer.
X_train, X_valid, X_test = (
    text_frame['statement'].to_list()
    for text_frame in (X_train, X_valid, X_test)
)

In [38]:
# Replace each label frame by a plain Python list of its integer class ids.
Y_train, Y_valid, Y_test = (
    label_frame['labels'].to_list()
    for label_frame in (Y_train, Y_valid, Y_test)
)

In [39]:
# Tokenize each split with truncation and padding enabled; every split is encoded in a
# single tokenizer call.
tokenizer_options = {'truncation': True, 'padding': True}
train_encodings = tokenizer(X_train, **tokenizer_options)
val_encodings = tokenizer(X_valid, **tokenizer_options)
test_encodings = tokenizer(X_test, **tokenizer_options)

In [40]:
class DataLoader(Dataset):
    """Map-style dataset pairing tokenizer output with integer labels.

    Despite its name this is a ``Dataset`` subclass (it implements ``__getitem__`` and
    ``__len__``); it does no batching itself.

    Args:
        encoding: Mapping from field name to a per-example sequence of values.
        labels: Sequence of labels, one per example.
    """

    def __init__(self, encoding, labels):
        self.labels = labels
        self.encoding = encoding

    def __len__(self):
        """Return the number of examples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors: one per encoding field plus 'labels'."""
        sample = {}
        for field, values in self.encoding.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


# Wrap each split's encodings and labels in a dataset object for the Trainer.
train_dataloader = DataLoader(encoding=train_encodings, labels=Y_train)
val_dataloader = DataLoader(encoding=val_encodings, labels=Y_valid)
test_dataloader = DataLoader(encoding=test_encodings, labels=Y_test)

In [41]:
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision, recall and F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores); the predicted class is the argmax over the last axis.

    Returns:
        Dict with the keys 'Accuracy', 'F1', 'Precision' and 'Recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The call returns (precision, recall, f1, support); support is not reported.
    macro_scores = precision_recall_fscore_support(y_true, y_pred, average='macro')

    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'F1': macro_scores[2],
        'Precision': macro_scores[0],
        'Recall': macro_scores[1],
    }


In [None]:
# Hyper-parameters and bookkeeping for the fine-tuning run.
# Checkpoints and logs are written to directories relative to the working directory
# rather than to '/results' and '/logs' at the filesystem root, which an ordinary user
# account usually cannot create.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    do_train=True,
    do_eval=True,
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,

    warmup_steps=0,
    weight_decay=0.0000002,
    logging_strategy='steps',
    learning_rate= 0.00008,

    logging_dir='./logs',
    logging_steps=250,
    # Evaluate on the validation set every 250 steps, on the same cadence as logging.
    evaluation_strategy='steps',
    eval_steps=250,
    save_strategy='steps',
    # Reload the best checkpoint seen during training once training finishes.
    load_best_model_at_end=True
)

In [43]:
# Bundle the model, the run configuration, both datasets and the metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataloader,
    eval_dataset=val_dataloader,
)

In [44]:
# Run fine-tuning with the configured arguments; the call's return value is shown as the cell output.
trainer.train()

In [None]:
# Evaluate the trained model on every split and tabulate the first five reported
# columns, one row per split.
split_datasets = {"train": train_dataloader, "val": val_dataloader, "test": test_dataloader}
q = [trainer.evaluate(eval_dataset=split_dataset) for split_dataset in split_datasets.values()]
pd.DataFrame(q, index=list(split_datasets)).iloc[:, :5]

In [46]:
# Save the fine-tuned model and its tokenizer side by side in one directory.
model_path = "model/9model"
trainer.save_model(output_dir=model_path)
tokenizer.save_pretrained(save_directory=model_path)