In [1]:
#install the necessary library 
!pip install transformers
!pip install transformers[torch]



In [2]:
!pip install sklearn
!pip install torch
!pip install transformers[torch]



In [3]:
#import necessary libraries
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch

In [4]:
#load the training set and the test set 
train_df = pd.read_csv('C://Users//Sofia_Chen//Desktop//ds//train_b.csv',encoding='ISO-8859-1')
test_df = pd.read_csv('C://Users//Sofia_Chen//Desktop//ds//test_b.csv',encoding='ISO-8859-1')


In [5]:
train_df.head()
#remove the not necessry column 
#column_dr=['Unnamed: 2', 'Unnamed: 3']
#train_df=train_df.drop(column_dr, axis=1)
train_df.head()
train_df.describe()

Unnamed: 0,label
count,2828.0
mean,0.52546
std,0.49944
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [6]:
#check the missing value 
missing_train = train_df.isnull().sum()
print(missing_train)

text     0
label    0
dtype: int64


In [7]:
test_df.head()
test_df.describe()

Unnamed: 0,label
count,715.0
mean,0.516084
std,0.500091
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [8]:
#check the missing value 
missing_test = test_df.isnull().sum()
print(missing_test)

text     0
label    0
dtype: int64


In [9]:
# Convert label columns to integers
train_df['label'] = train_df['label'].astype(int)
test_df['label'] = test_df['label'].astype(int)

In [10]:
# Define the dataset class
class StressDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = labels
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [11]:
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [12]:
# Prepare the datasets
train_data = StressDataset(train_df['text'].tolist(), train_df['label'].tolist(), tokenizer)
test_data = StressDataset(test_df['text'].tolist(), test_df['label'].tolist(), tokenizer)


In [13]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch",
)

# Initialize the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(train_df['label'].unique()))

# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Train the model
trainer.train()

  0%|          | 0/1062 [00:00<?, ?it/s]

  0%|          | 0/90 [00:02<?, ?it/s]

{'eval_loss': 0.42403385043144226, 'eval_runtime': 255.1531, 'eval_samples_per_second': 2.802, 'eval_steps_per_second': 0.353, 'epoch': 1.0}
{'loss': 0.5189, 'grad_norm': 1.9521071910858154, 'learning_rate': 5e-05, 'epoch': 1.41}


Checkpoint destination directory ./results\checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  0%|          | 0/90 [00:01<?, ?it/s]

{'eval_loss': 0.5790799856185913, 'eval_runtime': 244.7531, 'eval_samples_per_second': 2.921, 'eval_steps_per_second': 0.368, 'epoch': 2.0}


Checkpoint destination directory ./results\checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.2858, 'grad_norm': 0.06355748325586319, 'learning_rate': 5.516014234875446e-06, 'epoch': 2.82}


  0%|          | 0/90 [00:00<?, ?it/s]

{'eval_loss': 0.9637812972068787, 'eval_runtime': 241.0469, 'eval_samples_per_second': 2.966, 'eval_steps_per_second': 0.373, 'epoch': 3.0}
{'train_runtime': 10470.1529, 'train_samples_per_second': 0.81, 'train_steps_per_second': 0.101, 'train_loss': 0.3869064208926903, 'epoch': 3.0}


TrainOutput(global_step=1062, training_loss=0.3869064208926903, metrics={'train_runtime': 10470.1529, 'train_samples_per_second': 0.81, 'train_steps_per_second': 0.101, 'train_loss': 0.3869064208926903, 'epoch': 3.0})

In [18]:
predictions = trainer.predict(test_data)
predicted_labels = predictions.predictions.argmax(-1)




  0%|          | 0/90 [00:00<?, ?it/s]

{'test_loss': 0.9637812972068787, 'test_runtime': 249.099, 'test_samples_per_second': 2.87, 'test_steps_per_second': 0.361}


In [21]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

predictions = trainer.predict(test_data)

# Convert logits to predicted class (0 or 1) assuming binary classification
pred_labels = np.argmax(predictions.predictions, axis=1)

# True labels
true_labels = predictions.label_ids

# Calculate metrics
accuracy = accuracy_score(true_labels, pred_labels)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average='binary')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

  0%|          | 0/90 [00:00<?, ?it/s]

Accuracy: 0.7944055944055944
Precision: 0.7642857142857142
Recall: 0.8699186991869918
F1 Score: 0.8136882129277566


In [19]:
# If you have defined compute_metrics
print(predictions.metrics)

{'test_loss': 0.9637812972068787, 'test_runtime': 249.099, 'test_samples_per_second': 2.87, 'test_steps_per_second': 0.361}
