In [1]:
import os

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from tqdm import tqdm

  torch.utils._pytree._register_pytree_node(


In [6]:
!pip install torch 
!pip install transformers



In [2]:
# Step 1: Data Preparation
class DepressionDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Indexable sequence of input texts.
        labels: Indexable sequence of labels, aligned one-to-one with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding='max_length',
            truncation=True, return_tensors="pt")`` and expected to return a
            mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Fail early: a mismatch would otherwise surface as an IndexError in the
        # middle of an epoch, or silently ignore the surplus labels.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, index):
        """Tokenize the text at ``index`` and return it with its label.

        Returns:
            dict with keys ``'input_ids'`` and ``'attention_mask'`` (the
            tokenizer outputs with their leading dimension squeezed away) and
            ``'label'`` (a ``torch.long`` scalar tensor).
        """
        text = self.texts[index]
        label = self.labels[index]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        return {
            # squeeze(0) removes the leading dimension added by return_tensors="pt"
            # so that the DataLoader can stack items into a batch itself.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [3]:
# Load Dataset
# The CSV location can be overridden through the DEPRESSION_DATA_PATH
# environment variable so the notebook runs on other machines; the original
# local path is kept as the fallback so existing behavior is unchanged.
data_path = os.environ.get(
    "DEPRESSION_DATA_PATH",
    "C:/Users/octanet/OneDrive/Bureau/ieee/embs challange/work/depression1/depression_dataset_reddit_cleaned.csv",
)
dataset = pd.read_csv(data_path)

In [4]:
# Extract texts and labels
# Drop rows where either column is missing: pandas represents an empty cell as
# a float NaN, which is neither a valid text for the tokenizer nor a valid
# integer class label.
labelled_rows = dataset.dropna(subset=['clean_text', 'is_depression'])
# Coerce to plain Python str / int so every downstream item has a uniform type.
texts = labelled_rows['clean_text'].astype(str).tolist()
labels = labelled_rows['is_depression'].astype(int).tolist()

In [5]:
# Split Data
# Hold out 20% of the examples for validation; random_state fixes the shuffle
# so the same split is produced on every run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Initialize Tokenizer
# Load the pretrained tokenizer published under the 'bert-base-uncased' name
# (the same checkpoint name the classification model is loaded from).
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
)

In [13]:
!pip install ipywidgets



In [7]:
# Create Datasets
# Build the train and validation datasets with identical tokenizer settings
# (max_length=128) by iterating over the two (texts, labels) splits.
train_dataset, val_dataset = (
    DepressionDataset(split_texts, split_labels, tokenizer, max_length=128)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
    )
)

In [8]:
# Create DataLoaders
# Both loaders yield batches of 8 items; only the training loader shuffles.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=8,
)


In [9]:

# Step 2: Model Setup
# Choose the compute device: CUDA when it is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pretrained BERT with a sequence-classification head sized for two
# classes (num_labels=2) and place it on the chosen device in one expression.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
).to(device)

# Report which device the model lives on.
print(f"Model is set up on device: {device}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is set up on device: cpu


In [26]:
# Step 3: Training
optimizer = AdamW(model.parameters(), lr=5e-5)
# NOTE: the loop below reads the loss from `outputs.loss` (the model is called
# with `labels=`), so `loss_fn` is not invoked in this cell. It is kept defined
# in case other cells refer to it.
loss_fn = torch.nn.CrossEntropyLoss()

num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    # The description is constant for the epoch, so set it once at construction.
    loop = tqdm(train_loader, desc=f'Epoch {epoch + 1}', leave=True)
    total_loss = 0.0

    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named `batch_labels` (not `labels`) so the notebook-level `labels`
        # list built from the dataset is not overwritten by a batch tensor.
        batch_labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for the running total and the bar.
        batch_loss = loss.item()
        total_loss += batch_loss
        loop.set_postfix(loss=batch_loss)

    # Mean loss per batch over the epoch.
    print(f"Epoch {epoch + 1} Loss: {total_loss / len(train_loader)}")

# Step 4: Evaluation
# Collect argmax predictions and reference labels over the validation loader.
model.eval()
predictions, true_labels = [], []

with torch.no_grad():  # no gradients are needed for inference
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named `batch_labels` (not `labels`) so the notebook-level `labels`
        # list built from the dataset is not overwritten by a batch tensor.
        batch_labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit along dimension 1.
        preds = torch.argmax(outputs.logits, dim=1)

        predictions.extend(preds.cpu().numpy())
        true_labels.extend(batch_labels.cpu().numpy())

# Classification Report
# `labels=[0, 1]` pins the class order to match `target_names`; without it the
# report raises when only one class occurs in the labels/predictions.
print(classification_report(
    true_labels,
    predictions,
    labels=[0, 1],
    target_names=['Not Depressed', 'Depressed'],
))

# Step 5: Making Predictions
def predict(text, model, tokenizer, max_length=128):
    """Classify a single text as "Depressed" or "Not Depressed".

    Args:
        text: The input text passed to ``tokenizer``.
        model: Callable module invoked as ``model(input_ids, attention_mask=...)``
            whose result exposes a ``logits`` tensor.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        "Depressed" if the argmax over the logits is class 1, otherwise
        "Not Depressed".
    """
    model.eval()
    # Use the device the model's parameters live on rather than a notebook
    # global, so the function works with any model passed in. A model without
    # parameters leaves the inputs where the tokenizer created them.
    first_param = next(model.parameters(), None)
    encoding = tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    )
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)
        attention_mask = attention_mask.to(first_param.device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)

    return "Depressed" if preds.item() == 1 else "Not Depressed"

# Test the Prediction Function
# Runs `predict` on one hand-written sentence using the notebook-level
# `model` and `tokenizer`, and prints the returned label string.
sample_text = "I feel like nothing matters anymore."
print(f"Prediction: {predict(sample_text, model, tokenizer)}")

Epoch 1:   0%|▏                                                       | 2/773 [00:11<1:11:14,  5.54s/it, loss=0.000868]


KeyboardInterrupt: 

In [11]:
# Step 4: Evaluation
# Collect argmax predictions and reference labels over the validation loader.
model.eval()
predictions, true_labels = [], []

with torch.no_grad():  # no gradients are needed for inference
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named `batch_labels` (not `labels`) so the notebook-level `labels`
        # list built from the dataset is not overwritten by a batch tensor.
        batch_labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit along dimension 1.
        preds = torch.argmax(outputs.logits, dim=1)

        predictions.extend(preds.cpu().numpy())
        true_labels.extend(batch_labels.cpu().numpy())


In [29]:
# Classification Report
# `labels=[0, 1]` pins the class order to match `target_names`; without it the
# report raises when only one class occurs in the labels/predictions.
print(classification_report(
    true_labels,
    predictions,
    labels=[0, 1],
    target_names=['Not Depressed', 'Depressed'],
))

               precision    recall  f1-score   support

Not Depressed       0.97      0.98      0.98       783
    Depressed       0.98      0.97      0.97       764

     accuracy                           0.98      1547
    macro avg       0.98      0.98      0.98      1547
 weighted avg       0.98      0.98      0.98      1547



In [30]:
# Step 5: Making Predictions
def predict(text, model, tokenizer, max_length=128):
    """Classify a single text as "Depressed" or "Not Depressed".

    Args:
        text: The input text passed to ``tokenizer``.
        model: Callable module invoked as ``model(input_ids, attention_mask=...)``
            whose result exposes a ``logits`` tensor.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        "Depressed" if the argmax over the logits is class 1, otherwise
        "Not Depressed".
    """
    model.eval()
    # Use the device the model's parameters live on rather than a notebook
    # global, so the function works with any model passed in. A model without
    # parameters leaves the inputs where the tokenizer created them.
    first_param = next(model.parameters(), None)
    encoding = tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    )
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)
        attention_mask = attention_mask.to(first_param.device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)

    return "Depressed" if preds.item() == 1 else "Not Depressed"


In [33]:
# Test the Prediction Function
# Runs `predict` on one hand-written sentence using the notebook-level
# `model` and `tokenizer`, and prints the returned label string.
sample_text = "I am laughing"
print(f"Prediction: {predict(sample_text, model, tokenizer)}")

Prediction: Not Depressed
