In [2]:
import os
import random
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from torchvision import transforms
from model.depression_model import DepressionDetectionModel


In [3]:
# Input locations, given relative to the notebook's working directory.
# TEXT_CSV: CSV of text samples; MultimodalDataset loads it with pandas and
# reads its "clean_text" and "is_depression" columns.
TEXT_CSV = "datasets/text/depression_dataset_reddit_cleaned.csv"
# IMAGE_DIR: root of the image pool; get_random_image draws files from its
# "sad", "happy" and "neutral" subfolders.
IMAGE_DIR = "datasets/images/combined"


In [7]:
# Text side: WordPiece tokenizer matching the "bert-base-uncased" checkpoint.
# The first call fetches the vocabulary/config files into the local cache.
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

# Image side: fixed spatial size plus per-channel normalisation. The mean/std
# values are the ones torchvision documents for its ImageNet-pretrained models.
IMAGE_SIZE = (224, 224)
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

_image_steps = [
    transforms.Resize(IMAGE_SIZE),        # resize every image to 224x224
    transforms.ToTensor(),                # PIL image -> float tensor
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
]
image_transform = transforms.Compose(_image_steps)

def get_random_image(
    label,
    extensions=(".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tif", ".tiff", ".webp"),
):
    """Return the path of a randomly chosen image that matches ``label``.

    A label equal to 1 draws from ``IMAGE_DIR/sad``; any other label draws
    from ``IMAGE_DIR/happy`` or ``IMAGE_DIR/neutral`` (one of the two is
    picked at random first, then a file inside it).

    Args:
        label: Class label of the text sample; only ``label == 1`` is
            treated specially.
        extensions: File-name suffixes (case-insensitive) that count as
            images. Entries that do not end in one of these are ignored.

    Returns:
        str: Path built as ``os.path.join(folder, file_name)``.

    Raises:
        FileNotFoundError: If the folder does not exist (raised by
            ``os.listdir``) or contains no file with an accepted extension.
    """
    if label == 1:
        folder = os.path.join(IMAGE_DIR, "sad")
    else:
        folder = random.choice([
            os.path.join(IMAGE_DIR, "happy"),
            os.path.join(IMAGE_DIR, "neutral"),
        ])

    # os.listdir returns *every* entry (Thumbs.db, .DS_Store, subfolders, ...),
    # so keep only names that look like images; otherwise a stray file would
    # later blow up inside Image.open for an unlucky batch.
    suffixes = tuple(ext.lower() for ext in extensions)
    # Sorting makes the draw depend only on the RNG state, not on the
    # arbitrary order in which the filesystem lists entries.
    candidates = sorted(
        name for name in os.listdir(folder) if name.lower().endswith(suffixes)
    )
    if not candidates:
        raise FileNotFoundError(f"No image files found in {folder!r}")

    return os.path.join(folder, random.choice(candidates))


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [11]:
class MultimodalDataset(Dataset):
    """Text samples from a labelled CSV, each paired with a random image.

    The CSV must provide a ``clean_text`` column and an ``is_depression``
    column. Rows containing any missing value and rows whose label is not
    0 or 1 are discarded, then the remaining rows are shuffled once.

    Tokenisation, image selection and image preprocessing rely on the
    module-level ``tokenizer``, ``get_random_image`` and ``image_transform``.

    NOTE(review): the image is drawn from a folder chosen by the label, so the
    image input encodes the label by construction — confirm this is intended.

    Args:
        csv_path: Path of the CSV file to load.
        max_length: Token length every text is padded/truncated to.
        seed: Optional ``random_state`` for the one-off row shuffle; ``None``
            keeps the shuffle non-deterministic.

    Raises:
        KeyError: If a required column is missing from the CSV.
    """

    REQUIRED_COLUMNS = ("clean_text", "is_depression")

    def __init__(self, csv_path, max_length=128, seed=None):
        frame = pd.read_csv(csv_path).dropna()

        # Fail at construction time rather than on the first __getitem__ call.
        missing = [col for col in self.REQUIRED_COLUMNS if col not in frame.columns]
        if missing:
            raise KeyError(f"{csv_path} is missing required column(s): {missing}")

        frame = frame[frame["is_depression"].isin([0, 1])]
        self.data = frame.sample(frac=1, random_state=seed).reset_index(drop=True)  # shuffle
        self.max_length = max_length

    def __len__(self):
        """Number of usable rows after filtering."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build one sample.

        Returns:
            tuple: ``(input_ids, attention_mask, image_tensor, label)`` where
            the first two are 1-D tensors of length ``max_length``,
            ``image_tensor`` is the output of ``image_transform`` and
            ``label`` is a scalar ``float32`` tensor.
        """
        row = self.data.iloc[idx]
        # str() guards against cells pandas parsed as numbers (e.g. "2020").
        text, label = str(row["clean_text"]), int(row["is_depression"])

        # Preprocess text. Calling the tokenizer directly is the documented
        # replacement for the legacy encode_plus method.
        encoding = tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
            padding="max_length",
            truncation=True
        )
        # return_tensors='pt' yields a leading batch dimension of 1; drop it
        # so the DataLoader can stack samples itself.
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        # Preprocess image. The context manager closes the underlying file as
        # soon as the pixel data has been copied by convert(); without it one
        # file handle per sample stays open until garbage collection.
        image_path = get_random_image(label)
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        image_tensor = image_transform(image)

        return input_ids, attention_mask, image_tensor, torch.tensor(label, dtype=torch.float32)


In [13]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Data: the dataset yields (input_ids, attention_mask, image, label) samples;
# the loader groups them into shuffled mini-batches of 8.
dataset = MultimodalDataset(TEXT_CSV)
dataloader = DataLoader(dataset=dataset, batch_size=8, shuffle=True)

# Model lives on the selected device before the optimizer is built from its
# parameters.
model = DepressionDetectionModel()
model.to(device)

# Binary cross-entropy on probabilities.
# NOTE(review): nn.BCELoss expects inputs in [0, 1], i.e. the model is
# presumably ending in a sigmoid — confirm in model/depression_model.py.
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to C:\Users\Sahil Wadhwani/.cache\torch\hub\checkpoints\resnet18-f37072fd.pth
100%|█████████████████████████████████████████████████████████████████████████████| 44.7M/44.7M [00:01<00:00, 24.6MB/s]


In [14]:
EPOCHS = 3  # number of full passes over the dataloader

for epoch in range(EPOCHS):
    # Switch the model to training mode at the start of every epoch.
    model.train()
    total_loss = 0

    for batch in dataloader:
        # A batch is (input_ids, attention_mask, images, labels); move every
        # tensor to the training device in one go.
        input_ids, attention_mask, images, labels = (part.to(device) for part in batch)
        # Labels arrive as one scalar per sample; add a trailing axis.
        # NOTE(review): presumably matches a (batch, 1) model output — confirm.
        labels = labels.unsqueeze(1)

        # Clear gradients left over from the previous step, then run the usual
        # forward -> loss -> backward -> update sequence.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, images)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        # .item() detaches the scalar so the running sum holds no graph.
        total_loss += loss.item()

    # Report the mean per-batch loss for this epoch.
    print(f"Epoch {epoch+1} Loss: {total_loss / len(dataloader):.4f}")


KeyboardInterrupt: 