In [None]:
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [2]:
# Load the labelled text dataset. The path is relative to the working
# directory the notebook is run from.
DATA_PATH = "mental_health.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows to check which columns were loaded; the cells below
# rely on the `text` and `label` columns.
df.head()

Unnamed: 0,text,label
0,dear american teens question dutch person hear...,0
1,nothing look forward lifei dont many reasons k...,1
2,music recommendations im looking expand playli...,0
3,im done trying feel betterthe reason im still ...,1
4,worried year old girl subject domestic physic...,1


In [4]:
# Model input (raw text) and target (class label) as separate Series.
X, y = df['text'], df['label']

In [5]:
# Hold out a fixed fraction of the rows for evaluation. The split is
# stratified on the label and seeded so it is the same on every run.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
    stratify=y,
)

In [6]:
# Give every split a fresh 0..n-1 index (the old index is dropped), so that
# integer lookups on the Series line up with row positions.
X_train, X_test, y_train, y_test = (
    split.reset_index(drop=True)
    for split in (X_train, X_test, y_train, y_test)
)

In [7]:
# Sanity check: the text and label splits must have matching lengths.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((22381,), (5596,), (22381,), (5596,))

In [8]:
# Number of examples per label in the training split (the split above was
# stratified on the label).
y_train.value_counts()

label
0    11311
1    11070
Name: count, dtype: int64

In [9]:
# Number of examples per label in the held-out test split.
y_test.value_counts()

label
0    2828
1    2768
Name: count, dtype: int64

In [None]:
# Tokenizer for the pretrained checkpoint; it is later passed to TextDataset
# and saved next to the fine-tuned model.
TOKENIZER_CHECKPOINT = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

In [None]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Iterable of input texts (list, array, pandas Series, ...).
        labels: Iterable of integer class ids, aligned one-to-one with
            ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=max_len, return_tensors='pt')``
            that returns a mapping with ``"input_ids"`` and
            ``"attention_mask"`` entries.
        max_len: Length every example is padded / truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=64):
        # Store plain lists so that __getitem__ always indexes by POSITION.
        # `series[idx]` on a pandas Series is label-based, so a Series whose
        # index is not 0..n-1 (e.g. straight out of train_test_split) would
        # raise KeyError or silently return a different row.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (the tokenizer
            output with its leading batch dimension removed) and ``"labels"``
            (0-d ``torch.long`` tensor).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            # squeeze(0) removes only the batch dimension; a bare squeeze()
            # would also drop the sequence dimension when max_len == 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(int(self.labels[idx]), dtype=torch.long)
        }

In [12]:
# Wrap each split in a TextDataset; max_len is left at the class default.
train_dataset = TextDataset(texts=X_train, labels=y_train, tokenizer=tokenizer)
test_dataset = TextDataset(texts=X_test, labels=y_test, tokenizer=tokenizer)

In [None]:
# Same batch size for both loaders; only the training data is shuffled.
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Seed torch's global RNG before the model is built, using the same value as
# the train/test split. Any weights that from_pretrained initialises randomly,
# as well as the later batch shuffling and dropout, then draw from a
# repeatable stream. NOTE: some GPU kernels may still be nondeterministic.
torch.manual_seed(42)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

In [None]:
# Freeze the first four transformer layers; these are the only parameters
# whose gradients are switched off in this notebook.
frozen_layers = model.distilbert.transformer.layer[:4]
for weight in frozen_layers.parameters():
    weight.requires_grad = False

In [None]:
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)

In [None]:
# Give the optimizer only the parameters that are still trainable. Parameters
# with requires_grad=False never receive a gradient, so there is no reason for
# the optimizer to track them.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=3e-5)

In [None]:
# Fine-tuning loop: one optimizer step per batch, average loss reported per epoch.
num_epochs = 2
model.train()  # switch the model to training mode
for epoch in range(num_epochs):
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)
    batch_losses = []

    for batch in progress_bar:
        # Move every tensor in the batch (input_ids, attention_mask, labels)
        # to the model's device.
        inputs = {name: tensor.to(device) for name, tensor in batch.items()}

        optimizer.zero_grad()
        # Passing `labels` makes the model return the loss alongside the logits.
        loss = model(**inputs).loss
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
        progress_bar.set_postfix({"Batch Loss": f"{loss.item():.4f}"})

    avg_loss = sum(batch_losses) / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs} | Average Loss: {avg_loss:.4f}")

In [None]:
model.eval()
preds, true_labels = [], []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating", leave=False):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

In [None]:
from sklearn.metrics import f1_score, recall_score, roc_auc_score, precision_score, confusion_matrix

In [None]:
print(f"f1_score: {f1_score(true_labels, preds):.4f}")
print(f"accuracy_score: {accuracy_score(true_labels, preds):.4f}")
print(f"recall_score: {recall_score(true_labels, preds):.4f}")
print(f"precision_score: {precision_score(true_labels, preds):.4f}")
print(f"roc_auc_score: {roc_auc_score(true_labels, preds):.4f}")

In [None]:
# Per-class precision / recall / F1 table, printed under a heading.
report = classification_report(true_labels, preds)
print("\nClassification report:", report, sep="\n")

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
# Confusion matrix of true vs. predicted labels, drawn as an annotated heatmap.
# The axis labels and title are in Ukrainian: "Predicted class", "True class",
# "Confusion matrix".
cm = confusion_matrix(true_labels, preds)
class_names = ['0', '1']

fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Прогнозований клас')
ax.set_ylabel('Справжній клас')
ax.set_title('Матриця плутанини')
plt.show()

In [None]:
# Write the fine-tuned model and its tokenizer into the same directory so the
# two can be loaded back together.
EXPORT_DIR = "models"
model.save_pretrained(EXPORT_DIR)
tokenizer.save_pretrained(EXPORT_DIR)

In [None]:
# Pack the exported `models` directory (recursively) into models.zip.
# This is a shell command, so the `zip` executable must be available.
!zip -r models.zip models

In [None]:
# Trigger a browser download of the archive when running on Google Colab.
# Outside Colab the `google.colab` package is not installed, so skip the
# download instead of failing the whole run with an ImportError.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; models.zip was left in the working directory.")
else:
    files.download("models.zip")