In [1]:
import pandas as pd
from kaggle_secrets import UserSecretsClient
from huggingface_hub import HfApi, HfFolder
import os

user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")

# Set the token for use in the huggingface_hub library
HfFolder.save_token(hf_token)

import wandb

# Replace 'your-api-token' with your actual API token from wandb.ai
wandb.login(key='88bc0f92c0138587605140040c6d0ef652bde63d')

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [47]:
df = pd.read_csv("/kaggle/input/mental-health-corpus/mental_health.csv")

In [48]:
df["label"].value_counts()

label
0    14139
1    13838
Name: count, dtype: int64

In [49]:
df2 = pd.read_csv("/kaggle/input/mental-health-social-media/Mental-Health-Twitter.csv")

In [50]:
df2 = df2[["post_text", "label"]]

In [51]:
df2.rename(columns={'post_text': 'text'}, inplace=True)

In [52]:
df3 = pd.read_csv("/kaggle/input/depression-data-set-with-depression-level/depression_data_normal_final.csv")

In [53]:
df3 = df3[["clean_text", "is_depression"]]

In [54]:
df3

Unnamed: 0,clean_text,is_depression
0,understand people reply immediately op invitat...,1
1,welcome r depression check post place take mom...,1
2,anyone else instead sleeping depressed stay ni...,1
3,kind stuffed around lot life delaying inevitab...,1
4,sleep greatest comforting escape whenever wake...,1
...,...,...
7726,snow,0
7727,moulin rouge mad cry,0
7728,trying shout find people list,0
7729,ughh find red sox hat gotta wear creepy nick p...,0


In [55]:
df3.rename(columns={'clean_text': 'text', 'is_depression': 'label'}, inplace=True)

In [56]:
df = pd.concat([df, df2, df3], ignore_index=True)

In [57]:
df["label"].value_counts()

label
0    28039
1    27669
Name: count, dtype: int64

In [60]:
df = df.sample(frac=1).reset_index(drop=True)

In [61]:
df

Unnamed: 0,text,label
0,@tyleroakley WHAT AM I DOING RIGHT NOW,1
1,branch military dedicated stopping china explo...,0
2,Morning motivation❤️ http://t.co/KTogRSN4FT,1
3,think really afford professional help medicati...,1
4,rellyab likely unable attend cry way home,0
...,...,...
55703,hold barney drilling head every day well guess...,0
55704,"@PaulManafort 2,800,000? What happened to you ...",0
55705,If you're trying to have a good time in Brooki...,1
55706,idea people say always count never bottle feel...,1


In [62]:
df = pd.get_dummies(df, columns=["label"])
df["label_0"] = df["label_0"].astype(int)
df["label_1"] = df["label_1"].astype(int)

In [63]:
X_data = df["text"]
y_data = df[["label_0", "label_1"]]

In [64]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.3, stratify=y_data, random_state=42
)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (38995,)
y_train shape: (38995, 2)
X_test shape: (16713,)
y_test shape: (16713, 2)


In [65]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [66]:
def preprocess_function(examples):
    #print(examples)
    return tokenizer(str(examples), truncation=True, padding="max_length", max_length=512)

In [67]:
X_train = X_train.map(preprocess_function)
X_test = X_test.map(preprocess_function)

In [68]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [69]:
%pip install evaluate
import evaluate

accuracy = evaluate.load("accuracy")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [70]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [71]:
id2label = {0: "POSITIVE", 1: "NEGATIVE"}
label2id = {"POSITIVE": 0, "NEGATIVE": 1}
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [72]:
from datasets import Dataset
y_train = np.argmax(y_train, axis=1)
y_test = np.argmax(y_test, axis=1)
X_train = pd.DataFrame(X_train.tolist(), columns=['input_ids', 'attention_mask'], index=X_train.index)
X_test = pd.DataFrame(X_test.tolist(), columns=['input_ids', 'attention_mask'], index=X_test.index)


In [73]:
X_train

Unnamed: 0,input_ids,attention_mask
2444,"[101, 2053, 2028, 19821, 2033, 2030, 2129, 104...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ..."
14644,"[101, 19387, 1030, 9395, 29336, 18410, 4502, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
41349,"[101, 7917, 4942, 2130, 2113, 2130, 2505, 5665...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
49515,"[101, 1030, 20848, 10513, 3490, 2696, 2092, 10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
13140,"[101, 1999, 2637, 3102, 3209, 14085, 25929, 22...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...
43122,"[101, 10587, 2175, 2210, 2346, 4440, 3748, 263...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
35611,"[101, 1030, 13573, 21486, 21486, 21926, 2293, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ..."
37153,"[101, 2412, 2144, 2034, 2209, 3585, 17683, 202...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
10803,"[101, 5310, 1024, 2488, 2084, 2047, 16770, 102...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [74]:
train_dataset = Dataset.from_dict({
    "input_ids": X_train['input_ids'],
    "attention_mask": X_train['attention_mask'],
    "labels": y_train
})

test_dataset = Dataset.from_dict({
    "input_ids": X_test['input_ids'],
    "attention_mask": X_test['attention_mask'],
    "labels": y_test
})

In [75]:
#from transformers import DataCollatorWithPadding
#data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

training_args = TrainingArguments(
    output_dir="Mental_health_identification",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.223,0.175703,0.926464
2,0.1406,0.180817,0.934183




TrainOutput(global_step=2438, training_loss=0.20033991268367626, metrics={'train_runtime': 2314.1507, 'train_samples_per_second': 33.701, 'train_steps_per_second': 1.054, 'total_flos': 1.033113242118144e+16, 'train_loss': 0.20033991268367626, 'epoch': 2.0})

In [76]:
trainer.push_to_hub()

events.out.tfevents.1711847880.23bf28c87935.34.1:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/DaJulster/Mental_health_identification/commit/763901c898d6c341e642ba7edd117cf74fa1d2d9', commit_message='End of training', commit_description='', oid='763901c898d6c341e642ba7edd117cf74fa1d2d9', pr_url=None, pr_revision=None, pr_num=None)