In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/suicidal-mental-health-dataset/mental-health.csv


In [2]:
from sklearn.model_selection import train_test_split
from transformers import FNetTokenizer, FNetForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

import warnings
warnings.filterwarnings('ignore')

In [3]:
!wandb login e59dd18edc25b5567acba163a79b06b09cd0c77e

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [4]:
df = pd.read_csv('/kaggle/input/suicidal-mental-health-dataset/mental-health.csv')

df['label'] = df['label'].str.replace('SuicideWatch', 'suicidal')
df['label'] = df['label'].str.lower()

binarized = []
for label in df['label']:
    if label == 'suicidal':
        binarized.append(1)
    else:
        binarized.append(0)

binarized = pd.Series(binarized, name='label')
df = df.drop('label', axis=1)
df = pd.concat([df, binarized], axis=1)

df = df[df['text'].apply(lambda text: len(text.split(' '))) <= 512]

train_df, other = train_test_split(
    df,
    test_size = 0.12,
    random_state = 9550,
    shuffle = True,
    stratify = df['label'],
)

val_df, test_df = train_test_split(
    other,
    test_size = 0.5,
    random_state = 9550,
    shuffle = True,
    stratify = other['label'],
)

In [5]:
x_train = list(map(lambda x: x.lower(), train_df['text'].values))
y_train = list(train_df['label'].values)

x_val = list(map(lambda x: x.lower(), val_df['text'].values))
y_val = list(val_df['label'].values)

x_test = list(map(lambda x: x.lower(), test_df['text'].values))
y_test = list(test_df['label'].values)

In [6]:
train_dataset = Dataset.from_dict({"texts" : x_train, "labels" : y_train})
val_dataset = Dataset.from_dict({"texts" : x_val, "labels" : y_val})
test_dataset = Dataset.from_dict({"texts" : x_test, "labels" : y_test})

In [7]:
tokenizer = FNetTokenizer.from_pretrained("google/fnet-base")

tokenizer_config.json:   0%|          | 0.00/455 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/708k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

In [8]:
tokenizer_shit = lambda shit: tokenizer(shit["texts"], padding="max_length", truncation=True)

In [9]:
train_dataset = train_dataset.map(tokenizer_shit, batched=True)
val_dataset = val_dataset.map(tokenizer_shit, batched=True)
test_dataset = test_dataset.map(tokenizer_shit, batched=True)

Map:   0%|          | 0/17099 [00:00<?, ? examples/s]

Map:   0%|          | 0/1166 [00:00<?, ? examples/s]

Map:   0%|          | 0/1166 [00:00<?, ? examples/s]

In [10]:
train_dataset.set_format(type="torch", columns=["input_ids", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "labels"])

In [11]:
FNet = FNetForSequenceClassification.from_pretrained("google/fnet-base", num_labels = 2)

config.json:   0%|          | 0.00/626 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/334M [00:00<?, ?B/s]

Some weights of FNetForSequenceClassification were not initialized from the model checkpoint at google/fnet-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
import wandb
wandb.init(project="996Wilson")

In [12]:
training_args = TrainingArguments(
    output_dir="./results",          # Directory to save checkpoints
    evaluation_strategy="epoch",    # Evaluate every epoch
    learning_rate=5e-4,             # Learning rate
    per_device_train_batch_size=16, # Batch size per device
    num_train_epochs=10,             # Number of epochs
    save_strategy="epoch",          # Save checkpoint every epoch
    logging_dir="./logs",           # Log directory
    logging_steps=10000,               # Log every 10 steps
    load_best_model_at_end=True,     # Load the best model at the end of training
    report_to = ["wandb"],
    fp16=True
)

In [13]:
from sklearn.metrics import accuracy_score

In [19]:
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = preds.argmax(axis=-1)
    acc = accuracy_score(preds, labels)
    return {"accuracy" : acc}

In [20]:
trainer = Trainer(
    model=FNet,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics = compute_metrics
)

In [22]:
import torch
import os

torch.cuda.empty_cache()
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [23]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.701713,0.506861
2,No log,0.705062,0.506861
3,No log,0.693857,0.493139
4,No log,0.702902,0.493139
5,No log,0.694791,0.493139
6,No log,0.704546,0.506861
7,No log,0.693133,0.506861
8,No log,0.693336,0.506861
9,No log,0.694461,0.493139
10,No log,0.693056,0.506861


TrainOutput(global_step=5350, training_loss=0.7002999050817758, metrics={'train_runtime': 5955.1749, 'train_samples_per_second': 28.713, 'train_steps_per_second': 0.898, 'total_flos': 3.040869403527168e+16, 'train_loss': 0.7002999050817758, 'epoch': 10.0})