In [1]:
!pip install pandas transformers torch datasets scikit-learn joblib



In [2]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import Dataset
import joblib

In [None]:
# Read the raw training table from disk.
train_df = pd.read_csv('train.csv')

# Replace missing title/content values with empty strings and force both
# columns to str, then build the single 'text' field fed to the model.
for column in ('title', 'content'):
    train_df[column] = train_df[column].fillna('').astype(str)
train_df['text'] = train_df['title'].str.cat(train_df['content'], sep=' ')

# Map the 'target' values to integer class ids.
le = LabelEncoder()
train_df['label'] = le.fit_transform(train_df['target'])

# Persist the fitted encoder so predictions can be decoded at inference time.
joblib.dump(le, 'label_encoder.pkl')

# Number of distinct classes; used to size the classification head.
num_labels = len(le.classes_)

# Wrap only the model inputs (text + integer label) in a Hugging Face dataset.
train_dataset = Dataset.from_pandas(train_df.loc[:, ['text', 'label']])

In [None]:
# ---------------------- 🔐 Hugging Face Token Setup ----------------------
# 1. Go to https://huggingface.co/join to create a free account (if you don't have one).
# 2. Then go to https://huggingface.co/settings/tokens
# 3. Click "New token", choose the role (e.g., read), and copy the token.
# 4. In your notebook, run the following code to log in:
# from huggingface_hub import login
# login()  # Paste your token when prompted
#
# Once logged in, you can load private models, push to the hub, or use hosted models securely.

In [None]:
#from huggingface_hub import login
#login()

In [5]:
# Fetch the tokenizer that belongs to the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained('mental/mental-roberta-base')


def tokenize_function(examples):
    """Tokenize one batch of rows from a dataset that has a 'text' column.

    Every entry of ``examples['text']`` is coerced to ``str`` first, then the
    batch is passed to the module-level ``tokenizer``. Sequences are truncated
    and padded to exactly 512 tokens.

    Returns whatever the tokenizer call returns for the batch.
    """
    batch_texts = list(map(str, examples['text']))
    return tokenizer(
        batch_texts,
        max_length=512,
        truncation=True,
        padding='max_length',
    )


# Tokenize the whole training dataset in batches.
tokenized_train = train_dataset.map(tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/327 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Map:   0%|          | 0/22151 [00:00<?, ? examples/s]

In [6]:
# Split into train and validation (80% / 20%).
# The split shuffles the rows; a fixed seed keeps the partition -- and therefore
# the validation metrics reported during training -- reproducible across
# kernel restarts.
split_dataset = tokenized_train.train_test_split(test_size=0.2, seed=42)
train_data = split_dataset['train']
val_data = split_dataset['test']

In [7]:
# Instantiate the pretrained encoder with a sequence-classification head that
# has one output per encoded label (num_labels comes from the label encoder).
model = AutoModelForSequenceClassification.from_pretrained('mental/mental-roberta-base', num_labels=num_labels)

config.json:   0%|          | 0.00/682 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at mental/mental-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Hyper-parameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    # when training finishes.
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

# Define metrics computation
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (the true class ids) and
            ``predictions`` (model outputs; the argmax over the last axis is
            taken as the predicted class id).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    logits = pred.predictions
    # If predictions arrives as a tuple of arrays, use the first element
    # (assumed to be the logits) instead of failing on tuple.argmax.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = logits.argmax(-1)
    # zero_division=0 yields the same scores as the default but without a
    # warning every time some class is never predicted in the evaluation set.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

# Wire the model, the run configuration, both data splits and the metric
# callback into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_data,
    train_dataset=train_data,
)

In [18]:
# Train the model
train_output = trainer.train()

# Persist the fine-tuned model and its tokenizer. The inference cell loads the
# model from './bertweet_finetuned', so the directory has to be written here for
# the notebook to survive Restart & Run All. Because the training arguments set
# load_best_model_at_end=True, the model saved here is the best checkpoint.
trainer.save_model('./bertweet_finetuned')
tokenizer.save_pretrained('./bertweet_finetuned')

# Keep the training summary as the cell's displayed output.
train_output



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7323,0.702474,0.756263,0.756798,0.76227,0.756263
2,0.5592,0.68959,0.756037,0.759881,0.772282,0.756037
3,0.3854,0.675746,0.783345,0.780964,0.781996,0.783345




TrainOutput(global_step=1662, training_loss=0.5627530587278979, metrics={'train_runtime': 3228.2693, 'train_samples_per_second': 16.467, 'train_steps_per_second': 0.515, 'total_flos': 1.398736045338624e+16, 'train_loss': 0.5627530587278979, 'epoch': 3.0})

In [None]:
# Disable wandb (kept for backward compatibility; the inference Trainer below
# also sets report_to="none" explicitly).
import os
os.environ["WANDB_DISABLED"] = "true"

# Load test data
test_df = pd.read_csv('test.csv')

# Handle missing values and combine text, mirroring the training preprocessing
for column in ('title', 'content'):
    test_df[column] = test_df[column].fillna('').astype(str)
test_df['text'] = test_df['title'] + ' ' + test_df['content']

# Create Hugging Face dataset
test_dataset = Dataset.from_pandas(test_df[['text', 'id']])

# Tokenize test data, then drop the raw columns the model does not consume
tokenized_test = test_dataset.map(tokenize_function, batched=True).remove_columns(['id', 'text'])

# Load saved model. This directory must already contain a saved fine-tuned
# model (e.g. written with trainer.save_model) before this cell runs.
model = AutoModelForSequenceClassification.from_pretrained('./bertweet_finetuned')

# Initialize Trainer for inference with explicit arguments instead of the
# library defaults: same output directory and evaluation batch size as the
# training run, and no experiment-tracking integrations.
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir='./results',
        per_device_eval_batch_size=64,
        report_to="none",
    ),
)

# Predict: take the highest-scoring class id for every test row
predictions = trainer.predict(tokenized_test)
pred_labels = np.argmax(predictions.predictions, axis=-1)

# Decode class ids back to the original target values
le = joblib.load('label_encoder.pkl')
test_df['predicted_target'] = le.inverse_transform(pred_labels)

# Save predictions
test_df[['id', 'predicted_target']].to_csv('submission.csv', index=False)