In [1]:
import pandas as pd
from datasets import Dataset
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification,Trainer, TrainingArguments

import torch
import numpy as np

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from tqdm import tqdm
from ipywidgets import interact
import warnings

np.random.seed(0)
to_bool = lambda x: len(str(x)) > 0 and str(x).lower() not in ['nan','false','0']

# Rules Classification

Classify rules according to paper

Tutorial Used: [RoBERTa Classification](https://jesusleal.io/2020/10/20/RoBERTA-Text-Classification/)

- [] TODO: find the paper that classifies the rules
- [] TODO: find the paper that mentions best general hyperparameters for text classification

In [2]:
cols = ['Prescriptive', 'Restrictive', 'Post Content', 'Post Format', 'User-Related', 'Not a Rule', 'Spam, Low Quality, Off-Topic, and Reposts', 'Post Tagging & Flairing', 'Peer Engagement', 'Links & External Content', 'Images', 'Commercialization', 'Illegal Content', 'Divisive Content', 'Respect for Others', 'Brigading', 'Ban Mentioned', 'Karma/Score Mentioned']

all_data = pd.read_csv('./data_400_human/all_data.csv')
all_data = all_data[all_data['rule'].notna() & all_data['rule'].apply(lambda x: type(x) == str and len(x.strip()) > 0)]

# Convert the dataset to correct types
all_data[cols] = all_data[cols].applymap(to_bool)
all_data['subreddit'] = all_data['subreddit'].astype(str)
all_data['rule'] = all_data['rule'].astype(str)
all_data['labels'] = all_data.apply(lambda row: [float(row[col]) for col in cols], axis=1)

# Drop duplicate columns if in csv
for dcol in ['subreddit.1', 'rule.1', 'rule_description.1']:
    if dcol in all_data.columns:
        all_data.drop(dcol, axis=1, inplace=True)

train = all_data.sample(frac=0.8, random_state=0)
train.to_csv('./data_400_human/train.csv', index=False)

test = all_data.drop(train.index)
test.to_csv('./data_400_human/test.csv', index=False)

  all_data[cols] = all_data[cols].applymap(to_bool)


In [12]:
len(all_data), len(train), len(test)

(300, 240, 60)

In [3]:
train_data = Dataset.from_pandas(train)
test_data = Dataset.from_pandas(test)

# Remove from_pandas artifacts
train_data = train_data.remove_columns("__index_level_0__")
test_data = test_data.remove_columns("__index_level_0__")

In [4]:
train_data

Dataset({
    features: ['subreddit', 'rule', 'rule_description', 'Prescriptive', 'Restrictive', 'Post Content', 'Post Format', 'User-Related', 'Not a Rule', 'Spam, Low Quality, Off-Topic, and Reposts', 'Post Tagging & Flairing', 'Peer Engagement', 'Links & External Content', 'Images', 'Commercialization', 'Illegal Content', 'Divisive Content', 'Respect for Others', 'Brigading', 'Ban Mentioned', 'Karma/Score Mentioned', 'labels'],
    num_rows: 240
})

In [5]:
num_labels = len(train_data['labels'][0])
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', problem_type="multi_label_classification")
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels, id2label={str(i): label for i, label in enumerate(cols)}, label2id={label: i for i, label in enumerate(cols)})

def tokenize_data(data):
    tokenized_data = tokenizer(data['rule'], padding=True, truncation=True, return_tensors='pt')
    tokenized_data['labels'] = torch.tensor(data['labels'])
    return Dataset.from_dict(tokenized_data)

train_data = tokenize_data(train_data)
test_data = tokenize_data(test_data)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
training_args = TrainingArguments(
    output_dir='/projects/bdata/reddit_rules_classification/models',          # output directory
    num_train_epochs=10,             # total number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=113,                # number of warmup steps for learning rate scheduler. 6% of total steps
    weight_decay=0.01,               # strength of weight decay
    logging_dir='/projects/bdata/reddit_rules_classification/models/logs',            # directory for storing logs
    evaluation_strategy="epoch"
)

def compute_metrics(pred) -> dict:
    # computes accuracy, f1, and loss for multilabel classification
    labels = pred.label_ids
    preds = pred.predictions
    
    pred_labels = np.where(preds > 0.5, 1, 0)
    acc = accuracy_score(labels, pred_labels)
    precision, recall, f1, support = precision_recall_fscore_support(labels, pred_labels, average='weighted')
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'support': support,
    }
    

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [7]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    trainer.evaluate()

In [8]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Support
1,No log,0.66988,0.0,0.0,0.0,0.0,
2,No log,0.491465,0.05,0.426502,0.42668,0.432584,
3,No log,0.353371,0.066667,0.465584,0.418352,0.52809,
4,No log,0.309238,0.066667,0.465584,0.418352,0.52809,
5,No log,0.279462,0.066667,0.472758,0.443956,0.505618,
6,No log,0.254212,0.083333,0.547586,0.564859,0.544944,
7,No log,0.240125,0.066667,0.547954,0.575304,0.52809,
8,No log,0.230906,0.183333,0.581556,0.636537,0.578652,
9,No log,0.216816,0.2,0.615815,0.655948,0.595506,
10,No log,0.212695,0.216667,0.631518,0.762133,0.595506,


In [9]:
trainer.evaluate()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.21269512176513672,
 'eval_accuracy': 0.21666666666666667,
 'eval_f1': 0.6315184113164831,
 'eval_precision': 0.7621326539500043,
 'eval_recall': 0.5955056179775281,
 'eval_support': None,
 'eval_runtime': 0.7888,
 'eval_samples_per_second': 76.06,
 'eval_steps_per_second': 5.071,
 'epoch': 10.0}

In [10]:
trainer.save_model()

import json
with open('/projects/bdata/reddit_rules_classification/models/train_log.json', 'w') as f:
    f.write(json.dumps({
        'train': [x for x in trainer.state.log_history if 'loss' in x],
        'eval': [x for x in trainer.state.log_history if 'eval_loss' in x],
        'final': trainer.evaluate(),
    }))

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
trainer.state.log_history

[{'eval_loss': 0.6698799729347229,
  'eval_accuracy': 0.0,
  'eval_f1': 0.0,
  'eval_precision': 0.0,
  'eval_recall': 0.0,
  'eval_support': None,
  'eval_runtime': 0.793,
  'eval_samples_per_second': 75.659,
  'eval_steps_per_second': 5.044,
  'epoch': 1.0,
  'step': 15},
 {'eval_loss': 0.49146509170532227,
  'eval_accuracy': 0.05,
  'eval_f1': 0.4265016473768692,
  'eval_precision': 0.426679885437321,
  'eval_recall': 0.43258426966292135,
  'eval_support': None,
  'eval_runtime': 0.7893,
  'eval_samples_per_second': 76.019,
  'eval_steps_per_second': 5.068,
  'epoch': 2.0,
  'step': 30},
 {'eval_loss': 0.3533714711666107,
  'eval_accuracy': 0.06666666666666667,
  'eval_f1': 0.46558398640355014,
  'eval_precision': 0.41835205992509367,
  'eval_recall': 0.5280898876404494,
  'eval_support': None,
  'eval_runtime': 0.7877,
  'eval_samples_per_second': 76.174,
  'eval_steps_per_second': 5.078,
  'epoch': 3.0,
  'step': 45},
 {'eval_loss': 0.30923786759376526,
  'eval_accuracy': 0.066666