In this competition we tried different approaches to learn a proper ranking from the pairwise validation comparisons.

The idea is to create some features from the text using a model such as Detoxify and then use them to develop a ranking model. Since in validation we have two texts being compared, we simply take the difference between the features of the two texts (t1 - t2) and then train a model to figure out whether the less toxic text has been subtracted from the more toxic one or vice versa.

We use a linear support vector machine because we can later use its decision function, i.e. the signed distance from the decision boundary: since the model maximizes the margin around the boundary, we expect that distance to grow with the intensity of the difference in toxicity.

In the testing phase we subtract, in turn, different reference examples taken from the validation texts from each text to be scored. Our expectation is that by averaging the resulting decision-function values we obtain an average ranking that is comparable across all the scored texts.

In [None]:
## Install Detoxify from the attached dataset: copy the package sources next to
## the notebook, pip-install them, then delete the temporary copy.
!cp -r ../input/detoxify-sourcemodels/detoxify .
!pip install -q ./detoxify
!rm -r ./detoxify


## Copy the Detoxify pretrained checkpoints and the transformers configuration
## files from the dataset into the local torch-hub and huggingface cache
## directories (created first if missing).
!mkdir -p  /root/.cache/torch/hub/checkpoints
!mkdir -p  /root/.cache/huggingface/transformers
!cp -r ../input/detoxify-sourcemodels/torch/hub/checkpoints /root/.cache/torch/hub
!cp -r ../input/detoxify-sourcemodels/huggingface/transformers /root/.cache/huggingface

In [None]:
# Setting the environment variable TRANSFORMERS_OFFLINE=1 tells Transformers to
# use local files only and never try to look anything up online.
# This makes it possible to run Transformers in a firewalled or no-network
# environment, such as a Kaggle inference kernel.
import os
os.environ["TRANSFORMERS_OFFLINE"] = "1"

In [None]:
from detoxify import Detoxify

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GroupKFold, RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC

import pickle
from tqdm import tqdm
import torch

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the three competition tables in one pass: the pairwise validation
# comparisons, the sample submission and the comments that must be scored.
validation, sample, toscore = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/jigsaw-toxic-severity-rating/validation_data.csv',
        '../input/jigsaw-toxic-severity-rating/sample_submission.csv',
        '../input/jigsaw-toxic-severity-rating/comments_to_score.csv',
    )
)

In [None]:
# Feature frames: each starts with a single identifier column, and the model
# features are added as further columns later on.
# Two independent copies of the 'worker' column, one per side of a comparison.
less_df, more_df = (validation.loc[:, ['worker']].copy() for _ in range(2))
toscore_df = toscore.loc[:, ['comment_id']].copy()

In [None]:
def predict(df, comment_text, batch_size=256, device=None):
    """Add Detoxify scores for every comment to ``df`` in place.

    Each of the three Detoxify models ('original', 'unbiased',
    'multilingual') is run over ``comment_text`` in batches, and every score
    it returns is stored in ``df`` under the key ``"<model>_<score name>"``.

    Args:
        df: Mapping-like container (e.g. a DataFrame with one row per comment)
            that receives the new columns via item assignment.
        comment_text: Sequence of comment strings, in the same order as the
            rows of ``df``.
        batch_size: Number of comments sent to the model per call.
        device: Torch device passed to Detoxify. Defaults to ``'cuda'`` when a
            GPU is available and ``'cpu'`` otherwise.

    Returns:
        None. ``df`` is modified in place; it is left untouched when
        ``comment_text`` is empty.
    """
    # len() rather than truthiness so that arrays/Series are accepted too.
    if len(comment_text) == 0:
        return

    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    types = ['original', 'unbiased', 'multilingual']
    for t in types:
        print(f"Detoxify {t}")
        detox = Detoxify(t, device=device)

        # Stack the batches together: one growing list of scores per output key.
        stacked = {}
        for i in tqdm(range(0, len(comment_text), batch_size)):
            batch_pred = detox.predict(comment_text[i:i + batch_size])
            for key, values in batch_pred.items():
                stacked.setdefault(key, []).extend(values)

        # Save the results, prefixing each score name with the model type.
        for key, values in stacked.items():
            df[t + '_' + key] = values

        # Drop the model before loading the next one so that two models are
        # never resident on the device at the same time.
        del detox
        torch.cuda.empty_cache()

# Score the competition comments; predict() adds the feature columns to toscore_df in place.
predict(toscore_df, toscore.text.to_list())

In [None]:
# To save time, the Detoxify estimates for the two sides of the validation
# comparisons are restored from pickles prepared beforehand instead of being
# recomputed here.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('../input/detoxify-validation-prepared/less_df.pkl', 'rb') as less_file, \
        open('../input/detoxify-validation-prepared/more_df.pkl', 'rb') as more_file:
    less_df = pickle.load(less_file)
    more_df = pickle.load(more_file)

In [None]:
# Ask PyTorch to release the GPU memory it is still caching.
torch.cuda.empty_cache()

Since we are using a LinearSVC, we need to standardize the features for a better result.

In [None]:
# Standardize every feature column (all columns after the first, which is
# skipped as the identifier). The scaler is fitted on both sides of the
# validation comparisons stacked together and then applied unchanged to the
# comments to score.
ss = StandardScaler()
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack the two frames row-wise.
ss.fit(pd.concat([less_df, more_df]).iloc[:, 1:])

# NOTE(review): assumes the three frames share the same feature columns in
# the same order - confirm against the pickled frames.
less_df.iloc[:, 1:] = ss.transform(less_df.iloc[:, 1:])
more_df.iloc[:, 1:] = ss.transform(more_df.iloc[:, 1:])
toscore_df.iloc[:, 1:] = ss.transform(toscore_df.iloc[:, 1:])

We make 600 comparisons (5 folds × 120 repeats) in order to extract enough rankings to average.

In [None]:
# Repeated K-fold loop. For every fold:
#   1. build pairwise difference features from the training comparisons,
#   2. tune and fit a LinearSVC that tells "more - less" from "less - more",
#   3. measure how often the model scores the more toxic text higher on the
#      held-out comparisons,
#   4. score the comments to rank against a random validation reference.
cv_score = []
cv_ranks = []        # one vector of decision scores per fold, for the comments to score
result_x_list = []
scores = []          # held-out pairwise accuracy of every fold
kf = RepeatedKFold(n_splits=5, n_repeats=120)

for k, (train_index, test_index) in enumerate(kf.split(validation)):
    print(f"\nCV FOLD {k}")
    best_accuracy = 0.0
    best_c = 1.0

    # Creating factual (more - less, label 1) and counter-factual
    # (less - more, label 0) comparisons. The counter-factual rows must be the
    # negated factual ones; subtracting less from itself would give all-zero
    # rows that carry no information.
    more_train = more_df.iloc[train_index, 1:]
    less_train = less_df.iloc[train_index, 1:]
    X = np.vstack([more_train - less_train, less_train - more_train])
    y = [1] * len(train_index) + [0] * len(train_index)

    # Tuning a LinearSVC to correctly classify comparisons
    for c_value in [1000, 100, 10, 1, 0.1, 0.01, 0.001]:
        svc = LinearSVC(penalty='l2', loss='squared_hinge', dual=False, C=c_value, fit_intercept=False)
        cv_scores = cross_val_score(svc, X, y, scoring='accuracy', cv=3, n_jobs=-1)
        accuracy = np.mean(cv_scores)
        print(f"With C={c_value} ranking achieves {accuracy:0.3f} accuracy")

        if accuracy > best_accuracy:
            best_c = c_value
            best_accuracy = accuracy

    # Retraining on all the data with the best C
    svc = LinearSVC(penalty='l2', loss='squared_hinge', dual=False, C=best_c, fit_intercept=False)
    svc.fit(X, y)

    # Two random reference examples from the validation set, one drawn from
    # each side of the comparisons.
    ms_c = less_df.iloc[np.random.randint(len(less_df)), 1:]
    ls_c = more_df.iloc[np.random.randint(len(more_df)), 1:]

    # For CV purposes both texts of a held-out pair are compared with the SAME
    # reference. The model is linear with no intercept, so different references
    # would only shift the two scores by different constants and bias the
    # comparison (and therefore the reported accuracy).
    ms = svc.decision_function(more_df.iloc[test_index, 1:] - ms_c)
    ls = svc.decision_function(less_df.iloc[test_index, 1:] - ms_c)

    # Share of held-out pairs in which the more toxic text gets the higher score
    score = np.mean(ms > ls)
    scores.append(score)
    print(f"FOLD {k}: validation accuracy = {score:0.5f}")

    # In testing phase we compare with a random example from validation,
    # picking one of the two references with equal probability.
    test_reference = ms_c if np.random.random() > 0.5 else ls_c
    cv_ranks.append(svc.decision_function(toscore_df.iloc[:, 1:] - test_reference))


Our reference for checking the CV score is the mean accuracy on the validation comparisons between two texts.

In [None]:
# Mean of the per-fold validation accuracies collected in `scores`.
print(f"Average cv score: {np.mean(scores):0.5f}")

In [None]:
# Average the per-fold decision scores (one row per fold, one column per
# comment) into a single general ranking score per comment.
mean_ranks = np.vstack(cv_ranks).mean(axis=0)

# Min-max rescale the averaged scores to the [0, 1] interval.
lowest = np.min(mean_ranks)
highest = np.max(mean_ranks)
ranks = (mean_ranks - lowest) / (highest - lowest)

# Write the submission file: one score per comment id.
sub = pd.DataFrame({'comment_id': toscore.comment_id, 'score': ranks})
sub.to_csv("submission.csv", index=False)