# Public LB Score : 0.782 
# [Inference Notebook : https://www.kaggle.com/adldotori/huggingface-distilbertclassification-inference](https://www.kaggle.com/adldotori/huggingface-distilbertclassification-inference)

In [None]:
import pandas as pd
import numpy as np
import os.path as osp
from tqdm.notebook import tqdm

In [None]:
# Seed shared by NumPy and the train/validation split.
RANDOM_SEED = 42

# Directory holding the competition CSV files that are scored at the end.
INPUT_PATH = "/kaggle/input/jigsaw-toxic-severity-rating/"
# Directory the fine-tuned model is saved to.
OUTPUT_PATH = "checkpoints/"

np.random.seed(seed=RANDOM_SEED)

In [None]:
# Extract train.csv from the Toxic Comment Classification Challenge archive
# so the next cell can read it by its relative path.
!unzip /kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip

# Use the "Toxic Comment Classification Challenge" training data

In [None]:
# Load the extracted training CSV and preview its first rows.
train = pd.read_csv('train.csv')
train.head()

In [None]:
# Binary target for the 2-class classifier: 1 when a comment carries at least
# one toxicity label, 0 when it is completely clean.
#
# The previous version normalised a weighted label sum to [0, 1], replaced the
# zeros with 1 via `.where(target > 0, 1)` and then truncated with
# `astype(int)`. That mapped clean comments to 1 and almost every toxic
# comment to 0, i.e. the labels were inverted relative to the inference cell,
# which treats class 1 as "toxic".
# NOTE(review): any external inference code that compensated for the inverted
# labels must be updated together with this fix.
#
# The source label columns are left untouched (the old in-place doubling of
# `severe_toxic` compounded on every re-run and has no effect on a binary
# target), so this cell can be re-executed safely.
TOXICITY_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train['target'] = (train[TOXICITY_COLUMNS].sum(axis=1) > 0).astype(int)

# Toxic Rate

In [None]:
import plotly.express as px

# Count each class and order the counts by class value so that every count is
# paired with its own label explicitly, instead of relying on value_counts'
# frequency ordering happening to put the non-toxic class first.
toxic_counts = train.toxic.value_counts().sort_index()
class_names = {0: "nontoxic", 1: "toxic"}

fig = px.pie(
    values=toxic_counts.values,
    # Fall back to the raw value if an unexpected class shows up.
    names=[class_names.get(value, str(value)) for value in toxic_counts.index],
    width=500,
    height=500
)
fig.update_layout(
    showlegend=False,
    title="Toxic Rate"
)
# Show the class name and its share directly on each slice.
fig.update_traces(textinfo='label+percent')
fig.show()

# Dataset

In [None]:
# Pull both columns out as plain Python lists in one step each.
# Row-by-row `train.iloc[i][...]` access materialises a full row Series twice
# per iteration, which is needlessly slow on a dataset of this size.
train_texts = train['comment_text'].tolist()
train_labels = train['target'].tolist()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the examples for validation. Shuffling is disabled, so the
# split keeps the original row order.
split_options = dict(
    test_size=.2,
    random_state=RANDOM_SEED,
    shuffle=False
)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    train_texts,
    train_labels,
    **split_options
)

In [None]:
# Sanity check: show the first text/label pair of each split.
print(f'Train Example: \n text: {train_texts[0]}\n label: {train_labels[0]}\n\n')
print(f'Val Example: \n text: {test_texts[0]}\n label: {test_labels[0]}\n\n')

## Tokenizer

In [None]:
from transformers import DistilBertTokenizerFast
# Tokenizer for the same 'distilbert-base-uncased' checkpoint the model is loaded from.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [None]:
# Tokenize each split in a single call. Per the Hugging Face tokenizer docs,
# truncation=True clips texts to the model's maximum length and padding=True
# pads every sequence to the longest one in that call.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

## Torch Dataset

In [None]:
import torch

class JigSawDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping (anything exposing ``.items()``) from field name
            to an indexable collection holding one entry per example.
        labels: Indexable, sized collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Every encoding field is converted to a tensor under its own key and
        the label is added under ``'labels'``.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits so they can be handed to the Trainer as datasets.
train_dataset = JigSawDataset(train_encodings, train_labels)
test_dataset = JigSawDataset(test_encodings, test_labels)

# Fine-tuning with Trainer

## wandb setting

In [None]:
import wandb

# Log in to Weights & Biases with a token stored in Kaggle Secrets under the
# label "wandb_api". If the secrets client is unavailable or the login fails,
# fall back to anonymous mode and tell the user how to configure the token.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("wandb_api")
    wandb.login(key=api_key)
    anony = None
# `Exception` rather than a bare `except:` so that KeyboardInterrupt and
# SystemExit still propagate instead of being silently swallowed.
except Exception:
    anony = "must"
    print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')

In [None]:
# Start a W&B run for this training session.
# NOTE(review): `anony` set in the login cell is never passed here — confirm
# whether `anonymous=anony` was intended.
wandb.init(project="jigsaw-toxic-severity-rating", entity="taeho")
# Presumably asks the W&B integration to upload the trained model — verify.
%env WANDB_LOG_MODEL=true

## download pretrained model

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Hyperparameters and logging configuration handed to the Trainer.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    save_strategy="no",              # save strategy
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=500,
    report_to="wandb"
)

# Load the pretrained 'distilbert-base-uncased' checkpoint into a
# sequence-classification model.
# NOTE(review): num_labels is not set, so the library default applies
# (presumably 2, which the class-1 softmax at inference relies on) — confirm.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

## training

In [None]:
# Bind the model, its training configuration and both dataset splits together.
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset            # held-out split used by trainer.evaluate()
)

In [None]:
# Fine-tune the model on train_dataset using training_args.
trainer.train()

In [None]:
# Save the fine-tuned model under OUTPUT_PATH.
trainer.save_model(OUTPUT_PATH)

# Evaluate

In [None]:
# Evaluate on the eval_dataset given to the Trainer (the held-out split).
trainer.evaluate()

## Inference

In [None]:
# Run inference on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Optional: reload a saved checkpoint instead of reusing the in-memory model.
# model = DistilBertForSequenceClassification.from_pretrained("path_to_checkpoint").to(device)

In [None]:
# Load the submission template and the comments that have to be scored.
sample_submission = pd.read_csv(osp.join(INPUT_PATH, 'sample_submission.csv'))
comments_to_score = pd.read_csv(osp.join(INPUT_PATH, 'comments_to_score.csv'))
# Initialise as float: the column later receives softmax probabilities, and
# writing floats into an integer column triggers pandas' incompatible-dtype
# warning (an error in newer pandas versions).
comments_to_score['score'] = 0.0
comments_to_score.head()

In [None]:
# Score every comment with the probability the model assigns to class 1.
# Make sure the model lives on the inference device and has dropout disabled.
model.to(device)
model.eval()

scores = []
# No gradients are needed for inference; skipping them saves memory and time.
with torch.no_grad():
    for text in tqdm(comments_to_score['text']):
        # Let the tokenizer truncate to 512 tokens, as it did for the training
        # data. Slicing the encoded ids afterwards would cut off the final
        # special token of long comments.
        input_ids = tokenizer.encode(
            text,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(device)
        logits = model(input_ids)[0]
        probabilities = torch.softmax(logits, dim=1)
        scores.append(probabilities[0][1].item())

# Assign the whole column at once, in the same row order that was iterated.
comments_to_score['score'] = scores

In [None]:
# Replace raw scores with their rank; method='first' breaks ties by order of
# appearance, so every comment receives a distinct value.
comments_to_score['score'] = comments_to_score['score'].rank(method='first')

In [None]:
# Display the comments ordered from lowest to highest score (result is not stored).
comments_to_score.sort_values('score')

Rows at the top are the least toxic; rows at the bottom are the most toxic.

In [None]:
# Join the scores onto the submission by comment_id.
# Assigning a sorted Series does not reorder anything, because pandas
# re-aligns on the index during column assignment. The previous
# `sort_values('comment_id')` was therefore a no-op, and the result was only
# correct if both files listed the comments in the same row order.
score_by_id = comments_to_score.set_index('comment_id')['score']
sample_submission['score'] = sample_submission['comment_id'].map(score_by_id)

In [None]:
# Display the final submission table.
sample_submission