### Explaining RoBERTa Model

In [2]:
import torch
import numpy as np
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer,TextClassificationPipeline

# Create class for data preparation
class SimpleDataset:
    """Minimal indexable dataset wrapping a mapping of tokenizer outputs.

    The mapping is stored as given. Indexing returns a dict that holds, for
    every key of the mapping, the element at the requested position.
    """

    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        # The sample count is read from the "input_ids" entry.
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.tokenized_texts.items():
            sample[field] = values[idx]
        return sample

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer,TextClassificationPipeline

In [7]:
from datasets import load_dataset

# Load the two CSV files, registering them under the "train" and "test" split names.
raw_datasets = load_dataset(
    "csv",
    data_files={"train": "train.csv", "test": "test.csv"},
)

Using custom data configuration default-fa31e0d1c1d52458
Reusing dataset csv (C:\Users\RmmLeo10\.cache\huggingface\datasets\csv\default-fa31e0d1c1d52458\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)
100%|██████████| 2/2 [00:00<00:00, 105.27it/s]


In [8]:

# Load the tokenizer and the sequence-classification head from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
# Move the model to the GPU when PyTorch can see one; otherwise stay on the CPU
# so this cell does not crash on machines without CUDA.
model = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli").to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
def tokenize_function(examples):
    """Tokenize the "text" field of ``examples`` with the notebook's tokenizer.

    Uses ``padding="max_length"`` and ``truncation=True``. No explicit
    ``max_length`` is passed, so the tokenizer's own default length applies.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
    )


# Apply the tokenizer to every split, in batches.
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)

Loading cached processed dataset at C:\Users\RmmLeo10\.cache\huggingface\datasets\csv\default-fa31e0d1c1d52458\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-29d3a313a5470509.arrow
100%|██████████| 200/200 [00:20<00:00,  9.90ba/s]


In [7]:
# Unsampled tokenized splits.
full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["test"]

# Fixed-seed shuffles of each split, cut down to the first 2000 / 200 rows.
small_train_dataset = full_train_dataset.shuffle(seed=10).select(range(2000))
small_eval_dataset = full_eval_dataset.shuffle(seed=10).select(range(200))

Loading cached shuffled indices for dataset at C:\Users\RmmLeo10\.cache\huggingface\datasets\csv\default-fa31e0d1c1d52458\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-013c968df03650b2.arrow


In [8]:
from transformers import TrainingArguments

# Training configuration built from a single positional argument ("test_trainer").
# NOTE(review): `training_args` is reassigned in the following cell before any
# Trainer is constructed, so this instance appears to be unused. Confirm and
# consider removing this cell.
training_args = TrainingArguments("test_trainer")

In [9]:
# Fine-tuning hyperparameters; options not listed here are left at their defaults.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5.0,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=0,
    # Left disabled by the author (not passed to TrainingArguments):
    #   batch_size = 32
    #   max_seq_length = 128
)

In [10]:
from transformers import Trainer

# Trainer over the small train/eval subsets; no metric function is attached here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

In [None]:
# Run fine-tuning with the Trainer built above (trains `model` on small_train_dataset).
trainer.train()

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute classification accuracy from a ``(logits, labels)`` pair.

    Accuracy is calculated directly with numpy instead of
    ``datasets.load_metric("accuracy")``, which is deprecated and no longer
    available in recent ``datasets`` releases. The returned mapping keeps the
    same ``{"accuracy": float}`` shape.

    Args:
        eval_pred: Two-item sequence ``(logits, labels)``. The predicted class
            is the argmax of ``logits`` over its last axis.

    Returns:
        dict: ``{"accuracy": fraction of predictions equal to the labels}``.
    """
    logits, labels = eval_pred
    # Flatten both sides so that (N,) predictions and (N, 1) labels compare
    # element-wise instead of broadcasting to an N x N matrix.
    predictions = np.argmax(logits, axis=-1).ravel()
    references = np.asarray(labels).ravel()
    return {"accuracy": float(np.mean(predictions == references))}

In [None]:
# Rebuild the Trainer with compute_metrics attached, then evaluate on the
# small evaluation subset. The result of evaluate() is the cell's output.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=small_eval_dataset,
    train_dataset=small_train_dataset,
)
trainer.evaluate()

In [13]:
# Pick five training texts (rows 1111, 2222, 3333, 4444, 5555) for the explanations below.
text1, text2, text3, text4, text5 = (
    raw_datasets['train'][row]['text'] for row in (1111, 2222, 3333, 4444, 5555)
)

In [15]:
# Text-classification pipeline around the model and tokenizer loaded above.
# Use GPU 0 when CUDA is available and fall back to the CPU (-1) otherwise,
# so the cell also runs on machines without a GPU.
pipe = transformers.TextClassificationPipeline(
    tokenizer=tokenizer,
    model=model,
    device=0 if torch.cuda.is_available() else -1,
)

### SHAP Examples

In [19]:
import shap

In [20]:
def score_and_visualize(text):
    """Print the pipeline's prediction for ``text`` and render its SHAP text plot.

    A new ``shap.Explainer`` is built around ``pipe`` on every call.

    Args:
        text: The input to classify and explain (presumably a single string;
            it is wrapped in a one-element list before being passed on).
    """
    top_prediction = pipe([text])[0]
    print(top_prediction)

    text_explainer = shap.Explainer(pipe)
    explanation = text_explainer([text])
    shap.plots.text(explanation)

In [21]:
# Prediction and SHAP explanation for text1 (train row 1111).
score_and_visualize(text1)

{'label': 'NEUTRAL', 'score': 0.5439209938049316}


In [22]:
# Prediction and SHAP explanation for text2 (train row 2222).
score_and_visualize(text2)

{'label': 'NEUTRAL', 'score': 0.6882557272911072}


In [23]:
# Prediction and SHAP explanation for text3 (train row 3333).
score_and_visualize(text3)

{'label': 'NEUTRAL', 'score': 0.6551351547241211}


In [24]:
# Prediction and SHAP explanation for text4 (train row 4444).
score_and_visualize(text4)

{'label': 'NEUTRAL', 'score': 0.5168607234954834}


In [25]:
# Prediction and SHAP explanation for text5 (train row 5555).
score_and_visualize(text5)

{'label': 'NEUTRAL', 'score': 0.5129474997520447}
