In [75]:
# Transformers installation
# ! pip install transformers datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

# Text classification

In [76]:
from datasets import load_dataset

liar = load_dataset("liar")

In [77]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [78]:
def preprocess_function(examples):
    return tokenizer(examples["statement"], truncation=True)

In [79]:
tokenized_liar = liar.map(preprocess_function, batched=True)

Map:   0%|          | 0/1284 [00:00<?, ? examples/s]

In [80]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Evaluate

In [81]:
!pip install evaluate
import evaluate

accuracy = evaluate.load("accuracy")



In [82]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

## Train

In [83]:
label_mapping = {0: 0, 1: 0, 2: 1, 3: 1, 4: 0, 5:0}

tokenized_liar["train"] = tokenized_liar["train"].map(lambda x: {"label": label_mapping[x["label"]]})
tokenized_liar["test"] = tokenized_liar["test"].map(lambda x: {"label": label_mapping[x["label"]]})
tokenized_liar["validation"] = tokenized_liar["validation"].map(lambda x: {"label": label_mapping[x["label"]]})


Map:   0%|          | 0/1284 [00:00<?, ? examples/s]

In [84]:

id2label = {0: "FALSE", 1: "TRUE"}
label2id = {"FALSE": 0, "TRUE": 1}

In [85]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [86]:
# !pip install accelerate -U
# !pip install transformers[torch]

In [87]:
import torch
training_args = TrainingArguments(
    output_dir="/content",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_liar["train"],
    eval_dataset=tokenized_liar["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
# device = torch.device('cuda')
# model = model.to(device)
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.592439,0.693925
2,0.621400,0.58854,0.689252


TrainOutput(global_step=642, training_loss=0.6190581752503772, metrics={'train_runtime': 54.5649, 'train_samples_per_second': 376.396, 'train_steps_per_second': 11.766, 'total_flos': 515599001859840.0, 'train_loss': 0.6190581752503772, 'epoch': 2.0})

In [88]:
# from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# print('\nClassification Report: \n', classification_report(tokenized_liar["test"]["label"], (model_predictions > 0.5)))

output = trainer.predict(tokenized_liar["test"])
print(output)

predictions = []
for logi in output[0]:
  predicted_class_id = logi.argmax().item()
  predictions.append(predicted_class_id)

PredictionOutput(predictions=array([[-0.01683049, -0.8364295 ],
       [-0.1621165 , -0.49313122],
       [ 0.1122405 , -1.3199408 ],
       ...,
       [-0.05048397, -0.940371  ],
       [-0.10315879, -1.131582  ],
       [ 0.05026966, -1.0839444 ]], dtype=float32), label_ids=array([1, 0, 0, ..., 0, 0, 0]), metrics={'test_loss': 0.6079329252243042, 'test_accuracy': 0.6804364770070148, 'test_runtime': 1.0375, 'test_samples_per_second': 1236.601, 'test_steps_per_second': 39.517})


In [89]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print('\nClassification Report: \n', classification_report(tokenized_liar["test"]["label"], (predictions)))



Classification Report: 
               precision    recall  f1-score   support

           0       0.71      0.85      0.77       823
           1       0.58      0.38      0.46       460

    accuracy                           0.68      1283
   macro avg       0.65      0.61      0.62      1283
weighted avg       0.66      0.68      0.66      1283



In [90]:
from transformers import AutoTokenizer
text = ["The Covid 19 Vaccine has 5G","The Covid 19 Vaccine doesn't have 5G"]
def preprocess_function(examples):
    return tokenizer(examples, truncation=True, return_tensors="pt")
# inputs = [preprocess_function(input) for input in text]
# inputs = torch.tensor(inputs)
inputs = tokenizer.__call__(text, return_tensors="pt", padding=True)
inputs.to('cuda')

{'input_ids': tensor([[  101,  1996,  2522, 17258,  2539, 17404,  2038,  1019,  2290,   102,
             0,     0,     0],
        [  101,  1996,  2522, 17258,  2539, 17404,  2987,  1005,  1056,  2031,
          1019,  2290,   102]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [91]:
from transformers import AutoModelForSequenceClassification
with torch.no_grad():
    logits = model(**inputs).logits

In [92]:
print(logits)
for logi in logits:
  predicted_class_id = logi.argmax().item()
  print(predicted_class_id)
  print(model.config.id2label[predicted_class_id])

tensor([[ 0.0058, -0.8897],
        [-0.1319, -0.9663]], device='cuda:0')
0
FALSE
0
FALSE


In [93]:
import pandas as pd

df = df = pd.read_csv('t5_generated_fake_news.csv')
fake_news = df['generated_description']
fakenewslist = fake_news.tolist()
inputs = tokenizer.__call__(fakenewslist, return_tensors="pt", padding=True)
inputs.to('cuda')

with torch.no_grad():
    logits = model(**inputs).logits

for logi in logits:
  predicted_class_id = logi.argmax().item()
  # print(predicted_class_id)
  print(model.config.id2label[predicted_class_id])

In [94]:
from transformers import AutoTokenizer

gpt2 = torch.load('gpt2gens.pt')
inputs = tokenizer.__call__(gpt2, return_tensors="pt", padding=True)
inputs.to('cuda')

with torch.no_grad():
    logits = model(**inputs).logits

for logi in logits:
  predicted_class_id = logi.argmax().item()
  print(model.config.id2label[predicted_class_id])

FALSE
FALSE
FALSE
FALSE
FALSE
FALSE
FALSE
FALSE
TRUE
FALSE
FALSE
FALSE
FALSE
FALSE
TRUE
FALSE
FALSE
TRUE
