In [1]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=aa5de0a8de7bd16942dfe4bfe94ee6c35412cf0f4ace20c4d6fc206c50da2561
  Stored in directory: /root/.cache/pip/wheels/bf/06/fb/d59c1e5bd1dac7f6cf61ec0036cc3a10ab8fecaa6b2c3d3ee9
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2
[0m

In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset
import os
os.environ["WANDB_DISABLED"] = "true"
import numpy as np
import torch
from torch.utils.data import Dataset
from sentence_transformers import util
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split

In [3]:
task = "rte" # recognising textual entailment (1 among the 9 GLUE tasks)
metric = load_metric("glue", task) # This will be accuracy

Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

In [4]:
# This function is to be fed into the HuggingFace Trainer API to compute the accuracy
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [5]:
# pytorch dataset class for recognising textual entailment
class RTE_Dataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [6]:
# Thus us basically a pretrained "distilbert-base-uncased" finetuned for our dataset
def entailment_model(train_dataset, validation_dataset):
  print(train_dataset.shape, validation_dataset.shape)
  model_checkpoint = "distilbert-base-uncased"
  train_batch_size = 8 # Hyperparameter (can be tuned)
  val_batch_size = 1 # Hyperparameter (can be tuned)

  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
  train_encodings = tokenizer(train_dataset["Tweet(Hypothesis)"].to_list(), train_dataset["RelevantSentences(Premise)"].tolist(), padding=True, truncation=True)
  validation_encodings = tokenizer(validation_dataset["Tweet(Hypothesis)"].to_list(), validation_dataset["RelevantSentences(Premise)"].tolist(), padding=True, truncation=True)

  label_dict = {"fake": 0, "real": 1}
  train_labels = train_dataset["Label"].map(label_dict).to_list()
  validation_labels = validation_dataset["Label"].map(label_dict).to_list()

  # creating the pytorch training and validation datasets from the tokenized encodings
  train_dataset_torch = RTE_Dataset(train_encodings, train_labels)
  validation_dataset_torch = RTE_Dataset(validation_encodings, validation_labels)

  # initialising the model and adding one output neural layer for classification
  num_labels = 2
  model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

  # training arguments to customize the training
  # consists some hyperparameters like weight decay, epochs
  training_args = TrainingArguments(
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    output_dir='./results',          # output directory
    num_train_epochs=10,              # total number of training epochs
    per_device_train_batch_size=train_batch_size,  # batch size per device during training
    per_device_eval_batch_size=val_batch_size,   # batch size for evaluation
    weight_decay=0.01,               # strength of weight decay
    metric_for_best_model="accuracy",
    load_best_model_at_end=True
  )

  # prints the deviceee - cuda or cpu
  print("Training device:", training_args.device)

  # using the Trainer API to specify training
  trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset_torch,         # training dataset
    eval_dataset=validation_dataset_torch,     # evaluation dataset
    compute_metrics=compute_metrics
  )

  # training the model
  trainer.train()

  # using the trained model to return the predictions object and extracting accuracy from it
  predictions = trainer.predict(test_dataset=validation_dataset_torch)
  print("Accuracy after finetuning:", predictions.metrics['test_accuracy'])

In [7]:
# This uses the 'stsb-mpnet-base-v2' pretrained model but it is not fine-tuned on our dataset
# generates embeddings and classifies entailmenr based on threshold
def entailment_without_finetuning(model, tweet, evidence_set, threshold):
    tweet_embedding = model.encode(tweet, convert_to_tensor=True)
    evidence_embedding = model.encode(evidence_set, convert_to_tensor = True)
    cosine_score = util.pytorch_cos_sim(tweet_embedding, evidence_embedding)
    similarity_score = cosine_score.item()
    if similarity_score > threshold:
        return "real"
    else:
        return "fake"

In [10]:
train_df_list = []
for i in range(460):
    train_df_i = pd.read_csv(f'../input/twitter-training-set-entailment/training_dataset_{i}.csv')
    train_df_list.append(train_df_i)

In [11]:
train_df = pd.concat(train_df_list, axis=0)
train_df.shape

(9200, 5)

In [12]:
print('Starting to train (fine-tune distilbert-base-uncased)...')
train_df = train_df.sample(frac=1).reset_index() # shuffle the dataset
# train_df.dropna(inplace=True)
train_df['Tweet(Hypothesis)'] = train_df['Tweet(Hypothesis)'].fillna("")
train_df['RelevantSentences(Premise)'] = train_df['RelevantSentences(Premise)'].fillna("")

Starting to train (fine-tune distilbert-base-uncased)...


In [17]:
print('train_df shape before train_test_split:', train_df.shape)
train_dataset, validation_dataset = train_test_split(train_df, test_size=0.25)

train_df shape before train_test_split: (9200, 6)


In [19]:
train_df['Tweet(Hypothesis)'].isna().sum(), train_df['RelevantSentences(Premise)'].isna().sum()

(0, 0)

In [20]:
entailment_model(train_dataset, validation_dataset)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.18.0",
  "vocab_size": 30522
}



(6900, 6) (2300, 6)


loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/0e1bbfda7f63a99bb52e3915dcf10c3c92122b827d92eb2d34ce94ee79ba486c.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/75abb59d7a06f4f640158a9bfcde005264e59e8d566781ab1415b139d2e4c603.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/8c8624b8ac8aa99c60c912161f8332de003484428c47906d7ff7eb7f73eecdbb.20430bd8e10ef77a7d2977accef

Training device: cuda:0


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3269,0.293721,0.898261
2,0.2192,0.24027,0.900435
3,0.1969,0.310303,0.903478
4,0.1624,0.272088,0.91087
5,0.146,0.353338,0.903478
6,0.1418,0.364585,0.905652
7,0.1275,0.392493,0.900435
8,0.1002,0.459173,0.906957
9,0.0842,0.463725,0.907826
10,0.0774,0.503498,0.906087


***** Running Evaluation *****
  Num examples = 2300
  Batch size = 1
Saving model checkpoint to ./results/checkpoint-863
Configuration saved in ./results/checkpoint-863/config.json
Model weights saved in ./results/checkpoint-863/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2300
  Batch size = 1
Saving model checkpoint to ./results/checkpoint-1726
Configuration saved in ./results/checkpoint-1726/config.json
Model weights saved in ./results/checkpoint-1726/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2300
  Batch size = 1
Saving model checkpoint to ./results/checkpoint-2589
Configuration saved in ./results/checkpoint-2589/config.json
Model weights saved in ./results/checkpoint-2589/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2300
  Batch size = 1
Saving model checkpoint to ./results/checkpoint-3452
Configuration saved in ./results/checkpoint-3452/config.json
Model weights saved in ./results/checkpoint-3452/pytorch_model.bin

Accuracy after finetuning: 0.9108695652173913
