In [1]:
!pip install -U transformers --q
!pip install sentencepiece --q
!pip install tokenizers  --q

[K     |████████████████████████████████| 5.5 MB 16.3 MB/s 
[K     |████████████████████████████████| 163 kB 70.1 MB/s 
[K     |████████████████████████████████| 7.6 MB 52.9 MB/s 
[K     |████████████████████████████████| 1.3 MB 17.0 MB/s 
[?25h

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import transformers
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
# from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [4]:
import pandas as pd
df = pd.read_csv("5kCorpora.csv")
df

Unnamed: 0,Title,Label
0,Donald Trump Sends Out Embarrassing New Year’...,0
1,Drunk Bragging Trump Staffer Started Russian ...,0
2,Sheriff David Clarke Becomes An Internet Joke...,0
3,Trump Is So Obsessed He Even Has Obama’s Name...,0
4,Pope Francis Just Called Out Donald Trump Dur...,0
...,...,...
4995,U.S. officials discussing withholding some aid...,1
4996,Any U.S. military transgender ban could face m...,1
4997,"Transgender soldiers, veterans shaken by Trump...",1
4998,Massachusetts immigrant ruling could guide oth...,1


In [5]:
#df['Label'] = df['Label'].map({'Fake':0,'Real':1})

In [6]:
df = df.sample(frac=1).reset_index()
df = df[['Title','Label']]
df = df[:5000]
df.Label.value_counts()

1    2500
0    2500
Name: Label, dtype: int64

In [7]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
model_path = 'distilbert-base-uncased-finetuned-sst-2-english'
nli_model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, num_labels=2)

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [8]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.1, random_state=42)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=42)
df_train.shape, df_val.shape, df_test.shape

((4500, 2), (250, 2), (250, 2))

In [9]:
def tokenize_df(tokenizer, data, has_label=True):
        #convert to list
        fake_list = df['Title'].tolist()
        #returns dictionary with keys: input_ids, attention_mask)
        encoding_dict = tokenizer(fake_list, padding="max_length", max_length=64,
                                  truncation=True, return_token_type_ids=False)
        #add 'label' key if we are making train or validation data
        if has_label: 
            encoding_dict['label'] = df['Label'].tolist()
        #convert dictionary of lists into list of dictionaries
        return [dict(zip(encoding_dict, t)) for t in zip(*encoding_dict.values())]

#apply above function to create lists to use for our data:
X_train_tokenized = tokenize_df(tokenizer, df_train)
X_val_tokenized = tokenize_df(tokenizer, df_val)
X_test_tokenized = tokenize_df(tokenizer, df_test, has_label=False)
train_dataset = X_train_tokenized
eval_dataset = X_val_tokenized
test_dataset = X_test_tokenized

In [10]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
nli_model.to(device)
print(f'Using {device}')

Using cuda


In [11]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc,
            'f1': f1,
            'precision': precision,
            'recall': recall}

training_args = TrainingArguments(
                output_dir="/content/drive/MyDrive/English",
                learning_rate=1e-4,
                num_train_epochs=2, 
                report_to="none",
                evaluation_strategy="steps", 
                eval_steps=600,
                save_steps=600,
                logging_steps=600,
                load_best_model_at_end=True
                )
trainer = Trainer(
    model=nli_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics)
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=4)])
trainer.train()

***** Running training *****
  Num examples = 5000
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1250
  Number of trainable parameters = 66955010


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
600,0.1767,0.075278,0.9856,0.985431,0.997133,0.974
1200,0.0396,0.004775,0.9988,0.998801,0.998003,0.9996


***** Running Evaluation *****
  Num examples = 5000
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/English/checkpoint-600
Configuration saved in /content/drive/MyDrive/English/checkpoint-600/config.json
Model weights saved in /content/drive/MyDrive/English/checkpoint-600/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 5000
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/English/checkpoint-1200
Configuration saved in /content/drive/MyDrive/English/checkpoint-1200/config.json
Model weights saved in /content/drive/MyDrive/English/checkpoint-1200/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from /content/drive/MyDrive/English/checkpoint-1200 (score: 0.00477455323562026).


TrainOutput(global_step=1250, training_loss=0.10516315851211548, metrics={'train_runtime': 128.3585, 'train_samples_per_second': 77.907, 'train_steps_per_second': 9.738, 'total_flos': 165584248320000.0, 'train_loss': 0.10516315851211548, 'epoch': 2.0})

In [12]:
metrics=trainer.evaluate()
print(metrics)

***** Running Evaluation *****
  Num examples = 5000
  Batch size = 8


{'eval_loss': 0.00477455323562026, 'eval_accuracy': 0.9988, 'eval_f1': 0.9988009592326139, 'eval_precision': 0.9980031948881789, 'eval_recall': 0.9996, 'eval_runtime': 9.5969, 'eval_samples_per_second': 520.999, 'eval_steps_per_second': 65.125, 'epoch': 2.0}


In [13]:
predictions = trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 5000
  Batch size = 8


In [14]:
y_pred = predictions.predictions
y_pred = [np.argmax(pred) for pred in y_pred]
y_pred[:20]

[1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0]

In [15]:
import numpy as np

In [16]:
df.head(10)

Unnamed: 0,Title,Label
0,House Speaker Ryan says special counsel should...,1
1,Sessions Too Intimidated To Answer Harris’s H...,0
2,Michigan governor denies misleading U.S. House...,1
3,The Walls Close In On Trump As His Disgraced ...,0
4,This Powerful Campaign Ad Announces A Serious...,0
5,WATCH: Trump Declares Himself One Of The Best...,0
6,"Allegations of Russian meddling in U.S., Europ...",1
7,‘Lunch Shaming’: Schools Punish Poor Kids Who...,0
8,"Putin, Trump to discuss North Korea on Tuesday...",1
9,Treasury's Mnuchin: Difficult not to cut taxes...,1


In [17]:
df.Title[8]

'Putin, Trump to discuss North Korea on Tuesday: IFX cites Kremlin aide'

In [18]:
df.Title[7]

' ‘Lunch Shaming’: Schools Punish Poor Kids Who Can’t Pay For Lunch With Appalling Humiliation (VIDEO)'

In [19]:
df.Title[3]

' The Walls Close In On Trump As His Disgraced National Security Adviser Faces Indictment From Team Mueller'

In [20]:
df.Title[1]

' Sessions Too Intimidated To Answer Harris’s Hot Questions At Hearing, So She SCORCHES Him On Twitter (Details)'

In [21]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])


In [37]:
X_test = ['Trump Said some INSALELY Racist Stuff Inside The Oval Office, And Witnesses Back It Up']
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

# Create torch dataset
test_dataset = Dataset(X_test_tokenized)

# Load trained model
model_path = "/content/drive/MyDrive/English/checkpoint-600"
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)

# Define test trainer
test_trainer = Trainer(model)

# Make prediction
raw_pred, _, _ = test_trainer.predict(test_dataset)

# Preprocess raw predictions
y_predict = np.argmax(raw_pred, axis=1)

loading configuration file /content/drive/MyDrive/English/checkpoint-600/config.json
Model config DistilBertConfig {
  "_name_or_path": "/content/drive/MyDrive/English/checkpoint-600",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": "sst-2",
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "vocab_size": 30522
}

loading weights file /content/drive/MyDrive/English/checkpoint-600/pytorch

In [38]:
y_predict

array([0])