In [1]:
import pandas as pd
import random
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import csv
import sys
import pickle

In [2]:
# Increase the field size limit to handle large text fields
csv.field_size_limit(sys.maxsize)

df_welfake = pd.read_csv('/content/processed_welfake_data.csv',
                      quotechar='"',
                      engine='python',
                      on_bad_lines='warn')

df_politifact = pd.read_csv(
    '/content/processed_politifact_data.csv',
)

In [3]:
df_welfake = df_welfake.fillna('')
df_welfake['label'] = 1 - df_welfake['label']

In [4]:
df_welfake.shape

(62719, 3)

In [5]:
df_welfake.head()

Unnamed: 0,title,clean_text,label
0,Après le succès de « Mariés au premier regard ...,,0
1,Why Hillary Clinton's Campaign Is Collapsing |...,,0
2,Anonymous: America’s Last Hope-You Have Been W...,America is at the end of her rope. Never befor...,0
3,Everyone Is Abandoning Hillary- Except for Geo...,Hillary's campaign is going down faster than s...,0
4,Just Another Day In the Life of the Clinton Cr...,"In the past 24 hours, some very stunning piece...",0


In [6]:
df_politifact = df_politifact.fillna('')

In [7]:
df_politifact.shape

(21317, 2)

In [8]:
df_politifact.head()

Unnamed: 0,clean_news,label
0,dark spots on potato chips are infected with t...,0
1,ron johnson has voted against funding for law ...,0
2,antarctica has an entrance to a different worl...,0
3,falleci el papa emrito benedicto xvi una foto ...,0
4,in 2020 more people voted in america than ever...,1


In [9]:
df_politifact['title'] = ''
df_politifact.columns = ['clean_text', 'label', 'title']

In [10]:
df_politifact.head()

Unnamed: 0,clean_text,label,title
0,dark spots on potato chips are infected with t...,0,
1,ron johnson has voted against funding for law ...,0,
2,antarctica has an entrance to a different worl...,0,
3,falleci el papa emrito benedicto xvi una foto ...,0,
4,in 2020 more people voted in america than ever...,1,


In [11]:
all_data = pd.concat([df_welfake, df_politifact], axis = 0)

In [12]:
all_data = all_data.reset_index(drop = True)

In [13]:
all_data.to_csv('final_all_data.csv', index = False)

In [14]:
labels = all_data.pop("label")

In [15]:
train_data, tmp_data, train_labels, tmp_labels = train_test_split(all_data, labels, test_size = 0.3, random_state = 1126)
val_data, test_data, val_labels, test_labels = train_test_split(tmp_data, tmp_labels, test_size = 0.5, random_state = 1126)

In [16]:
train_texts = train_data.clean_text.values
val_texts = val_data.clean_text.values
test_texts = test_data.clean_text.values

train_titles = train_data.title.values
val_titles = val_data.title.values
test_titles = test_data.title.values

In [17]:
print(train_texts.shape)
print(val_texts.shape)
print(test_texts.shape)

(58825,)
(12605,)
(12606,)


In [18]:
# Save train indices
with open('train_indices.pkl', 'wb') as f:
    pickle.dump(train_data.index.tolist(), f)

# Save validation indices
with open('val_indices.pkl', 'wb') as f:
    pickle.dump(val_data.index.tolist(), f)

# Save test indices
with open('test_indices.pkl', 'wb') as f:
    pickle.dump(test_data.index.tolist(), f)


## Modeling

In [19]:
class NewsDataset(Dataset):
  def __init__(self, titles, texts, labels, tokenizer, max_len = 512):
    self.titles = titles
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    title = str(self.titles[idx])
    text = str(self.texts[idx])
    label = self.labels[idx]

    encoding = self.tokenizer.encode_plus(
        title,
        text,
        add_special_tokens = True,
        max_length = self.max_len,
        return_token_type_ids = True,
        padding = 'max_length',
        truncation = True,
        return_attention_mask = True,
        return_tensors = 'pt'
    )

    return {
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'token_type_ids': encoding['token_type_ids'].flatten(),
        'labels': torch.tensor(label, dtype = torch.long)
    }

In [20]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary'
    )
    acc = accuracy_score(labels, preds)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [21]:
def seed_everything(seed = 42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

In [22]:
MODEL_NAME = 'distilbert-base-uncased'
# MODEL_NAME = 'roberta-base'
SEED = 1126
BATCH_SIZE = 32
LEARNING_RATE = 1e-5
WEIGHT_DECAY  = 1e-2
EPOCHS = 4
SAVE_PATH = '/content/best_model.pth'

In [23]:
seed_everything(SEED)

In [24]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [25]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels = 2)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
train_y = train_labels.astype(int).values
val_y = val_labels.astype(int).values
test_y = test_labels.astype(int).values

train_dataset = NewsDataset(train_titles, train_texts, train_y, tokenizer)
val_dataset = NewsDataset(val_titles, val_texts, val_y, tokenizer)
test_dataset = NewsDataset(test_titles, test_texts, test_y, tokenizer)

In [27]:
training_args = TrainingArguments(
    output_dir = './saved_model',
    num_train_epochs = EPOCHS,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size = BATCH_SIZE,
    warmup_steps = 500,
    weight_decay = WEIGHT_DECAY,
    report_to = "none",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
    metric_for_best_model = "f1",
    save_total_limit = 1
)

In [28]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    compute_metrics = compute_metrics
)

In [29]:
import gc
torch.cuda.empty_cache()
gc.collect()

287

In [30]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1692,0.172746,0.920904,0.926944,0.903184,0.951987
2,0.1315,0.154554,0.926537,0.928977,0.947138,0.911499
3,0.0926,0.199736,0.926458,0.930432,0.927855,0.933022
4,0.0499,0.267218,0.924554,0.927927,0.934514,0.921433


TrainOutput(global_step=7356, training_loss=0.1248097199340953, metrics={'train_runtime': 3175.4097, 'train_samples_per_second': 74.101, 'train_steps_per_second': 2.317, 'total_flos': 3.11695789037568e+16, 'train_loss': 0.1248097199340953, 'epoch': 4.0})

In [31]:
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.19973567128181458, 'eval_accuracy': 0.9264577548591829, 'eval_f1': 0.9304315196998124, 'eval_precision': 0.9278551115102529, 'eval_recall': 0.9330222757375075, 'eval_runtime': 75.7605, 'eval_samples_per_second': 166.38, 'eval_steps_per_second': 5.201, 'epoch': 4.0}


In [32]:
import zipfile
import os
from google.colab import files

folder_to_zip = '/content/saved_model/checkpoint-7356'
zip_file_name = 'checkpoint-7356.zip'

with zipfile.ZipFile(zip_file_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files_in_dir in os.walk(folder_to_zip):
        for file in files_in_dir:
            file_path = os.path.join(root, file)
            # Add file to zip, preserving directory structure relative to folder_to_zip
            zipf.write(file_path, os.path.relpath(file_path, folder_to_zip))

print(f'Successfully created {zip_file_name}')
files.download(zip_file_name)

Successfully created checkpoint-7356.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>