In [1]:
#imports
!pip install transformers datasets torch scikit-learn pandas matplotlib
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from datetime import datetime



In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [32]:
df = pd.read_csv("FakeNewsNet_combined.csv", header = 0)

df['date'] = df['date'].str.split(' ').str[0]

def filter_by_date_range(df, start_date, end_date):
    return df[(df['date'] >= start_date) & (df['date'] <= end_date)]


start_date = "2017-01-01"
end_date = "2018-12-31"
filtered_df = filter_by_date_range(df, start_date, end_date)


label_map = {1: 1, 0: 0}
filtered_df['label'] = filtered_df['label'].map(label_map)

value_counts = filtered_df['label'].value_counts()
print(value_counts)

train_texts, test_texts, train_labels, test_labels = train_test_split(
    filtered_df['title'].tolist(),
    filtered_df['label'].tolist(),
    test_size=0.2,
    random_state=42
)


label
1    7337
0    2436
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['label'] = filtered_df['label'].map(label_map)


In [33]:
# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize data
def tokenize_function(texts):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128)

train_encodings = tokenize_function(train_texts)
test_encodings = tokenize_function(test_texts)


# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_dict({
    "input_ids": train_encodings["input_ids"],
    "attention_mask": train_encodings["attention_mask"],
    "labels": train_labels,
})

test_dataset = Dataset.from_dict({
    "input_ids": test_encodings["input_ids"],
    "attention_mask": test_encodings["attention_mask"],
    "labels": test_labels,
})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [34]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

training_args = TrainingArguments(
    output_dir="./bert-fake-news",
    run_name = "intialTestRun_BASE_BERT",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [35]:

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3744,0.403229,0.831714
2,0.4132,0.458003,0.843478
3,0.088,0.602943,0.848082


TrainOutput(global_step=2934, training_loss=0.3238306373215567, metrics={'train_runtime': 657.7681, 'train_samples_per_second': 35.657, 'train_steps_per_second': 4.461, 'total_flos': 1542751673103360.0, 'train_loss': 0.3238306373215567, 'epoch': 3.0})

In [36]:

results = trainer.evaluate()

print(f"Test Accuracy: {results['eval_accuracy']:.4f}")


predictions = trainer.predict(test_dataset).predictions
pred_labels = np.argmax(predictions, axis=1)

print(classification_report(test_labels, pred_labels, target_names=["Real", "Fake"]))


Test Accuracy: 0.8481
              precision    recall  f1-score   support

        Real       0.75      0.62      0.68       505
        Fake       0.88      0.93      0.90      1450

    accuracy                           0.85      1955
   macro avg       0.81      0.77      0.79      1955
weighted avg       0.84      0.85      0.84      1955



In [37]:
from transformers import BertForSequenceClassification


model.save_pretrained("fine_tuned_bert")


from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.save_pretrained("fine_tuned_bert")

('fine_tuned_bert/tokenizer_config.json',
 'fine_tuned_bert/special_tokens_map.json',
 'fine_tuned_bert/vocab.txt',
 'fine_tuned_bert/added_tokens.json')