Fake News Detection using BERT

In [1]:
#%pip install -q -U transformers datasets evaluate accelerate sentencepiece scikit-learn==1.6.1

In [2]:
import torch

# Report which compute device is available.
# torch.cuda.current_device() and torch.cuda.get_device_name() raise on a
# machine without CUDA, so they are only queried when CUDA is present.
cuda_available = torch.cuda.is_available()
print("Is CUDA available? ", cuda_available)
if cuda_available:
    print("Current device: ", torch.cuda.current_device())
    print("Device name: ", torch.cuda.get_device_name(0))
else:
    print("Current device: ", "cpu")
    print("Device name: ", "CPU only")

Is CUDA available?  True
Current device:  0
Device name:  NVIDIA GeForce GTX 1650 Ti


In [3]:
import sklearn

# Show which scikit-learn version the kernel actually imports.
sklearn_version = sklearn.__version__
print(sklearn_version)


1.7.1


In [4]:
import transformers

# Show which transformers version the kernel actually imports.
transformers_version = transformers.__version__
print(transformers_version)

4.55.4


Dataset Definition

In [5]:
from datasets import load_dataset

# Binary (true / false) version of LIAR prepared by UKPLab. The hub repo
# already provides train / validation / test splits.
DATASET_ID = "UKPLab/liar"
ds = load_dataset(DATASET_ID)

print(ds)        # split names, columns and row counts
ds["train"][0]   # last expression of the cell -> shown as the cell output


Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['text', 'label_text', 'labels', 'context'],
        num_rows: 10269
    })
    validation: Dataset({
        features: ['text', 'label_text', 'labels', 'context'],
        num_rows: 1284
    })
    test: Dataset({
        features: ['text', 'label_text', 'labels', 'context'],
        num_rows: 1283
    })
})


{'text': 'Dwayne Bohac says the Annies List political group supports third-trimester abortions on demand.',
 'label_text': 'false statement',
 'labels': 1,
 'context': 'a mailer'}

In [6]:
from transformers import AutoTokenizer

model_name = "bert-base-uncased"
max_length = 256
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
    """Tokenize the ``text`` field of a batch, truncating to ``max_length`` tokens.

    No padding argument is passed here, so sequences keep their own length.
    """
    return tokenizer(batch["text"], truncation=True, max_length=max_length)


# Every original column except the raw text and the label is dropped.
kept_columns = ["text", "labels"]
dropped_columns = [name for name in ds["train"].column_names if name not in kept_columns]

tokenized = ds.map(tokenize, batched=True, remove_columns=dropped_columns)
# NOTE(review): the original comment said Trainer expects 'label' by default;
# confirm the rename is needed, since models take the target as `labels`.
tokenized = tokenized.rename_column("labels", "label")
tokenized.set_format(type="torch")


Map:   0%|          | 0/1283 [00:00<?, ? examples/s]

Training and Evaluation

In [None]:
import evaluate
from transformers import DataCollatorWithPadding

# Class ids as stored in the dataset's `labels` column
# (the sample row shown earlier has labels == 1 for a "false statement").
id2label = {0: "true", 1: "false"}
# Derived from id2label so the two mappings cannot drift apart.
label2id = {name: idx for idx, name in id2label.items()}

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: a ``(logits, labels)`` pair. ``logits`` must support
            ``.argmax(axis=-1)`` -- presumably a NumPy array with the class
            scores on the last axis; confirm against the Trainer in use.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"`` (weighted average).
    """
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1": f1.compute(predictions=preds, references=labels, average="weighted")["f1"],
    }


# Pads each batch with the tokenizer at collation time (tokenize() does not pad).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [None]:
import os

import torch
from transformers import TrainingArguments

batch_size = 16

# Where checkpoints are written. The default is the original location; set the
# FAKE_NEWS_MODEL_DIR environment variable to run on another machine without
# editing the notebook.
output_dir = os.environ.get(
    "FAKE_NEWS_MODEL_DIR", "D:\\Codding\\NLP_FN_proj\\fake_news_model"
)

args = TrainingArguments(
    output_dir=output_dir,
    eval_strategy="epoch",
    save_strategy="epoch",             # must match eval_strategy for load_best_model_at_end
    logging_strategy="steps",
    logging_steps=50,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # key returned by compute_metrics
    # Mixed precision needs a CUDA device; a hardcoded True fails on CPU-only machines.
    fp16=torch.cuda.is_available(),
    save_total_limit=2,
    seed=42,                           # the library default, stated explicitly
)

In [9]:
from transformers import AutoModelForSequenceClassification

# Pretrained checkpoint with a sequence-classification head that has one
# output per entry of id2label. The label mappings are passed to the model.
num_classes = len(id2label)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import Trainer

# Bring together the model, the training configuration, the tokenized splits
# and the batching / metric helpers defined in the cells above.
trainer = Trainer(
    model=model,
    args=args,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],  # evaluated every epoch (eval_strategy="epoch")
)

Training, evaluating on the test split, and saving the model

In [11]:
import os

trainer.train()

# Evaluate on the held-out test split. With load_best_model_at_end=True the
# trainer holds the best checkpoint (by validation accuracy) at this point.
eval_res = trainer.evaluate(tokenized["test"])
# Only the last expression of a cell is displayed, so a bare `eval_res` in the
# middle of the cell was silently dropped; print it explicitly.
print(eval_res)

# One location for both the model weights and the tokenizer files. The default
# is the original path; FAKE_NEWS_MODEL_DIR overrides the base directory.
best_model_dir = os.path.join(
    os.environ.get("FAKE_NEWS_MODEL_DIR", "D:\\Codding\\NLP_FN_proj\\fake_news_model"),
    "best_model",
)
trainer.save_model(best_model_dir)
tokenizer.save_pretrained(best_model_dir)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5734,0.552263,0.739875,0.666222
2,0.5184,0.532908,0.746885,0.713864
3,0.3893,0.591659,0.753115,0.732302


('D:\\Codding\\NLP_FN_proj\\fake_news_model\\best_model\\tokenizer_config.json',
 'D:\\Codding\\NLP_FN_proj\\fake_news_model\\best_model\\special_tokens_map.json',
 'D:\\Codding\\NLP_FN_proj\\fake_news_model\\best_model\\vocab.txt',
 'D:\\Codding\\NLP_FN_proj\\fake_news_model\\best_model\\added_tokens.json',
 'D:\\Codding\\NLP_FN_proj\\fake_news_model\\best_model\\tokenizer.json')

In [1]:
import os

from transformers import pipeline

# Load the fine-tuned checkpoint from disk rather than from the in-memory
# `model` / `tokenizer`, so this cell does not depend on the training cells
# having run in the current kernel. The default is the original path;
# FAKE_NEWS_MODEL_DIR overrides the base directory.
best_model_dir = os.path.join(
    os.environ.get("FAKE_NEWS_MODEL_DIR", "D:\\Codding\\NLP_FN_proj\\fake_news_model"),
    "best_model",
)

clf = pipeline(
    "text-classification",
    model=best_model_dir,
    tokenizer=best_model_dir,
)

# Two smoke-test statements; each call returns a list with one {label, score} dict.
print(clf("The president was born in Kenya."))
print(clf("NASA successfully launched the Artemis rocket to the moon."))


Device set to use cuda:0


[{'label': 'false', 'score': 0.6564114093780518}]
[{'label': 'true', 'score': 0.5660349130630493}]


Google Fact Check API Testing

In [None]:
import os
from getpass import getpass

import requests

# SECURITY: never store the API key in the notebook. It is read from the
# GOOGLE_FACT_CHECK_API_KEY environment variable, with an interactive prompt
# as fallback. The key that used to be hardcoded here has been exposed and
# should be revoked in the Google Cloud console.
API_KEY = os.environ.get("GOOGLE_FACT_CHECK_API_KEY") or getpass("Google Fact Check API key: ")
ENDPOINT = "https://factchecktools.googleapis.com/v1alpha1/claims:search"

def google_fact_check(statement, timeout=10):
    """Query the Google Fact Check Tools claim search for ``statement``.

    Args:
        statement: text to search for.
        timeout: seconds to wait for the HTTP response (new, defaults to 10).

    Returns:
        The list found under ``"claims"`` in the JSON response, or ``[]`` when
        the request fails or the API answers with a non-200 status. The
        previous ``{"error": ...}`` dict broke callers that iterate over the
        result as a list of claim dicts; failures are now reported as warnings.
    """
    import warnings

    params = {"query": statement, "key": API_KEY, "languageCode": "en"}
    try:
        response = requests.get(ENDPOINT, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Only the exception type is reported: requests' messages can embed the
        # full URL, which would leak the API key into the notebook output.
        warnings.warn(f"Fact Check request failed: {type(exc).__name__}")
        return []

    if response.status_code != 200:
        warnings.warn(f"Fact Check API returned HTTP {response.status_code}: {response.text}")
        return []
    return response.json().get("claims", [])

# Example lookup: print the first review of every claim the API returns.
results = google_fact_check("COVID-19 originated in a lab in Wuhan")
for claim in results:
    print(f"Claim: {claim.get('text')}")
    if "claimReview" not in claim:
        continue  # nothing more to print for claims without a review
    review = claim["claimReview"][0]
    rating = review.get('textualRating')
    print(f"Rating: {rating}")
    publisher = review.get('publisher', {})
    print(f"Source: {publisher.get('name')}")
    review_url = review.get('url')
    print(f"URL: {review_url}")
    print("-" * 50)


WEB APP

In [None]:
!pip -q install streamlit cloudflared

In [None]:
%%writefile /content/app.py
import os
import streamlit as st
import requests
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# ----------------------------
# STREAMLIT CONFIG
# ----------------------------
# The icon is the newspaper emoji (U+1F4F0). It was previously stored as
# mojibake (its UTF-8 bytes decoded as a single-byte encoding).
st.set_page_config(page_title="Fake News Detector (BERT)", page_icon="📰", layout="centered")

st.title("📰 Fake News Detector (BERT)")
st.caption("Model: bert-base-uncased fine-tuned on UKPLab/liar (binary True/False) with Google Fact Check Lookup")

# ----------------------------
# LOAD MODEL (CACHE FOR SPEED)
# ----------------------------
@st.cache_resource
def load_model():
    """Load the fine-tuned classifier and wrap it in a text-classification pipeline.

    The checkpoint directory is ``<base>/best_model``. ``<base>`` defaults to
    the original training location and can be overridden with the
    FAKE_NEWS_MODEL_DIR environment variable.
    NOTE(review): this file is written to /content while the default model
    path is a Windows drive -- confirm where the app is meant to run.

    Returns:
        ``(pipe, id2label)``: the pipeline and the ``id2label`` mapping read
        from the model config.
    """
    model_dir = os.path.join(
        os.environ.get("FAKE_NEWS_MODEL_DIR", "D:\\Codding\\NLP_FN_proj\\fake_news_model"),
        "best_model",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
    id2label = model.config.id2label
    return pipe, id2label

pipe, id2label = load_model()

# ----------------------------
# GOOGLE FACT CHECK API CONFIG
# ----------------------------
# SECURITY: the key is read from the environment instead of being committed
# with the source. The key that used to be hardcoded here has been exposed and
# should be revoked in the Google Cloud console.
API_KEY = os.environ.get("GOOGLE_FACT_CHECK_API_KEY", "")
if not API_KEY:
    st.warning("GOOGLE_FACT_CHECK_API_KEY is not set; fact-check lookups will fail.")
ENDPOINT = "https://factchecktools.googleapis.com/v1alpha1/claims:search"

def google_fact_check(statement, timeout=10):
    """Search the Google Fact Check Tools API for claims matching ``statement``.

    The lookup is best-effort: any failure is treated like "no claims found".

    Args:
        statement: text to search for.
        timeout: seconds to wait for the HTTP response (new, defaults to 10),
            so a stalled request cannot hang the page indefinitely.

    Returns:
        The list under ``"claims"`` in the JSON response, or ``[]`` on a
        network error, a non-200 status, or a body that is not valid JSON.
    """
    params = {"query": statement, "key": API_KEY, "languageCode": "en"}
    try:
        response = requests.get(ENDPOINT, params=params, timeout=timeout)
    except requests.RequestException:
        return []
    if response.status_code != 200:
        return []
    try:
        payload = response.json()
    except ValueError:
        return []
    return payload.get("claims", [])

def get_fact_check_summary(claims):
    """Summarise the first review of the first claim as Markdown.

    Args:
        claims: list of claim dicts; the keys read are ``"text"`` and
            ``"claimReview"`` (a list of review dicts with ``"textualRating"``,
            ``"publisher"`` -> ``"name"`` and ``"url"``).

    Returns:
        ``(markdown, url)`` for the first claim's first review, or ``None``
        when ``claims`` is empty/None or the first claim has no review.
    """
    if not claims:
        return None
    claim = claims[0]
    # A missing key, None, or an empty list all mean "no review"; indexing [0]
    # into an empty list used to raise IndexError.
    reviews = claim.get("claimReview") or []
    if not reviews:
        return None
    review = reviews[0]
    text = claim.get("text", "No claim text found.")
    rating = review.get("textualRating", "No rating provided")
    # `or {}` also covers an explicit null publisher.
    publisher = (review.get("publisher") or {}).get("name", "Unknown publisher")
    url = review.get("url", "#")
    # Two trailing spaces force a Markdown line break; a lone "\n" is a soft
    # break, so the three fields would otherwise run together on one line.
    summary = f"**Claim:** {text}  \n**Rating:** {rating}  \n**Publisher:** {publisher}\n"
    return summary, url

# ----------------------------
# FORM FOR USER INPUT
# ----------------------------
with st.form("detect"):
    txt = st.text_area(
        "Paste a claim, headline, or short article snippet:",
        height=140,
        placeholder="e.g., 'Government confirms aliens landed in 2025...'",
        help="Shorter claims work best on LIAR-style data."
    )
    submitted = st.form_submit_button("Analyze")

# ----------------------------
# PREDICTION & FACT CHECK LOGIC
# ----------------------------
# The same stripped text is used for the emptiness check, the model and the lookup.
statement = txt.strip()
if submitted and statement:
    with st.spinner("Analyzing with BERT model..."):
        # Truncate to the length used at training time (max_length = 256);
        # without truncation, input longer than the model limit raises.
        out = pipe(statement, truncation=True, max_length=256)[0]
    label = out["label"].lower()  # "true" or "false"
    score = out["score"]

    st.markdown(f"### Prediction: **{label.upper()}**")
    st.progress(float(score))
    st.write(f"Confidence: {score:.2%}")

    # NOTE(review): the fact-check lookup only runs when the model says TRUE,
    # and its result is shown with st.success whatever the rating is -- confirm
    # this is the intended flow.
    if label == "false":
        st.error("This statement is likely FALSE. No fact-check lookup performed.")
    else:
        st.info("Model predicts TRUE. Checking trusted fact-check databases...")
        with st.spinner("Looking up sources..."):
            claims = google_fact_check(statement)
        summary = get_fact_check_summary(claims)

        if summary:
            content, url = summary
            st.success(content)
            st.markdown(f"[Click here to read the full fact check]({url})")
        else:
            st.warning("No verified sources found. This statement may be **misleading or unverifiable**.")

else:
    st.write("Enter some text and click **Analyze**.")

# An earlier text-only version of this app used to be kept here inside a
# triple-quoted string. A bare string literal at module level is not a comment:
# Streamlit's "magic" renders it on the page, so the old source showed up in
# the app. It has been removed; recover it from version control if needed.

In [None]:
# Streamlit serves the app; pyngrok is imported in the next cell to open the tunnel.
%pip install -q streamlit pyngrok

In [None]:
from pyngrok import ngrok

# paste your token here (replace "YOUR_TOKEN")
!ngrok config add-authtoken 2ull7xbfsAWkecHHzGb9LD5s1YJ_SHgqxyxac9cna7k239pj

# Run Streamlit in the background
!streamlit run /content/app.py --server.port 8501 &>/content/streamlit.log &

from pyngrok import ngrok

public_url = ngrok.connect(8501)
print("Streamlit URL:", public_url)


In [None]:
# Write the currently installed package versions to requirements.txt.
# NOTE(review): `!pip` uses whichever pip is first on PATH -- confirm it
# belongs to the kernel's environment before relying on this file.
!pip freeze > requirements.txt