# Load Data

In [1]:
import pandas as pd

X_train = pd.read_csv('../data/processed/splits transformed/X_train_clean.csv', quoting= 1)
X_test = pd.read_csv('../data/processed//splits transformed/X_test_clean.csv', quoting=1)

y_train = pd.read_csv('../data/processed//splits/y_train.csv')
y_test = pd.read_csv('../data/processed//splits/y_test.csv')

# Preprocessing
- By default, no cleaning required for Transformer Based Models

In [None]:
# cleaning
def clean_text(text):
    return text

# train set cleaning
X_train['transformed_title'] = X_train['title'].apply(clean_text)
X_train['transformed_text'] = X_train['text'].apply(clean_text)
X_train['transformed_text_title_combined'] = X_train['transformed_text'] + " " + X_train['transformed_title']

# test set cleaning
X_test['transformed_title'] = X_test['title'].apply(clean_text)
X_test['transformed_text'] = X_test['text'].apply(clean_text)
X_test['transformed_text_title_combined'] = X_test['transformed_text'] + " " + X_test['transformed_title']

In [21]:
X_train.sample()

Unnamed: 0,title,text,transformed_title,transformed_text,transformed_text_title_combined
18202,WATCH: PARTY GIRL MALIA OBAMA Caught On Camera...,B b but Baron Trump wore a t-shirt that said ...,watch: party girl malia obama caught on camera...,b b but baron trump wore a t-shirt that said ...,b b but baron trump wore a t-shirt that said ...


In [6]:
X_test.sample()

Unnamed: 0,title,text,transformed_title,transformed_text,transformed_text_title_combined
31041,Clinton's charity confirms Qatar's $1 million ...,NEW YORK (Reuters) - The Clinton Foundation ha...,clinton's charity confirms qatar's $1 million ...,new york (reuters) - the clinton foundation ha...,new york (reuters) - the clinton foundation ha...


# Transformer Based Model - DistilBERT

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
from datasets import Dataset
import torch

# 1. Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# 2. Convert your dataset into Hugging Face Dataset format
# Assume X_train, y_train, X_test, y_test are pandas Series
train_ds = Dataset.from_dict({"text": list(X_train['transformed_text_title_combined']), "label": y_train['label'].tolist()})
test_ds = Dataset.from_dict({"text": list(X_test['transformed_text_title_combined']), "label": y_test['label'].tolist()})

# 3. Tokenization
def tokenize_function(batch):
    return tokenizer(batch["text"], truncation=True, padding=True, max_length=256)

tokenized_train = train_ds.map(tokenize_function, batched=True)
tokenized_test = test_ds.map(tokenize_function, batched=True)

# 4. Load pre-trained DistilBERT for binary classification
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)

# 5. Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 6. Training arguments
training_args = TrainingArguments(
    output_dir="./results",
        learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_dir="./logs",
)

# 7. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# 8. Train (Fine-Tuning)
trainer.train()

# 9. Save fine-tuned model
trainer.save_model("distilbert-fake-news")


Map:   0%|          | 0/35918 [00:00<?, ? examples/s]

Map:   0%|          | 0/8980 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.0153,0.00437
2,0.0006,0.000723
3,0.0001,0.000929




In [9]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Define metric computation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# Re-create trainer with compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Evaluate on test set
results = trainer.evaluate()
print(results)

  trainer = Trainer(


{'eval_loss': 0.0007231914787553251, 'eval_accuracy': 0.9998886414253898, 'eval_precision': 1.0, 'eval_recall': 0.9997665732959851, 'eval_f1': 0.999883273024396, 'eval_runtime': 45.6446, 'eval_samples_per_second': 196.737, 'eval_steps_per_second': 6.156}


In [27]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load model + tokenizer
model = AutoModelForSequenceClassification.from_pretrained("distilbert-fake-news")
tokenizer = AutoTokenizer.from_pretrained("distilbert-fake-news")

# Put model in eval mode
model.eval()

# Example news texts
samples = [
    """
    Israel has stepped up its destruction of Gaza City as it plans to seize Gaza’s largest urban centre and forcibly displace around one million Palestinians to concentration zones in the south, as it killed at least 78 people across the besieged enclave since dawn, including 32 desperately seeking food.
    On Sunday, in Gaza City, the Palestinian Civil Defence reported a fire in tents near al-Quds Hospital after Israeli shelling. At least five people were killed and three wounded when a residential apartment was hit near the Remal neighbourhood.
    """, 
    "Shocking news! Alien spaceship lands in New York and government covers it up.",
    "President announces major healthcare reforms to support low-income families.",
    "You won’t believe this! Miracle cure for cancer discovered in secret lab.",  
    """
    Hamas used sexual violence as "part of a deliberate genocidal strategy" during the 7 October 2023 attack on Israel, an all-women group of Israeli legal and gender experts allege in a new report calling for justice.
    The Dinah Project says the report is based on a review of evidence including first-hand testimony from a survivor of an attempted rape and 15 former hostages held in Gaza, as well as accounts from witnesses to sexual assaults.
    It lays out what the group describes as "a legal blueprint for prosecuting these crimes, even when direct attribution to individual perpetrators is impossible".
    """,
    """
    The Dinah Project says the accounts from people who saw or heard incidents of sexual violence showed that such crimes were "widespread and systematic" on 7 October.
    According to the report, five witnesses reported at least four separate cases of gang rape; seven reported at least eight other separate cases of rape or severe sexual assaults, some of them in captivity; five reported at least three separate cases of sexual assaults, some in captivity; and three reported three separate cases of mutilation.
    Nine of those cases related to the Nova music festival, two to the Nahal Oz military base, one to the Route 232 road, and four to incidents occurring in captivity in Gaza, the report says.
    Twenty-seven first responders meanwhile described dozens of cases which showed "clear signs of sexual violence across six locations", the report says - the Nova festival, Route 232, and the kibbutzim of Be'eri, Alumim, Nahal Oz and Re'im.
    The report also says that "most victims were permanently silenced", because they were either killed on 7 October or left too traumatised to talk.
    In response, the authors provide what they describe as the "first global legal blueprint explaining how to prosecute sexual violence as a weapon of war - even when evidence is messy, survivors are gone, and individual perpetrators can't be tied to individual acts".
    That includes an evidentiary framework to categorise information based on its proximity to incidents and its evidentiary value, and a legal framework for establishing criminal responsibility for atrocities committed during mass attacks, even when an individual did not personally commit each specific act or were not aware of its commission by someone else.
    The report concludes by saying that justice is "essential not only for individual victims but for affirming broader principles: that sexual violence in conflict is a serious violation of international law, that perpetrators will be held accountable, and that the international community will not allow such crimes to be committed with impunity".
    """
]

labels = ["Fake", "Real"]

for text in samples:
    # Tokenize
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)

    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class_id = torch.argmax(logits, dim=-1).item()

    print(f"Text: {text}")
    print("Prediction:", labels[predicted_class_id])
    print("-" * 100)

Text: 
    Israel has stepped up its destruction of Gaza City as it plans to seize Gaza’s largest urban centre and forcibly displace around one million Palestinians to concentration zones in the south, as it killed at least 78 people across the besieged enclave since dawn, including 32 desperately seeking food.
    On Sunday, in Gaza City, the Palestinian Civil Defence reported a fire in tents near al-Quds Hospital after Israeli shelling. At least five people were killed and three wounded when a residential apartment was hit near the Remal neighbourhood.
    
Prediction: Real
----------------------------------------------------------------------------------------------------
Text: Shocking news! Alien spaceship lands in New York and government covers it up.
Prediction: Fake
----------------------------------------------------------------------------------------------------
Text: President announces major healthcare reforms to support low-income families.
Prediction: Real
--------------