In [8]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer,BertForSequenceClassification,Trainer, TrainingArguments

In [9]:
df = pd.read_csv("../data/processed_news.csv")

# Check dataset structure
print(df.head())
print(df.info())

# Check class distribution
print(df["category"].value_counts())

                                               title  \
0  Church Congregation Brings Gift to Waitresses ...   
1  AWAKENING OF 12 STRANDS of DNA – “Reconnecting...   
2  Never Hike Alone - A Friday the 13th Fan Film ...   
3  Elusive ‘Alien Of The Sea ‘ Caught By Scientis...   
4  Trump’s Genius Poll Is Complete & The Results ...   

                                             content           author  \
0  Sometimes the power of Christmas will make you...      Ruth Harris   
1  AWAKENING OF 12 STRANDS of DNA – “Reconnecting...     Zurich Times   
2  Never Hike Alone: A Friday the 13th Fan Film U...          Unknown   
3  When a rare shark was caught, scientists were ...  Alexander Smith   
4  Donald Trump has the unnerving ability to abil...  Gloria Christie   

  keywords                domain    category  article_length  num_keywords  \
0  Unknown               awm.com  Unverified             506             1   
1  Unknown     beforeitsnews.com  Unverified             188        

In [10]:
# Encode category labels as numbers
category_mapping = {"Fake": 0, "Real": 1, "Unverified": 2}
df["category"] = df["category"].map(category_mapping)


# This tokenizes the text, applies padding/truncation, and converts it into input tensors for training the BERT model. 
# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(encodings["input_ids"], df["category"], test_size=0.2, random_state=42)

In [11]:
# Custom dataset class for BERT
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]
        
        # Tokenize the text
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,  # Add [CLS] and [SEP]
            max_length=self.max_len,  # Pad & truncate to max length
            padding='max_length',
            truncation=True,
            return_attention_mask=True,  # Return attention masks
            return_tensors='pt',  # PyTorch tensors
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [12]:
# Load BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)  # 3 classes: Fake, Real, Unverified

# Move model to GPU if available "MPS" for mac "cuda" for NVIDIA
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [13]:
training_args = TrainingArguments(
    output_dir="./results",          # Output directory
    evaluation_strategy="epoch",     # Evaluate after each epoch
    save_strategy="epoch",           # Save the model after each epoch
    learning_rate=2e-5,              # Learning rate
    per_device_train_batch_size=8,   # Batch size for training
    per_device_eval_batch_size=8,    # Batch size for evaluation
    num_train_epochs=3,              # Number of epochs
    weight_decay=0.01,               # Weight decay
    logging_dir="./logs",            # Directory for logs
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                     # BERT model
    args=training_args,              # Training arguments
    train_dataset=train_dataset,     # Training dataset
    eval_dataset=test_dataset        # Evaluation dataset
)



NameError: name 'train_dataset' is not defined

In [None]:
# Train the model
trainer.train()

ValueError: The batch received was empty, your model won't be able to train on it. Double-check that your training dataset contains keys expected by the model: input_ids,attention_mask,token_type_ids,position_ids,head_mask,inputs_embeds,labels,output_attentions,output_hidden_states,return_dict,labels,label_ids,label.