In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the preprocessed news articles (path is relative to the notebook's directory).
df = pd.read_csv("../data/processed_news.csv")

# Check dataset structure
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
df.info()

# Check class distribution
print(df["category"].value_counts())

                                               title  \
0  Church Congregation Brings Gift to Waitresses ...   
1  AWAKENING OF 12 STRANDS of DNA – “Reconnecting...   
2  Never Hike Alone - A Friday the 13th Fan Film ...   
3  Elusive ‘Alien Of The Sea ‘ Caught By Scientis...   
4  Trump’s Genius Poll Is Complete & The Results ...   

                                             content           author  \
0  Sometimes the power of Christmas will make you...      Ruth Harris   
1  AWAKENING OF 12 STRANDS of DNA – “Reconnecting...     Zurich Times   
2  Never Hike Alone: A Friday the 13th Fan Film U...          Unknown   
3  When a rare shark was caught, scientists were ...  Alexander Smith   
4  Donald Trump has the unnerving ability to abil...  Gloria Christie   

  keywords                domain    category  article_length  num_keywords  \
0  Unknown               awm.com  Unverified             506             1   
1  Unknown     beforeitsnews.com  Unverified             188        

In [None]:
# Encode category labels as numbers
category_mapping = {"Fake": 0, "Real": 1, "Unverified": 2}

# Also accept ids that are already encoded (0/1/2 -> themselves) so that
# re-running this cell is a no-op instead of mapping every label to NaN.
_label_lookup = {**category_mapping, **{code: code for code in category_mapping.values()}}
_encoded = df["category"].map(_label_lookup)

# Series.map() yields NaN for anything missing from the lookup; fail loudly
# here rather than letting NaN labels leak into the train/test split.
_unmapped = df.loc[_encoded.isna(), "category"].unique()
if len(_unmapped) > 0:
    raise ValueError(
        f"Unmapped category labels: {list(_unmapped)!r}; "
        f"expected one of {list(category_mapping)!r}"
    )

# With no NaN left the column can be a proper integer dtype (map() alone
# would produce float64 whenever a NaN was present).
df["category"] = _encoded.astype("int64")


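The next cell loads the `bert-base-uncased` tokenizer, tokenizes each article's `content` (truncating to at most 512 tokens and padding the sequences), returns the result as PyTorch tensors for training the BERT model, and then splits the data 80/20 into training and test sets.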
# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the text: truncate anything longer than 512 tokens, pad the
# sequences to a common length, and return PyTorch tensors.
encodings = tokenizer(
    list(df["content"]),
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors="pt")

# Split dataset
# Because the sequences are padded, the attention mask is required to tell
# real tokens from padding during training. Split it together with the
# input ids so both stay row-aligned. train_test_split applies the same
# shuffled indices to every array it is given, so X_train / X_test /
# y_train / y_test are the same rows as when only input_ids was passed.
(
    X_train, X_test,
    mask_train, mask_test,
    y_train, y_test,
) = train_test_split(
    encodings["input_ids"],
    encodings["attention_mask"],
    df["category"],
    test_size=0.2,
    random_state=42,
)