<a href="https://colab.research.google.com/github/Vignesh-P-C/fake-news-detection-transformers/blob/main/notebooks_04_data_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Pipeline Preparation

**Project:** Fake News Detection using Transformers  
**Goal:** Convert raw news text into model-ready tokenized inputs  
**Note:** Includes truncation and padding to a fixed token length, a stratified train/validation split, and PyTorch dataset preparation


In [None]:
import pandas as pd
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split


In [None]:
# Load the tokenizer that belongs to the "distilbert-base-uncased" checkpoint.
# tokenize_texts() reads this module-level object.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# Sequence length, in tokens, passed as max_length to the tokenizer in
# tokenize_texts(); every example is padded or truncated to this size.
MAX_LENGTH = 256


In [None]:
def _load_labeled_csv(path, label):
    """Read one news CSV and tag every row with the given class label.

    Rows the parser cannot handle are skipped (on_bad_lines="skip") instead
    of aborting the whole load.

    Args:
        path: Location of the CSV file to read.
        label: Integer class label written to the new "label" column.

    Returns:
        A DataFrame holding the parsed rows plus the "label" column.
    """
    frame = pd.read_csv(
        path,
        engine="python",
        sep=",",
        quotechar='"',
        escapechar="\\",
        on_bad_lines="skip",
    )
    frame["label"] = label
    return frame


# Label convention used throughout the notebook: 0 = fake, 1 = true.
fake_df = _load_labeled_csv("Fake.csv", 0)
true_df = _load_labeled_csv("True.csv", 1)

df = pd.concat([fake_df, true_df], ignore_index=True)

# The tokenizer only accepts strings, so a row whose "text" is missing (NaN)
# would make the later tokenization step fail; drop such rows up front.
df = df.dropna(subset=["text"])

# Shuffle with a fixed seed so fake and true rows are interleaved reproducibly.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)


In [None]:
# Hold out 20% of the rows for validation. Stratifying on the label column
# keeps the fake/true proportion the same in both splits, and the fixed seed
# makes the split reproducible.
all_texts = df["text"].tolist()
all_labels = df["label"].tolist()

train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)


In [None]:
def tokenize_texts(texts):
    """Tokenize a batch of texts into fixed-length model inputs.

    Uses the module-level ``tokenizer``. Every example is truncated and
    padded so that it is exactly ``MAX_LENGTH`` tokens long.

    Args:
        texts: The raw text strings to encode.

    Returns:
        Whatever the tokenizer call returns for the batch.
    """
    fixed_length_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": MAX_LENGTH,
    }
    return tokenizer(texts, **fixed_length_options)

In [None]:
# Encode both splits with the same settings: training texts first, then
# validation texts.
train_encodings, val_encodings = map(tokenize_texts, (train_texts, val_texts))


In [None]:
# Sanity check: token count of the first training example. Expected to equal
# MAX_LENGTH because tokenize_texts() pads and truncates to that length.
len(train_encodings["input_ids"][0])

In [None]:
# Show both split sizes as one (train, validation) tuple. A notebook cell only
# displays the value of its last expression, so two separate len(...) lines
# would silently drop the first result.
len(train_encodings["input_ids"]), len(val_encodings["input_ids"])

In [None]:
import torch

class NewsDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with class labels.

    ``encodings`` must expose ``.items()`` and each of its values must be
    indexable by example position. ``labels`` must be an indexable, sized
    sequence; it is assumed to be aligned with the encodings (this is not
    checked here).
    """

    def __init__(self, encodings, labels):
        """Store the encodings and labels without copying them."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict has one tensor per encoding field plus a ``"labels"``
        entry holding the example's label.
        """
        sample = {}
        for field_name, column in self.encodings.items():
            sample[field_name] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Wrap each split's encodings and labels in a NewsDataset so examples can be
# fetched by index as dicts of tensors.
train_dataset = NewsDataset(encodings=train_encodings, labels=train_labels)
val_dataset = NewsDataset(encodings=val_encodings, labels=val_labels)

In [None]:
# Sanity check: number of examples in each split, shown as (train, validation).
len(train_dataset), len(val_dataset)


In [None]:
# Inspect the first training example: NewsDataset.__getitem__ returns a dict
# of tensors, one per encoding field plus a "labels" entry.
train_dataset[0]