In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset


In [None]:
# Load the datasets
# Keep the input locations in one place so they are easy to repoint.
TRAIN_CSV = '/kaggle/input/fakenews/Fake_train.csv'
TEST_CSV = '/kaggle/input/fake-test-without-labels/Fake_test_without_labels.csv'

# Labelled training frame first, unlabelled test frame second.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

In [None]:
# Preprocessing
def preprocess_text(text):
    """Return ``text`` with leading and trailing whitespace removed.

    Args:
        text: The raw cell value. Usually a ``str``, but empty CSV cells are
            read by pandas as ``NaN`` (a float), which has no ``.strip()``.

    Returns:
        The stripped string. Missing values (``None`` / ``NaN``) become an
        empty string; any other non-string value is converted with ``str()``
        before stripping.
    """
    if isinstance(text, str):
        return text.strip()
    # Missing cells would otherwise raise AttributeError on .strip().
    if text is None or pd.isna(text):
        return ''
    return str(text).strip()

In [None]:
# Clean the text column of both the training and the test frame.
for frame in (train_df, test_df):
    frame['text'] = frame['text'].apply(preprocess_text)

In [None]:
# Encode labels
label_mapping = {'Fake': 0, 'original': 1}

# Translate label names to ids. Values that are not keys of the mapping are
# passed through unchanged, so re-running this cell on an already-encoded
# column keeps the 0/1 ids instead of turning every row into NaN.
encoded_labels = train_df['label'].map(lambda value: label_mapping.get(value, value))

# Anything that is still not a known id is a label the mapping does not cover
# (typo, different casing, missing value). Fail loudly rather than training on NaN.
unknown_mask = ~encoded_labels.isin(list(label_mapping.values()))
if unknown_mask.any():
    raise ValueError(
        f"Unexpected label values in train_df['label']: "
        f"{encoded_labels[unknown_mask].unique().tolist()}"
    )

train_df['label'] = encoded_labels.astype(int)

In [None]:
class FakeNewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: Indexable sequence of raw strings.
        labels: Indexable sequence of integer class ids aligned with
            ``texts``, or ``None`` for unlabeled (inference) data.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding='max_length',
            truncation=True, return_tensors='pt')`` and returning a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every sequence is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return the model inputs.

        Returns:
            A dict with ``'input_ids'`` and ``'attention_mask'`` (leading
            batch dimension removed). ``'labels'`` is included only when the
            dataset was built with labels.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        item = {
            # return_tensors='pt' adds a batch dimension of size 1; drop it so
            # the collator can stack items into a batch.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }
        # Do not invent a placeholder label for unlabeled data: a dummy value
        # such as -1 is forwarded to the model, whose loss then fails because
        # -1 is not a valid class index. Leaving the key out means no loss is
        # computed at prediction time.
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [None]:
# Split data
# Pull the two columns out as plain lists, then hold out 20% for validation.
all_texts = train_df['text'].tolist()
all_labels = train_df['label'].tolist()

# Fixed random_state keeps the split identical across runs.
split_parts = train_test_split(all_texts, all_labels, test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = split_parts


In [None]:
# Load tokenizer and model
# Tokenizer and model must come from the same checkpoint, so name it once.
MODEL_NAME = 'google/muril-large-cased'
MAX_LENGTH = 128

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Create datasets
# Training and validation splits are wrapped the same way.
train_dataset, val_dataset = (
    FakeNewsDataset(split_texts, split_labels, tokenizer, max_length=MAX_LENGTH)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
    )
)


In [None]:
# Define training arguments
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Optimisation settings
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch, keep at most two checkpoints,
    # and reload the best one when training finishes
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
)

In [None]:
# Define Trainer
# Collect the constructor arguments first so they can be inspected in one place.
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': val_dataset,
    'tokenizer': tokenizer,
}
trainer = Trainer(**trainer_kwargs)

In [None]:
# Evaluate the model
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: A pair ``(predictions, labels)``. ``predictions`` holds
            per-class scores on its last axis; ``labels`` holds the
            reference class ids.

    Returns:
        A dict with keys ``'accuracy'`` and ``'f1'``.
    """
    predictions, labels = pred
    # The highest-scoring class on the last axis is the predicted label.
    preds = predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, average='weighted')
    }


# The trainer was constructed before this function existed, so it has no
# metric callback. Attach it before training; otherwise the per-epoch
# evaluations and trainer.evaluate() report only the loss.
trainer.compute_metrics = compute_metrics

# Train the model
trainer.train()

In [None]:
# Run evaluation with the trainer (on the eval_dataset it was constructed with)
# and print the returned metrics.
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

In [None]:
# Predict on test dataset
# The test CSV has no labels, so the dataset is built with labels=None.
test_texts = test_df['text'].tolist()
test_dataset = FakeNewsDataset(test_texts, None, tokenizer, max_length=128)

predictions = trainer.predict(test_dataset)
# Pick the highest-scoring class for every test row.
test_preds = predictions.predictions.argmax(axis=-1)

In [None]:
# Map predictions back to labels
# Inverse of the encoding used for training: class id -> label name.
id_to_label = {0: 'Fake', 1: 'original'}
# Align the predicted ids with test_df's index, then translate them to names.
test_df['label'] = pd.Series(test_preds, index=test_df.index).map(id_to_label)

In [None]:
# Save predictions to CSV
# Only the row id and the predicted label are written to the output file.
submission = test_df[['Id', 'label']]
submission.to_csv('prediction.csv', index=False)

print("Predictions saved to prediction.csv")
