In [1]:
!pip install transformers datasets




In [3]:
!pip install transformers datasets




In [None]:
import torch
from transformers import RobertaTokenizer, pipeline
import pandas as pd

# Load Data
df = pd.read_csv("/kaggle/input/welfake/WELFake_Dataset.csv")

#  Data Cleaning
# Rows without a ground-truth label are dropped instead of being filled with 0:
# filling would silently invent class-0 examples for the classifier to learn from.
df = df.dropna(subset=['label']).reset_index(drop=True)
# Missing article bodies become empty strings so every row can be tokenized.
df['text'] = df['text'].fillna('').astype(str)
df['label'] = df['label'].astype(int)

#  Tokenizer
# This roberta-base tokenizer is used both by the sentiment pipeline below and
# by preprocess_data when encoding the training texts.
# NOTE(review): the sentiment checkpoint is a different model; this assumes it
# shares roberta-base's vocabulary -- confirm.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# sentiment Analysis
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment",
    tokenizer=tokenizer,
    # Run inference on the first GPU when one is present (-1 keeps it on CPU);
    # without this the pipeline stays on CPU even on a GPU runtime.
    device=0 if torch.cuda.is_available() else -1,
    # Tokenizer arguments applied to every call: long articles are cut to the
    # 512-token limit and each batch is padded to a common length.
    truncation=True,
    max_length=512,
    padding=True
)

# sentiment Prediction Function with Batching
def add_sentiment_labels(texts, batch_size=32, classifier=None):
    """Classify the sentiment of every text, batch by batch.

    Args:
        texts: Sliceable collection of strings (e.g. a pandas Series or a list).
        batch_size: Number of texts handed to the classifier per call.
        classifier: Callable that takes a list of strings and returns one
            ``{'label': ..., 'score': ...}`` dict per string. Defaults to the
            module-level ``sentiment_pipeline``.

    Returns:
        A ``(labels, scores)`` tuple of two lists aligned with ``texts``:
        ``labels`` holds the numeric class (0, 1 or 2) and ``scores`` the
        score reported by the classifier for that class.

    Raises:
        KeyError: If the classifier emits a label name that is not in the
            label map below.
    """
    if classifier is None:
        # Resolved at call time so the function can be defined before, or
        # without, the module-level pipeline.
        classifier = sentiment_pipeline

    # The sentiment model has three classes; omitting LABEL_2 made every
    # third-class prediction fail with a KeyError.
    sentiment_map = {'LABEL_0': 0, 'LABEL_1': 1, 'LABEL_2': 2}

    sentiment_labels = []
    sentiment_scores = []

    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # pandas/numpy slices expose tolist(); plain sequences are copied as-is.
        batch_texts = batch.tolist() if hasattr(batch, "tolist") else list(batch)

        # Labels are mapped per batch so an unexpected label name fails on the
        # first offending batch instead of after the whole corpus was scored.
        for result in classifier(batch_texts):
            sentiment_labels.append(sentiment_map[result['label']])
            sentiment_scores.append(result['score'])

    return sentiment_labels, sentiment_scores

# Preprocessing Function
def preprocess_data(texts, labels, max_length=512, encoder=None):
    """Tokenize texts and convert their labels to a tensor.

    Args:
        texts: Collection of strings (pandas Series, numpy array or list).
        labels: Labels aligned with ``texts``. May be a pandas Series or
            DataFrame, a numpy array, or a (nested) list of numbers.
        max_length: Maximum number of tokens kept per text; longer texts are
            truncated.
        encoder: Callable used to tokenize; it receives the list of texts plus
            ``padding``, ``truncation``, ``max_length`` and ``return_tensors``
            keyword arguments. Defaults to the module-level ``tokenizer``.

    Returns:
        A ``(encodings, labels)`` tuple: whatever the encoder returned, and a
        ``torch.Tensor`` with the same shape as the input labels.
    """
    if encoder is None:
        # Resolved at call time so the function does not depend on the
        # tokenizer existing when it is defined.
        encoder = tokenizer

    text_list = texts.tolist() if hasattr(texts, "tolist") else list(texts)
    encodings = encoder(
        text_list,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    # The caller in this notebook passes a numpy array, which has no `.values`;
    # only pandas objects need unwrapping before torch can copy them.
    label_array = labels.to_numpy() if hasattr(labels, "to_numpy") else labels
    labels = torch.tensor(label_array)
    return encodings, labels

# sentiment Analysis Integration
sentiment_labels, sentiment_scores = add_sentiment_labels(df['text'])
df['sentiment_label'] = sentiment_labels
df['sentiment_score'] = sentiment_scores

# Hold out a validation split: the training cell loads 'val_data.pth', so it
# has to be produced here for the notebook to run top to bottom.
VAL_FRACTION = 0.1
SPLIT_SEED = 42
LABEL_COLUMNS = ['label', 'sentiment_label']  # column 0 is the training target

shuffled_df = df.sample(frac=1.0, random_state=SPLIT_SEED).reset_index(drop=True)
n_val = max(1, int(len(shuffled_df) * VAL_FRACTION))
val_df = shuffled_df.iloc[:n_val]
train_df = shuffled_df.iloc[n_val:]

# Labels are passed as a DataFrame (not `.values`): preprocess_data unwraps the
# pandas object itself, and a bare numpy array has no `.values` attribute.
train_texts, train_labels = train_df['text'], train_df[LABEL_COLUMNS]
train_encodings, train_labels = preprocess_data(train_texts, train_labels)

val_texts, val_labels = val_df['text'], val_df[LABEL_COLUMNS]
val_encodings, val_labels = preprocess_data(val_texts, val_labels)

# Store plain dicts of tensors so the files can be read back without having to
# unpickle the tokenizer's own output class.
torch.save((dict(train_encodings.items()), train_labels), "/kaggle/working/train_data.pth")
torch.save((dict(val_encodings.items()), val_labels), "/kaggle/working/val_data.pth")
print("✅ Preprocessing complete and saved as 'train_data.pth' with sentiment labels included.")
print(f"✅ Validation split ({len(val_df)} rows) saved as 'val_data.pth'.")


✅ Preprocessing complete. Data saved for training.


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import RobertaForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset

#  Load Processed Data
# NOTE(review): these files are written with torch.save on the tokenizer output;
# recent PyTorch releases default torch.load to weights_only=True, which may
# refuse non-tensor objects -- confirm against the installed version.
train_encodings, train_labels = torch.load("/kaggle/working/train_data.pth")
try:
    val_encodings, val_labels = torch.load("/kaggle/working/val_data.pth")
except FileNotFoundError:
    # No validation file was produced; a split is carved out of the training
    # data below instead of aborting the run.
    val_encodings, val_labels = None, None


def _fake_news_targets(labels):
    """Return the 1-D int64 class targets expected by CrossEntropyLoss.

    The preprocessing cell stores the columns ['label', 'sentiment_label'] per
    example; only column 0 (the real/fake label) is the training target for
    the 2-class model. Already 1-D label tensors are passed through.
    """
    if labels.dim() > 1:
        labels = labels[:, 0]
    return labels.long()


# Seed before building the split, the model head and the shuffling loader so
# repeated runs are comparable.
SEED = 42
torch.manual_seed(SEED)

#  Prepare Datasets
train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    _fake_news_targets(train_labels),
)
if val_encodings is not None:
    val_dataset = TensorDataset(
        val_encodings['input_ids'],
        val_encodings['attention_mask'],
        _fake_news_targets(val_labels),
    )
else:
    # Fallback: hold out 10% of the training examples for validation.
    n_val = max(1, int(0.1 * len(train_dataset)))
    train_dataset, val_dataset = torch.utils.data.random_split(
        train_dataset,
        [len(train_dataset) - n_val, n_val],
        generator=torch.Generator().manual_seed(SEED),
    )

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

#  Load Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2).to(device)

#  Training Setup
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

#  Training Loop
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss, correct, total = 0, 0, 0

    for batch in train_loader:
        input_ids, attention_mask, labels = [item.to(device) for item in batch]

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)

        loss.backward()
        optimizer.step()

        # total_loss is the sum of per-batch losses over the epoch.
        total_loss += loss.item()
        correct += (outputs.logits.argmax(1) == labels).sum().item()
        total += labels.size(0)

    train_accuracy = correct / total
    print(f"✅ Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}, Accuracy: {train_accuracy:.4f}")

    # Validation pass: no gradient tracking, dropout disabled via eval().
    model.eval()
    val_loss, val_correct, val_total = 0.0, 0, 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = [item.to(device) for item in batch]
            logits = model(input_ids, attention_mask=attention_mask).logits
            # Weight each batch by its size so the reported loss is a
            # per-example mean even when the last batch is smaller.
            val_loss += criterion(logits, labels).item() * labels.size(0)
            val_correct += (logits.argmax(1) == labels).sum().item()
            val_total += labels.size(0)

    val_total = max(val_total, 1)  # avoid dividing by zero on an empty split
    print(f"   Validation Loss: {val_loss / val_total:.4f}, Accuracy: {val_correct / val_total:.4f}")

#  Save Model as .pth
torch.save(model.state_dict(), "/kaggle/working/roberta_model.pth")
print("✅ Model training complete. Saved as 'roberta_model.pth'")
