In [2]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset

In [4]:
# Set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load and preprocess the data
reduction_factor = 0.5
fake_df = pd.read_csv('/content/processed_fake_sample.csv', quoting=3, on_bad_lines='skip')
true_df = pd.read_csv('/content/processed_true_sample.csv', quoting=3, on_bad_lines='skip')

# Add labels
fake_df['label'] = 0
true_df['label'] = 1

# Reduce dataset size
new_fake_len = int(len(fake_df) * reduction_factor)
new_true_len = int(len(true_df) * reduction_factor)
reduced_fake_df = fake_df.head(new_fake_len)
reduced_true_df = true_df.head(new_true_len)

In [6]:
# Combine and shuffle
reduced_data = pd.concat([reduced_fake_df, reduced_true_df]).reset_index(drop=True)
text_column = "text"  # Change this to the actual text column name in your dataset
label_column = "label"

X = reduced_data[text_column].dropna()
y = reduced_data[label_column][X.index]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
model = RobertaForSequenceClassification.from_pretrained('roberta-large', num_labels=2)
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=Tru

In [7]:
# Tokenize the data
def tokenize_data(texts):
    return tokenizer(texts.tolist(), padding=True, truncation=True, max_length=512, return_tensors='pt')

train_encodings = tokenize_data(X_train)
test_encodings = tokenize_data(X_test)

train_labels = torch.tensor(y_train.values, dtype=torch.long)
test_labels = torch.tensor(y_test.values, dtype=torch.long)

# Create datasets and dataloaders
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Define optimizer, loss, and training parameters
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

epochs = 3

# Training loop
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = [item.to(device) for item in batch]
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {running_loss / len(train_loader)}")



Epoch 1/3, Loss: 0.20036473162039076
Epoch 2/3, Loss: 0.16959204717517593
Epoch 3/3, Loss: 0.1652290070386127


In [8]:
# Evaluation loop
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = [item.to(device) for item in batch]
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Compute accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 93.86%


In [11]:
!pip install scikit-learn
# Make sure to import classification_report
from sklearn.metrics import classification_report # This line imports the necessary function

# ... (rest of your code) ...

# Classification report
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=["Fake", "True"]))


Classification Report:
              precision    recall  f1-score   support

        Fake       0.94      1.00      0.97       321
        True       0.00      0.00      0.00        21

    accuracy                           0.94       342
   macro avg       0.47      0.50      0.48       342
weighted avg       0.88      0.94      0.91       342



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
