In [None]:
##### PRAJITA #####
## USING RoBERTa ##

from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Mount Google Drive so the dataset CSV stored under /content/drive is reachable.
# NOTE(review): `drive` is presumably google.colab's drive helper imported in an
# earlier cell - confirm.
drive.mount('/content/drive')

# Dataset Path
file_path = "/content/drive/My Drive/Dataset for AI Generated Text Detection/Dataset_for_AI_Generated_Text_Detection.csv"

# Load dataset
data = pd.read_csv(file_path)

# Display basic info about the dataset.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("Dataset Info:")
data.info()
print("\nLabel Distribution:")
print(data['label'].value_counts())

# Scatter plot to visualize dataset distribution: one point per row, placed at
# (row position, label) and coloured by label.
plt.figure(figsize=(8, 6))
sns.scatterplot(x=range(len(data)), y=data['label'], hue=data['label'], palette='coolwarm', alpha=0.6)
plt.title("Dataset Distribution (0: Non-AI, 1: AI)")
plt.xlabel("Sample Index")
plt.ylabel("Label")
plt.legend(title="Label")
plt.show()

# Custom Dataset Class for PyTorch
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Sequence of raw texts (list, NumPy array or pandas Series).
        labels: Sequence of integer class labels aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, add_special_tokens=...,
            max_length=..., padding=..., truncation=..., return_tensors=...)``;
            it must return a mapping holding ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Length passed to the tokenizer as ``max_length``; examples are
            padded or truncated to it.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Materialize as plain lists so __getitem__ always indexes by position.
        # A pandas Series is indexed by label instead, which raises KeyError
        # (or returns the wrong row) once its index is no longer 0..n-1, e.g.
        # after a shuffled train/test split.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return it as a dict of tensors.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors and
            a scalar ``'labels'`` tensor of dtype ``torch.long``.
        """
        # str() guards against non-string entries reaching the tokenizer.
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # flatten() collapses the tokenizer output to 1-D so the DataLoader's
        # batching adds the only batch dimension.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Pretrained RoBERTa tokenizer plus a two-class sequence-classification model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=2,
)

# Pull the text and label columns out as arrays, then hold out 20% for testing
# (fixed random_state keeps the split reproducible)
texts, labels = data['text'].values, data['label'].values
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Wrap each split in a TextDataset that tokenizes on access
train_dataset = TextDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
test_dataset = TextDataset(texts=test_texts, labels=test_labels, tokenizer=tokenizer)

# Batch loaders: only the training split is shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

# Training setup: run on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Training loop (1 epoch for demonstration; increase num_epochs for better results)
num_epochs = 1
model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named batch_labels rather than labels so the module-level `labels`
        # array (the full label column) is not overwritten by the last batch.
        batch_labels = batch['labels'].to(device)
        # The loss is read from the model output when labels are passed in.
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} completed")

# Evaluation: collect hard predictions, class-1 probabilities and ground-truth
# labels over the whole test loader with gradient tracking disabled.
model.eval()
predictions = []
true_labels = []
RoBERTa_probs = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        probabilities = torch.softmax(outputs.logits, dim=1)[:, 1].cpu().numpy()  # Probability for positive class
        predictions.extend(preds)
        RoBERTa_probs.extend(probabilities)
        # The labels are only needed on the host for the metrics, so they are
        # read straight from the batch: no device round trip, and the
        # module-level `labels` array is not overwritten by a batch tensor.
        true_labels.extend(batch['labels'].cpu().numpy())

# Classification Report
print("\nRoBERTa Classification Report:")
print(classification_report(true_labels, predictions))

# Confusion Matrix
# labels=[0, 1] pins the matrix to 2x2 so it always lines up with the two tick
# labels below, even if one class is missing from the test split/predictions.
cm = confusion_matrix(true_labels, predictions, labels=[0, 1])
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Non-AI', 'AI'], yticklabels=['Non-AI', 'AI'])
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Plot ROC Curve (plot_roc_curve is a helper defined outside this cell)
plot_roc_curve(true_labels, RoBERTa_probs, "RoBERTa")
