In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the eight JSONL shards (urlsf_subset00 ... urlsf_subset07) of each corpus.
ChatGPT = [
    pd.read_json(f"dataset/opengpttext-clean/chatgpt/urlsf_subset0{i}.jsonl", lines=True)
    for i in range(8)
]
OpenWeb = [
    pd.read_json(f"dataset/opengpttext-clean/openweb/urlsf_subset0{i}.jsonl", lines=True)
    for i in range(8)
]

# Stack the shards of each corpus and tag every row with its class:
# 1 for rows from the chatgpt directory, 0 for rows from the openweb directory.
chatgpt_df = pd.concat(ChatGPT, ignore_index=True).assign(label=1)
openweb_df = pd.concat(OpenWeb, ignore_index=True).assign(label=0)

# Single labeled frame with a fresh 0..n-1 index (chatgpt rows first).
combined_df = pd.concat([chatgpt_df, openweb_df], ignore_index=True)
combined_df.head()

Unnamed: 0,uid,text,label
0,[urlsf_subset00]-[83],The National Weather Service's Mike McFarland ...,1
1,[urlsf_subset00]-[89],The President of the United States was seen on...,1
2,[urlsf_subset00]-[390],Enner Valencia scored two goals in Ecuador's 2...,1
3,[urlsf_subset00]-[457],"Beginning with the introduction, the author sh...",1
4,[urlsf_subset00]-[458],Mexico has implemented its newest data retenti...,1


In [8]:
import json
from transformers import RobertaTokenizer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset, random_split
import torch


# Tokenize data: every text is truncated/padded to exactly 512 token ids.
texts = combined_df['text'].tolist()
labels = combined_df['label'].tolist()

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
encoding = tokenizer(texts, truncation=True, padding='max_length', max_length=512, return_tensors="pt")

# Split data into train, validation, and test sets.
# input_ids, attention_mask and labels are passed to ONE train_test_split call
# per stage so the three stay row-aligned by construction, instead of relying
# on separate calls happening to share the same test_size/random_state.
(train_input_ids, test_input_ids,
 train_attention_mask, test_attention_mask,
 train_labels, test_labels) = train_test_split(
    encoding['input_ids'], encoding['attention_mask'], labels,
    test_size=0.2, random_state=42,
)

# 0.125 of the remaining 80% is 10% of the full data -> 70/10/20 overall.
(train_input_ids, val_input_ids,
 train_attention_mask, val_attention_mask,
 train_labels, val_labels) = train_test_split(
    train_input_ids, train_attention_mask, train_labels,
    test_size=0.125, random_state=42,
)

# Each dataset item is an (input_ids, attention_mask, label) tuple.
train_dataset = TensorDataset(train_input_ids, train_attention_mask, torch.tensor(train_labels))
val_dataset = TensorDataset(val_input_ids, val_attention_mask, torch.tensor(val_labels))
test_dataset = TensorDataset(test_input_ids, test_attention_mask, torch.tensor(test_labels))

# Every example must land in exactly one of the three splits.
assert len(train_dataset) + len(val_dataset) + len(test_dataset) == len(texts)


In [9]:
# Peek at the first training example. Items are tuples in the order the tensors
# were passed to TensorDataset: (input_ids, attention_mask, label).
train_dataset[0]

(tensor([    0,   243,    16,  3159,    13, 23811,     7,  3271,  8109,    49,
          6160,    50,   904,    49,  5086,    15,    49,  4476,     6,    53,
          3533, 32194,   222,    95,    14,    71,  9585,   154,     5,  1573,
           166,   808,   397,  1954,     4,  1190,  6760,  3979,  1032,    23,
          5062,    15,   274,   791,  3721,   204,     4,    91,  2641,    14,
            37,   905,     5,  1032,   213,    15,   350,   251,     6,  3735,
          6760,  3979,     7,   185,   350,   171, 10495, 19250,     4,   152,
          7988,    16,    10, 17846,   517,    25, 23811,    32,   747,  5888,
            53,  7154,   547,  8943,     4, 50118, 14229,     5,  1032,     6,
           166,   808,   397,  1882,  6760,  3979,    19,    10,  2934, 13789,
          2506,     8,  1143,     7, 10064,   123,    15,     5,  7821,    13,
            59,   799,  2397,     4, 32194,   115,    33,  2294,     5,  1032,
           656,     7,  2097,  6760,  3979,    31,  

In [None]:
from transformers import RobertaModel
import torch.nn as nn

class RobertaSentinel(nn.Module):
    """Two-class text classifier: a pretrained RoBERTa encoder plus a small MLP head.

    The head takes the encoder's hidden state at sequence position 0 and applies
    Linear(hidden_size -> 256), GELU, then Linear(256 -> 2) to produce logits.
    """

    def __init__(self):
        """Load the "roberta-base" encoder and create the classification head."""
        super().__init__()
        self.roberta = RobertaModel.from_pretrained("roberta-base")
        encoder_width = self.roberta.config.hidden_size
        self.fc1 = nn.Linear(encoder_width, 256)
        self.fc2 = nn.Linear(256, 2)
        self.gelu = nn.GELU()

    def forward(self, input_ids, attention_mask):
        """Return unnormalized two-class logits for a batch.

        Args:
            input_ids: token ids forwarded to the encoder.
            attention_mask: mask forwarded to the encoder alongside `input_ids`.

        Returns:
            Output of `fc2`, i.e. a tensor whose last dimension has size 2.
        """
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Use the first sequence position (CLS-style token) as the sequence summary.
        first_token_state = encoded.last_hidden_state[:, 0]
        hidden = self.gelu(self.fc1(first_token_state))
        return self.fc2(hidden)

# Build the classifier, then freeze the pretrained encoder so that only the
# fc1/fc2 head keeps requires_grad=True.
model = RobertaSentinel()
model.roberta.requires_grad_(False)

In [None]:
from transformers import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.nn import CrossEntropyLoss

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training configuration
epochs = 15
batch_size = 512
accumulation_steps = 4  # micro-batches whose gradients are summed per optimizer step

# Build the loader once; shuffle=True still reshuffles on every new iteration.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
num_batches = len(train_loader)
# Optimizer steps per epoch, counting a trailing partial accumulation group.
steps_per_epoch = (num_batches + accumulation_steps - 1) // accumulation_steps

# torch.optim.AdamW replaces the deprecated transformers.AdamW; only parameters
# that still require gradients are handed to the optimizer.
optimizer = torch.optim.AdamW(
    [p for p in model.parameters() if p.requires_grad],
    lr=1e-4,
    weight_decay=1e-3,
)
# scheduler.step() is called once per optimizer step, so the cosine horizon is
# the total number of optimizer steps over all epochs (LR anneals to eta_min
# exactly at the end of training instead of oscillating).
scheduler = CosineAnnealingLR(optimizer, T_max=max(1, epochs * steps_per_epoch), eta_min=0)
loss_fn = CrossEntropyLoss()


def _accuracy(dataset):
    """Return the fraction of examples in `dataset` the model classifies correctly.

    Puts the model in eval mode and runs without gradient tracking; returns 0.0
    for an empty dataset.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for input_ids, attention_mask, batch_labels in DataLoader(dataset, batch_size=batch_size):
            outputs = model(input_ids.to(device), attention_mask.to(device))
            predicted = outputs.argmax(dim=1)
            total += batch_labels.size(0)
            correct += (predicted == batch_labels.to(device)).sum().item()
    return correct / total if total else 0.0


# Training loop with gradient accumulation
for epoch in range(epochs):
    model.train()  # _accuracy() switches to eval mode, so re-enable each epoch
    running_loss = 0.0
    optimizer.zero_grad()
    # `batch_labels` avoids overwriting the notebook-level `labels` list.
    for i, (input_ids, attention_mask, batch_labels) in enumerate(train_loader):
        outputs = model(input_ids.to(device), attention_mask.to(device))
        loss = loss_fn(outputs, batch_labels.to(device))
        running_loss += loss.item()

        # Scale each micro-batch loss so the accumulated gradient is the average
        # over its group; the final group of an epoch may hold fewer batches.
        group_start = (i // accumulation_steps) * accumulation_steps
        group_size = min(accumulation_steps, num_batches - group_start)
        (loss / group_size).backward()

        # Step after a full group, and also after the last batch of the epoch
        # so a trailing partial group is not discarded. Gradients are cleared
        # only after a step, never between micro-batches of the same group.
        if (i + 1) % accumulation_steps == 0 or (i + 1) == num_batches:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

    # Mean of the per-batch losses over the epoch.
    print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/max(num_batches, 1)}")
    print(f"Epoch {epoch+1}/{epochs}, Validation accuracy: {_accuracy(val_dataset)*100:.2f}%")

# Evaluation on the held-out test split
print(f"Accuracy: {_accuracy(test_dataset)*100:.2f}%")