In [None]:
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Load poetry datasets
bird_poetry = pd.read_csv('/content/drive/MyDrive/data/BARD/BARD_poetry.csv')  # columns: index, prompts, BARD
gpt_poetry = pd.read_csv('/content/drive/MyDrive/data/GPT/ChatGPT_poetry.csv')    # columns: prompts, responses
human_poetry = pd.read_csv('/content/drive/MyDrive/data/Human/human_poetry.csv') # columns: index, Title, Poem, Poet, Tags

# Prepare the texts: every source contributes ONLY the poem body.
# The human text previously also had the Poet and Tags columns appended. That
# metadata exists for the human class only, so the classifier could separate
# "Human" by spotting it (label leakage), and a missing Poet/Tags value turned
# the whole concatenated string into NaN (later replaced by an empty sample).
bird_poetry['text'] = bird_poetry['BARD']  # Adjust the column name accordingly
gpt_poetry['text'] = gpt_poetry['responses']  # Adjust the column name accordingly
human_poetry['text'] = human_poetry['Poem']

# Add source labels (must match the target_names order used at evaluation time)
bird_poetry['source'] = 1  # BARD
gpt_poetry['source'] = 0    # GPT
human_poetry['source'] = 2   # Human

# Keep only the first 1000 human poems
human_poetry = human_poetry.head(1000)

# Combine datasets
poetry_data = pd.concat([bird_poetry[['text', 'source']], gpt_poetry[['text', 'source']], human_poetry[['text', 'source']]], ignore_index=True)

# Clean the text.
# regex=True is required on every call: since pandas 2.0 `Series.str.replace`
# defaults to regex=False, in which case the pattern is matched as literal text
# and the URL / HTML-tag removal silently does nothing.
poetry_data['text'] = (
    poetry_data['text']
    .str.replace(r'http\S+|www\S+|https\S+', '', case=False, regex=True)  # Remove URLs
    .str.replace(r'<.*?>', '', case=False, regex=True)  # Remove HTML tags
    .str.replace(r'\s+', ' ', regex=True)  # Collapse runs of whitespace
    .str.lower()  # Convert to lowercase
)

# Handle NaN values: replace NaN with an empty string.
# Done after the .str operations because they also yield NaN for non-string cells.
poetry_data['text'] = poetry_data['text'].fillna('')

# Prepare features and labels
X = poetry_data['text']
y = poetry_data['source']

# Tokenization with DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# Stratified split so each source keeps the same share in train and test
train_texts, test_texts, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=256)  # Reduced sequence length to 256
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=256)

# Define dataset class
class PoetryDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    `encodings` is any mapping whose values can be indexed by sample position
    (e.g. the tokenizer output); `labels` is an indexable sequence of the same
    length holding one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label sequence defines how many samples the dataset exposes.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the label under the 'labels' key.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Create datasets
train_dataset = PoetryDataset(train_encodings, y_train.values)
test_dataset = PoetryDataset(test_encodings, y_test.values)

# Load pre-trained DistilBERT model for classification
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

# Set up DataLoader with a smaller batch size
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)  # Reduced batch size to 4
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Mixed precision setup.
# torch.amp replaces the deprecated torch.cuda.amp entry points. AMP is only
# enabled when running on CUDA; on CPU both autocast and the scaler are no-ops.
from torch.amp import GradScaler, autocast

use_amp = device.type == 'cuda'
scaler = GradScaler('cuda', enabled=use_amp)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# Training loop with gradient accumulation and mixed precision
accumulation_steps = 4  # Accumulate gradients over 4 steps
num_batches = len(train_loader)
# Size of the final, possibly shorter, accumulation group (0 when it divides evenly)
tail_size = num_batches % accumulation_steps

model.train()
for epoch in range(3):  # You can increase the number of epochs for better performance
    optimizer.zero_grad()
    for i, batch in enumerate(train_loader):
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        labels = batch['labels'].to(device)

        # Batches in the trailing partial group are averaged over that group's
        # real size so its update is not under-weighted.
        in_tail = tail_size != 0 and i >= num_batches - tail_size
        group_size = tail_size if in_tail else accumulation_steps

        # Only the forward pass and loss computation run under autocast
        with autocast(device_type=device.type, enabled=use_amp):
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss / group_size  # Normalize the loss by the group size

        # Backward runs outside autocast, as recommended by the PyTorch AMP docs
        scaler.scale(loss).backward()

        # Step every `accumulation_steps` batches AND on the last batch of the
        # epoch; otherwise gradients of a trailing partial group would be
        # discarded by the next zero_grad() without ever being applied.
        if (i + 1) % accumulation_steps == 0 or (i + 1) == num_batches:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

# Evaluation: collect the arg-max class of every test sample, batch by batch
model.eval()
y_pred = []

with torch.no_grad():
    for batch in test_loader:
        # Feed everything except the gold labels to the model
        features = {name: tensor.to(device) for name, tensor in batch.items() if name != 'labels'}
        batch_logits = model(**features).logits
        batch_classes = batch_logits.argmax(dim=-1)
        y_pred.extend(batch_classes.cpu().numpy())

# Calculate metrics (precision / recall / F1 are support-weighted averages)
weighted = {'average': 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)

# Print results
summary = (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
)
for metric_name, metric_value in summary:
    print(f"{metric_name}: {metric_value:.4f}")
# target_names follow the label ids assigned earlier: 0 = GPT, 1 = BARD, 2 = Human
print(classification_report(y_test, y_pred, target_names=['GPT', 'BARD', 'Human']))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()
  with autocast():


Accuracy: 0.9967
Precision: 0.9967
Recall: 0.9967
F1-score: 0.9967
              precision    recall  f1-score   support

         GPT       1.00      0.98      0.99        50
        BARD       1.00      1.00      1.00        50
       Human       1.00      1.00      1.00       200

    accuracy                           1.00       300
   macro avg       1.00      0.99      1.00       300
weighted avg       1.00      1.00      1.00       300

