In [1]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_scheduler
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm
from torch.cuda.amp import GradScaler, autocast

class TextDataset(Dataset):
    def __init__(self, texts, labels, max_length=200):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.encodings = self.tokenizer(texts.tolist(), truncation=True, 
                                        padding='max_length', max_length=max_length)
        self.labels = labels.tolist()

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        return len(self.labels)

class BertTrainer:
    def __init__(self, num_labels=2, learning_rate=5e-5):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased', num_labels=num_labels).to(self.device)
        self.optimizer = AdamW(self.model.parameters(), lr=learning_rate, weight_decay=0.01)
        self.scaler = GradScaler()

        # Print device info for debugging
        print(f"Using device: {self.device}")

    def train(self, train_loader, val_loader, num_epochs=3, save_path='best_model'):
        num_training_steps = num_epochs * len(train_loader)
        lr_scheduler = get_scheduler(
            "linear",
            optimizer=self.optimizer,
            num_warmup_steps=0,
            num_training_steps=num_training_steps
        )

        best_val_loss = float('inf')
        patience_counter = 0
        patience_limit = 2

        for epoch in range(num_epochs):
            print(f"Training epoch {epoch + 1}...")
            self.model.train()
            total_loss = 0
            progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}")

            for batch in progress_bar:
                batch = {k: v.to(self.device) for k, v in batch.items()}
                self.optimizer.zero_grad()

                with autocast():
                    outputs = self.model(**batch)
                    loss = outputs.loss

                self.scaler.scale(loss).backward()
                self.scaler.step(self.optimizer)
                self.scaler.update()
                lr_scheduler.step()

                total_loss += loss.item()
                torch.cuda.empty_cache()

            avg_loss = total_loss / len(train_loader)
            print(f"Epoch {epoch + 1}, Average Loss: {avg_loss:.4f}")

            val_loss = self.evaluate(val_loader)
            print(f"Validation Loss: {val_loss:.4f}")

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                self.model.save_pretrained(save_path)
            else:
                patience_counter += 1
                if patience_counter >= patience_limit:
                    print("Early stopping triggered.")
                    break

    def evaluate(self, loader):
        self.model.eval()
        total_loss = 0

        with torch.no_grad():
            for batch in loader:
                batch = {k: v.to(self.device) for k, v in batch.items()}
                outputs = self.model(**batch)
                total_loss += outputs.loss.item()

        return total_loss / len(loader)

    def test(self, test_loader):
        self.model.eval()
        correct = 0
        total = 0

        with torch.no_grad():
            for batch in test_loader:
                batch = {k: v.to(self.device) for k, v in batch.items()}
                outputs = self.model(**batch)
                predictions = torch.argmax(outputs.logits, dim=-1)
                correct += (predictions == batch['labels']).sum().item()
                total += len(batch['labels'])

        accuracy = (correct / total) * 100
        return accuracy

def load_data(file_path, batch_size=16, limit_samples=10000):
    df = pd.read_csv(file_path)
    print(f"Loaded dataset with {len(df)} samples.")
    
    # Limit to 10,000 samples
    if len(df) > limit_samples:
        df = df.sample(n=limit_samples, random_state=42)

    category = {}
    for i in range(len(df)):
        paraphrases = df.iloc[i]["paraphrases"][1:-1].split(', ')
        for paraphrase in paraphrases[:1]:
            category[paraphrase[1:-1]] = 'chatgpt'
        category[df.iloc[i]['text']] = "human"

    data = pd.DataFrame(category.items(), columns=["text", "category"]).sample(frac=1)
    print(f"Processed dataset with {len(data)} samples.")

    label_mapping = {'human': 0, 'chatgpt': 1}
    data['category'] = data['category'].map(label_mapping)

    X_train_val, X_test, y_train_val, y_test = train_test_split(data['text'], data['category'], test_size=0.2)
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25)

    train_dataset = TextDataset(X_train, y_train)
    val_dataset = TextDataset(X_val, y_val)
    test_dataset = TextDataset(X_test, y_test)

    # Reduce batch size if GPU memory is low
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, 
                              num_workers=0, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, 
                            num_workers=0, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=8, 
                             num_workers=0, pin_memory=True)

    return train_loader, val_loader, test_loader

def main():
    file_path = 'Z:/bert/chatgpt_paraphrases.csv'
    model_save_path = "Z:/bert/modeltensorss"
    train_loader, val_loader, test_loader = load_data(file_path)

    trainer = BertTrainer()
    trainer.train(train_loader, val_loader, save_path=model_save_path)
    accuracy = trainer.test(test_loader)
    print(f"Test Accuracy: {accuracy:.2f}%")

if __name__ == "__main__":
    main()


Loaded dataset with 419197 samples.
Processed dataset with 19850 samples.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = GradScaler()


Using device: cuda
Training epoch 1...


  with autocast():
Epoch 1/3: 100%|██████████| 1489/1489 [05:08<00:00,  4.83it/s]


Epoch 1, Average Loss: 0.3724
Validation Loss: 0.2970
Training epoch 2...


Epoch 2/3: 100%|██████████| 1489/1489 [05:14<00:00,  4.73it/s]


Epoch 2, Average Loss: 0.2124
Validation Loss: 0.3054
Training epoch 3...


Epoch 3/3: 100%|██████████| 1489/1489 [05:10<00:00,  4.80it/s]


Epoch 3, Average Loss: 0.0851
Validation Loss: 0.4217
Early stopping triggered.
Test Accuracy: 86.75%
