In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_scheduler
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm

class TextDataset(Dataset):
    def __init__(self, texts, labels, max_length=200):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.encodings = self.tokenizer(texts.tolist(), truncation=True, 
                                      padding='max_length', max_length=max_length)
        self.labels = labels.tolist()

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        return len(self.labels)



In [2]:
class BertTrainer:
    def __init__(self, num_labels=2, learning_rate=5e-5):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased', num_labels=num_labels).to(self.device)
        self.optimizer = AdamW(self.model.parameters(), lr=learning_rate)

    def train(self, train_loader, val_loader, num_epochs=3, save_path='best_model'):
        num_training_steps = num_epochs * len(train_loader)
        lr_scheduler = get_scheduler(
            "linear",
            optimizer=self.optimizer,
            num_warmup_steps=0,
            num_training_steps=num_training_steps
        )

        best_val_loss = float('inf')
        patience_counter = 0
        patience_limit = 2

        for epoch in range(num_epochs):
            # Training
            self.model.train()
            total_loss = 0
            progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}")
            
            for batch in progress_bar:
                batch = {k: v.to(self.device) for k, v in batch.items()}
                self.optimizer.zero_grad()
                outputs = self.model(**batch)
                loss = outputs.loss
                loss.backward()
                self.optimizer.step()
                lr_scheduler.step()
                total_loss += loss.item()

            avg_loss = total_loss / len(train_loader)
            print(f"Epoch {epoch + 1}, Average Loss: {avg_loss:.4f}")

            # Validation
            val_loss = self.evaluate(val_loader)
            print(f"Validation Loss: {val_loss:.4f}")

            # Early stopping check
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                self.model.save_pretrained(save_path)  # Save to the customized path
            else:
                patience_counter += 1
                if patience_counter >= patience_limit:
                    print("Early stopping triggered.")
                    break


In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_scheduler
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm

class TextDataset(Dataset):
    def __init__(self, texts, labels, max_length=200):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.encodings = self.tokenizer(texts.tolist(), truncation=True, 
                                      padding='max_length', max_length=max_length)
        self.labels = labels.tolist()

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        return len(self.labels)

class BertTrainer:
    def __init__(self, num_labels=2, learning_rate=5e-5):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased', num_labels=num_labels).to(self.device)
        self.optimizer = AdamW(self.model.parameters(), lr=learning_rate)

    def train(self, train_loader, val_loader, num_epochs=3, save_path='best_model'):
        num_training_steps = num_epochs * len(train_loader)
        lr_scheduler = get_scheduler(
            "linear",
            optimizer=self.optimizer,
            num_warmup_steps=0,
            num_training_steps=num_training_steps
        )

        best_val_loss = float('inf')
        patience_counter = 0
        patience_limit = 2

        for epoch in range(num_epochs):
            # Training
            print(f"Training epoch {epoch+1}...")  # Debugging line
            self.model.train()
            total_loss = 0
            progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}")
            
            for batch in progress_bar:
                batch = {k: v.to(self.device) for k, v in batch.items()}
                self.optimizer.zero_grad()
                outputs = self.model(**batch)
                loss = outputs.loss
                loss.backward()
                self.optimizer.step()
                lr_scheduler.step()
                total_loss += loss.item()

            avg_loss = total_loss / len(train_loader)
            print(f"Epoch {epoch + 1}, Average Loss: {avg_loss:.4f}")

            # Validation
            val_loss = self.evaluate(val_loader)
            print(f"Validation Loss: {val_loss:.4f}")

            # Early stopping check
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                self.model.save_pretrained(save_path)
            else:
                patience_counter += 1
                if patience_counter >= patience_limit:
                    print("Early stopping triggered.")
                    break

    def evaluate(self, loader):
        self.model.eval()
        total_loss = 0
        
        with torch.no_grad():
            for batch in loader:
                batch = {k: v.to(self.device) for k, v in batch.items()}
                outputs = self.model(**batch)
                total_loss += outputs.loss.item()

        return total_loss / len(loader)

    def test(self, test_loader):
        self.model.eval()
        correct = 0
        total = 0

        with torch.no_grad():
            for batch in test_loader:
                batch = {k: v.to(self.device) for k, v in batch.items()}
                outputs = self.model(**batch)
                predictions = torch.argmax(outputs.logits, dim=-1)
                correct += (predictions == batch['labels']).sum().item()
                total += len(batch['labels'])

        accuracy = (correct / total) * 100
        return accuracy

def load_data(file_path, batch_size=16):
    # Load and prepare data
    df = pd.read_csv(file_path)
    print(f"Loaded dataset with {len(df)} samples.")  # Debugging line
    category = {}
    for i in range(len(df)):
        chatgpt = df.iloc[i]["paraphrases"][1:-1].split(', ')
        for j in chatgpt[:1]:
            category[j[1:-1]] = 'chatgpt'
        category[df.iloc[i]['text']] = "human"
    
    data = pd.DataFrame(category.items(), columns=["text", "category"])
    data = data.sample(frac=1)[:10000]  # Limit to 10000 samples
    print(f"Processed dataset with {len(data)} samples.")  # Debugging line
    
    # Convert categories to integers
    label_mapping = {'human': 0, 'chatgpt': 1}
    data['category'] = data['category'].map(label_mapping)
    
    # Split data
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        data['text'], data['category'], test_size=0.2)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.25)

    # Create datasets
    train_dataset = TextDataset(X_train, y_train)
    val_dataset = TextDataset(X_val, y_val)
    test_dataset = TextDataset(X_test, y_test)

    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    return train_loader, val_loader, test_loader

def main():
    # Load data
    train_loader, val_loader, test_loader = load_data('Z:/bert/chatgpt_paraphrases.csv')

    # Initialize trainer
    trainer = BertTrainer()

    # Define a custom path for saving the model
    custom_save_path = "Z:/bert/MODEL WE" # Customize this path

    # Train model
    trainer.train(train_loader, val_loader, save_path=custom_save_path)

    # Test model
    accuracy = trainer.test(test_loader)
    print(f"Test Accuracy: {accuracy:.2f}%")

if __name__ == "__main__":
    main()


Loaded dataset with 419197 samples.
Processed dataset with 10000 samples.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training epoch 1...


Epoch 1/3: 100%|██████████| 375/375 [04:06<00:00,  1.52it/s]


Epoch 1, Average Loss: 0.3843
Validation Loss: 0.3010
Training epoch 2...


Epoch 2/3: 100%|██████████| 375/375 [04:23<00:00,  1.43it/s]


Epoch 2, Average Loss: 0.1827
Validation Loss: 0.3205
Training epoch 3...


Epoch 3/3: 100%|██████████| 375/375 [04:23<00:00,  1.42it/s]


Epoch 3, Average Loss: 0.0710
Validation Loss: 0.3898
Early stopping triggered.
Test Accuracy: 87.10%
