In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# Directory that holds the competition CSV files. The path is relative, so it is
# resolved against the notebook's current working directory.
PATH_TO_DATA: Path = Path('../input/detecting-generated-scientific-papers/')

In [3]:
def _read_split(file_name):
    """Read one CSV from PATH_TO_DATA and use its `id` column as the index."""
    return pd.read_csv(PATH_TO_DATA / file_name, index_col="id")


# NOTE(review): `train_df` is read from the "*_test_public_extended" file and
# `test_df` from the "*_train_part_public_extended" file. Confirm that this
# swap is intentional; the file choice is kept exactly as it was.
train_df = _read_split("fake_papers_test_public_extended.csv")
test_df = _read_split("fake_papers_train_part_public_extended.csv")

In [4]:
# Columns are picked by position (the `id` column is already the index):
# position 0 feeds the tokenizer later on, position 2 becomes the label tensor.
# NOTE(review): presumably these are the text and label columns -- confirm against the CSV header.
x_train, y_train = train_df.iloc[:, 0], train_df.iloc[:, 2]

In [5]:
# Same positional selection as for the training frame:
# position 0 is the model input, position 2 is the label.
# NOTE(review): assumes both CSVs share the same column order -- confirm.
x_test, y_test = test_df.iloc[:, 0], test_df.iloc[:, 2]

In [6]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the pretrained tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
)


caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
# Materialise the label columns as plain Python lists so they can be handed
# to `torch.tensor` when the datasets are built.
y_train, y_test = list(y_train), list(y_test)

In [8]:

def _encode_texts(texts):
    """Tokenize a collection of strings into padded, truncated PyTorch tensors.

    Args:
        texts: Iterable of strings (a pandas Series works). It is converted to
            a plain list because the tokenizer's `__call__` only accepts
            str / list / tuple inputs.

    Returns:
        The tokenizer's batch encoding; the notebook reads `input_ids` and
        `attention_mask` from it.
    """
    # `tokenizer(...)` replaces the deprecated `batch_encode_plus` call and
    # takes the same padding / truncation arguments.
    return tokenizer(
        list(texts),
        padding=True,        # pad every sequence to the longest one in the batch
        truncation=True,     # cut sequences that exceed `max_length`
        max_length=512,
        return_tensors='pt',
    )


# Convert the text columns to input tensors using the tokenizer.
encoded_train = _encode_texts(x_train)
encoded_test = _encode_texts(x_test)

# Create TensorDatasets for training and testing. Labels are stored as int64
# because nn.CrossEntropyLoss expects integer class indices as targets.
train_dataset = TensorDataset(
    encoded_train['input_ids'],
    encoded_train['attention_mask'],
    torch.tensor(y_train, dtype=torch.long),
)
test_dataset = TensorDataset(
    encoded_test['input_ids'],
    encoded_test['attention_mask'],
    torch.tensor(y_test, dtype=torch.long),
)


In [9]:
# Classifier wrapper used for fine-tuning
class MiniBERT(nn.Module):
    """Wrapper around a pretrained BERT sequence classifier that returns raw logits.

    Despite the class name, the checkpoint that is loaded is 'bert-base-uncased'.
    """

    def __init__(self, num_classes):
        """Load `BertForSequenceClassification` with `num_classes` output labels."""
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=num_classes,
        )

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return only the `logits` field of its output."""
        return self.bert(input_ids=input_ids, attention_mask=attention_mask).logits

# Training configuration: label count and mini-batch size,
# then optimizer step size and number of passes over the training data.
num_classes, batch_size = 2, 16
learning_rate, num_epochs = 2e-5, 5

# Build the classifier and move its parameters onto the selected device.
model = MiniBERT(num_classes)
model = model.to(device)


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [10]:

# Cross-entropy over the class logits, optimised with AdamW.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)

# Mini-batch iterators: training batches are reshuffled every epoch,
# evaluation batches keep the dataset order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)


In [11]:
from sklearn.metrics import f1_score
# Fine-tune for `num_epochs` passes; after each pass, score the model on `test_loader`.
for epoch in range(num_epochs):
    # ---- optimisation pass over the training batches ----
    model.train()
    for batch in train_loader:
        # Each batch is (input_ids, attention_mask, labels); move all three to the device.
        input_ids, attention_mask, labels = (t.to(device) for t in batch)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

    # ---- evaluation pass (no gradients tracked) ----
    model.eval()
    correct, total = 0, 0
    all_labels, all_preds = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = (t.to(device) for t in batch)

            logits = model(input_ids, attention_mask)
            # Predicted class = index of the largest logit in each row.
            predicted = logits.argmax(dim=1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(predicted.cpu().numpy())

    f1 = f1_score(all_labels, all_preds)
    accuracy = correct / total

    print(f"Epoch {epoch+1} | Accuracy: {accuracy}")
    print(f"Epoch {epoch+1} | F1 Score: {f1:.4f}")

# Persist the weights as they stand after the final epoch.
torch.save(model.state_dict(), "mini_bert_model.pth")

Epoch 1 | Accuracy: 0.9813084112149533
Epoch 1 | F1 Score: 0.9865
Epoch 2 | Accuracy: 0.9874766355140187
Epoch 2 | F1 Score: 0.9908
Epoch 3 | Accuracy: 0.9809345794392523
Epoch 3 | F1 Score: 0.9862
Epoch 4 | Accuracy: 0.9685981308411215
Epoch 4 | F1 Score: 0.9774
Epoch 5 | Accuracy: 0.9880373831775701
Epoch 5 | F1 Score: 0.9912
