In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load all data

In [None]:
import pickle
import pandas as pd

train_path = "drive/MyDrive/subtaskA_train_monolingual.jsonl"
dev_path = "drive/MyDrive/subtaskA_dev_monolingual.jsonl"

train_embeds_path = "drive/MyDrive/embeddings512.pkl"
dev_embeds_path = "drive/MyDrive/embeddings512_dev.pkl"

with open(train_path, 'r') as f:
    train_data = pd.read_json(f, lines=True)

with open(train_embeds_path, 'rb') as f:
    train_embeds = pickle.load(f)

with open(dev_path, 'r') as f:
    dev_data = pd.read_json(f, lines=True)

with open(dev_embeds_path, 'rb') as f:
    dev_embeds = pickle.load(f)


In [None]:
train_data

Unnamed: 0,text,label,model,source,id
0,Forza Motorsport is a popular racing game that...,1,chatGPT,wikihow,0
1,Buying Virtual Console games for your Nintendo...,1,chatGPT,wikihow,1
2,Windows NT 4.0 was a popular operating system ...,1,chatGPT,wikihow,2
3,How to Make Perfume\n\nPerfume is a great way ...,1,chatGPT,wikihow,3
4,How to Convert Song Lyrics to a Song'\n\nConve...,1,chatGPT,wikihow,4
...,...,...,...,...,...
119752,"The paper is an interesting contribution, prim...",0,human,peerread,119752
119753,\nWe thank the reviewers for all their comment...,0,human,peerread,119753
119754,The authors introduce a semi-supervised method...,0,human,peerread,119754
119755,This paper proposes the Neural Graph Machine t...,0,human,peerread,119755


In [None]:
# sanity check
len(labels), len(inputs)

NameError: ignored

## Define model and training loop

In [None]:
import torch
import torch.nn as nn
from torch.nn.functional import relu

class neuralnet(nn.Module):
    def __init__(self, input_size, output_size):
        super().__init__()
        self.hidden_layer = nn.Linear(input_size, output_size)
        self.double()
    def forward(self,embeds):
        return torch.sigmoid(self.hidden_layer(embeds))




In [None]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

def training(
        train_inputs,
        train_labels,
        dev_inputs,
        dev_labels,
        model,
        optimizer,
        criterion,
        epochs = 20,
        batch_size = 20
):
    data = list(zip(train_inputs,train_labels))
    for epoch in range(epochs):
        random.shuffle(data)
        batches = []
        running_loss = 0.0
        for i in range(0,len(data),batch_size):
            batches.append(data[i:i+batch_size])
        print(len(data),len(batches))

        for batch in tqdm(batches):
            inputs, labels = zip(*batch)
            inputs = torch.tensor(inputs) #, dtype = torch.float32
            labels = torch.tensor(labels)

            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs,labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f'[{epoch + 1}] loss: {running_loss / len(batches):.3f}')
    predictions = model(torch.tensor(dev_inputs)).tolist()
    predictions = list(map(np.argmax, predictions))

    accuracy = accuracy_score(predictions, dev_labels)
    report = classification_report(predictions, dev_labels)
    print(f"Accuracy on development data: {accuracy}")
    print(report)

Neural network with just embeddings

In [None]:
import random


epochs = 20
batch_size = 20
inputs = train_embeds
labels = list(train_data["label"])
dev_inputs = dev_embeds
dev_labels = dev_data["label"]
criterion = nn.CrossEntropyLoss()
model = neuralnet(len(inputs[0]),2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.003, momentum=0.9)

training(
        inputs,
        labels,
        dev_inputs,
        dev_labels,
        model,
        optimizer,
        criterion,
        epochs = epochs,
        batch_size = batch_size
)

119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 379.37it/s]


[1] loss: 0.473
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 406.31it/s]


[2] loss: 0.436
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 407.20it/s]


[3] loss: 0.426
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 406.45it/s]


[4] loss: 0.421
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 409.32it/s]


[5] loss: 0.417
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 412.13it/s]


[6] loss: 0.415
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 411.83it/s]


[7] loss: 0.412
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 409.17it/s]


[8] loss: 0.411
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 408.80it/s]


[9] loss: 0.409
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 407.35it/s]


[10] loss: 0.408
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 409.94it/s]


[11] loss: 0.407
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 407.45it/s]


[12] loss: 0.406
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 413.44it/s]


[13] loss: 0.405
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 410.39it/s]


[14] loss: 0.404
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 412.05it/s]


[15] loss: 0.404
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 411.48it/s]


[16] loss: 0.403
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 409.58it/s]


[17] loss: 0.403
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 409.52it/s]


[18] loss: 0.402
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 408.46it/s]


[19] loss: 0.401
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 406.42it/s]


[20] loss: 0.401
Accuracy on development data: 0.6486
              precision    recall  f1-score   support

           0       0.92      0.60      0.72      3877
           1       0.37      0.83      0.52      1123

    accuracy                           0.65      5000
   macro avg       0.65      0.71      0.62      5000
weighted avg       0.80      0.65      0.68      5000



Creating structural features

In [None]:
def count_special_characters(text):
    return sum(not ch.isalnum() and not ch.isspace() for ch in text)

train_data['Special_Characters_Count'] = train_data['text'].apply(count_special_characters)
dev_data['Special_Characters_Count'] = dev_data['text'].apply(count_special_characters)

train_data['Text_Length'] = train_data['text'].apply(len)
dev_data['Text_Length'] = dev_data['text'].apply(len)


train_data['Special_Characters_Ratio'] = train_data['Special_Characters_Count'] / train_data['Text_Length']
dev_data['Special_Characters_Ratio'] = dev_data['Special_Characters_Count'] / dev_data['Text_Length']


In [None]:
train_data

Unnamed: 0,text,label,model,source,id,Special_Characters_Count,Text_Length,Special_Characters_Ratio
0,Forza Motorsport is a popular racing game that...,1,chatGPT,wikihow,0,82,2244,0.036542
1,Buying Virtual Console games for your Nintendo...,1,chatGPT,wikihow,1,132,3728,0.035408
2,Windows NT 4.0 was a popular operating system ...,1,chatGPT,wikihow,2,157,5237,0.029979
3,How to Make Perfume\n\nPerfume is a great way ...,1,chatGPT,wikihow,3,152,4729,0.032142
4,How to Convert Song Lyrics to a Song'\n\nConve...,1,chatGPT,wikihow,4,89,3095,0.028756
...,...,...,...,...,...,...,...,...
119752,"The paper is an interesting contribution, prim...",0,human,peerread,119752,14,440,0.031818
119753,\nWe thank the reviewers for all their comment...,0,human,peerread,119753,273,4837,0.056440
119754,The authors introduce a semi-supervised method...,0,human,peerread,119754,37,1109,0.033363
119755,This paper proposes the Neural Graph Machine t...,0,human,peerread,119755,25,1176,0.021259


In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np
def append_features(original_input, new_feature):
    new_feature_list = []
    scaler = StandardScaler()
    new_feature_normalized = scaler.fit_transform(np.array(new_feature).reshape(-1,1))
    new_feature_normalized = list(new_feature_normalized.flatten())
    for original, new in zip(original_input,new_feature_normalized):
        new_feature_list.append(original + [new])
    return new_feature_list

embedding+special character count model



In [None]:
inputs = append_features(train_embeds, list(train_data["Special_Characters_Count"] ))
labels = list(train_data["label"])
dev_inputs = append_features(dev_embeds,list(dev_data["Special_Characters_Count"] ))
dev_labels = dev_data["label"]
criterion = nn.CrossEntropyLoss()
model = neuralnet(len(inputs[0]),2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.007, momentum=0.9)

training(
        inputs,
        labels,
        dev_inputs,
        dev_labels,
        model,
        optimizer,
        criterion,
)

119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 402.60it/s]


[1] loss: 0.595
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 404.31it/s]


[2] loss: 0.462
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 393.60it/s]


[3] loss: 0.440
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 410.21it/s]


[4] loss: 0.433
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 399.62it/s]


[5] loss: 0.424
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 397.50it/s]


[6] loss: 0.421
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 393.91it/s]


[7] loss: 0.420
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 396.69it/s]


[8] loss: 0.418
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 402.81it/s]


[9] loss: 0.416
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 394.78it/s]


[10] loss: 0.414
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 390.58it/s]


[11] loss: 0.414
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 393.33it/s]


[12] loss: 0.412
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 407.98it/s]


[13] loss: 0.411
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 405.94it/s]


[14] loss: 0.410
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 402.93it/s]


[15] loss: 0.409
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 407.33it/s]


[16] loss: 0.408
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 402.86it/s]


[17] loss: 0.408
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 404.86it/s]


[18] loss: 0.408
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 385.09it/s]


[19] loss: 0.408
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 405.90it/s]


[20] loss: 0.408
Accuracy on development data: 0.6548
              precision    recall  f1-score   support

           0       0.95      0.60      0.73      3966
           1       0.36      0.87      0.51      1034

    accuracy                           0.65      5000
   macro avg       0.65      0.74      0.62      5000
weighted avg       0.83      0.65      0.69      5000



sum embedding + sp. cc

In [None]:
train_embeds_sum = [ [sum(x)] for x in train_embeds]
inputs = append_features(train_embeds_sum, list(train_data["Special_Characters_Count"] ))
labels = list(train_data["label"])

dev_embeds_sum = [ [sum(x)] for x in dev_embeds]
dev_inputs = append_features(dev_embeds_sum,list(dev_data["Special_Characters_Count"] ))
dev_labels = dev_data["label"]
criterion = nn.CrossEntropyLoss()
model = neuralnet(len(inputs[0]),2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.007, momentum=0.9)

training(
        inputs,
        labels,
        dev_inputs,
        dev_labels,
        model,
        optimizer,
        criterion,
)

119757 5988


100%|██████████| 5988/5988 [00:03<00:00, 1834.21it/s]


[1] loss: 0.693
119757 5988


100%|██████████| 5988/5988 [00:02<00:00, 2091.41it/s]


[2] loss: 0.693
119757 5988


100%|██████████| 5988/5988 [00:03<00:00, 1962.43it/s]


[3] loss: 0.693
119757 5988


100%|██████████| 5988/5988 [00:02<00:00, 2293.84it/s]


[4] loss: 0.693
119757 5988


100%|██████████| 5988/5988 [00:02<00:00, 2243.42it/s]


[5] loss: 0.693
119757 5988


100%|██████████| 5988/5988 [00:02<00:00, 2180.82it/s]


[6] loss: 0.693
119757 5988


100%|██████████| 5988/5988 [00:03<00:00, 1875.34it/s]


[7] loss: 0.693
119757 5988


100%|██████████| 5988/5988 [00:02<00:00, 2300.95it/s]


[8] loss: 0.693
119757 5988


100%|██████████| 5988/5988 [00:02<00:00, 2265.49it/s]


[9] loss: 0.693
119757 5988


100%|██████████| 5988/5988 [00:02<00:00, 2263.00it/s]


[10] loss: 0.693
119757 5988


100%|██████████| 5988/5988 [00:02<00:00, 2101.30it/s]


[11] loss: 0.693
119757 5988


100%|██████████| 5988/5988 [00:03<00:00, 1922.30it/s]


[12] loss: 0.693
119757 5988


100%|██████████| 5988/5988 [00:02<00:00, 2269.86it/s]


[13] loss: 0.693
119757 5988


100%|██████████| 5988/5988 [00:02<00:00, 2257.24it/s]


[14] loss: 0.693
119757 5988


100%|██████████| 5988/5988 [00:02<00:00, 2286.53it/s]


[15] loss: 0.693
119757 5988


100%|██████████| 5988/5988 [00:02<00:00, 2037.66it/s]


[16] loss: 0.693
119757 5988


100%|██████████| 5988/5988 [00:03<00:00, 1986.76it/s]


[17] loss: 0.693
119757 5988


100%|██████████| 5988/5988 [00:02<00:00, 2246.78it/s]


[18] loss: 0.693
119757 5988


100%|██████████| 5988/5988 [00:02<00:00, 2292.24it/s]


[19] loss: 0.693
119757 5988


100%|██████████| 5988/5988 [00:02<00:00, 2259.71it/s]


[20] loss: 0.693
Accuracy on development data: 0.5
              precision    recall  f1-score   support

           0       1.00      0.50      0.67      5000
           1       0.00      0.00      0.00         0

    accuracy                           0.50      5000
   macro avg       0.50      0.25      0.33      5000
weighted avg       1.00      0.50      0.67      5000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


embeds + scc + text length (normalized)

In [None]:
inputs = append_features(train_embeds, list(train_data["Special_Characters_Count"] ))
inputs = append_features(inputs, list(train_data["Text_Length"] ))

labels = list(train_data["label"])
dev_inputs = append_features(dev_embeds,list(dev_data["Special_Characters_Count"] ))
dev_inputs = append_features(dev_inputs, list(dev_data["Text_Length"] ))
dev_labels = dev_data["label"]
criterion = nn.CrossEntropyLoss()
model = neuralnet(len(inputs[0]),2)
print(len(inputs[0]))
optimizer = torch.optim.SGD(model.parameters(), lr=0.007, momentum=0.9)

training(
        inputs,
        labels,
        dev_inputs,
        dev_labels,
        model,
        optimizer,
        criterion
)

770
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 386.80it/s]


[1] loss: 0.447
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 401.34it/s]


[2] loss: 0.417
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 397.44it/s]


[3] loss: 0.410
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 398.52it/s]


[4] loss: 0.406
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 395.28it/s]


[5] loss: 0.404
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 399.47it/s]


[6] loss: 0.402
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 402.31it/s]


[7] loss: 0.400
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 403.02it/s]


[8] loss: 0.399
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 396.51it/s]


[9] loss: 0.398
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 397.76it/s]


[10] loss: 0.397
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 394.66it/s]


[11] loss: 0.396
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 386.81it/s]


[12] loss: 0.395
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 384.53it/s]


[13] loss: 0.395
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 392.62it/s]


[14] loss: 0.394
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 389.12it/s]


[15] loss: 0.393
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 401.93it/s]


[16] loss: 0.393
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 398.27it/s]


[17] loss: 0.393
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 397.78it/s]


[18] loss: 0.392
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 397.89it/s]


[19] loss: 0.392
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 394.91it/s]


[20] loss: 0.391
Accuracy on development data: 0.6666
              precision    recall  f1-score   support

           0       0.94      0.61      0.74      3875
           1       0.39      0.87      0.54      1125

    accuracy                           0.67      5000
   macro avg       0.67      0.74      0.64      5000
weighted avg       0.82      0.67      0.69      5000



Adding the log losses

In [None]:
file_path = "drive/MyDrive/final-logLikelihoods-t5-base.pkl"
with open(file_path, 'rb') as f:
    log_likelihood = pickle.load(f)

119757

In [None]:
train_data['Log_Likelihood_t5'] = log_likelihood[1]
dev_data['Log_Likelihood_t5'] =  log_likelihood[0]

train_data

Unnamed: 0,text,label,model,source,id,Special_Characters_Count,Text_Length,Special_Characters_Ratio,Log_Likelihood_t5
0,Forza Motorsport is a popular racing game that...,1,chatGPT,wikihow,0,82,2244,0.036542,-6.593756
1,Buying Virtual Console games for your Nintendo...,1,chatGPT,wikihow,1,132,3728,0.035408,-2.994959
2,Windows NT 4.0 was a popular operating system ...,1,chatGPT,wikihow,2,157,5237,0.029979,-2.987468
3,How to Make Perfume\n\nPerfume is a great way ...,1,chatGPT,wikihow,3,152,4729,0.032142,-3.862272
4,How to Convert Song Lyrics to a Song'\n\nConve...,1,chatGPT,wikihow,4,89,3095,0.028756,-4.151425
...,...,...,...,...,...,...,...,...,...
119752,"The paper is an interesting contribution, prim...",0,human,peerread,119752,14,440,0.031818,-5.399578
119753,\nWe thank the reviewers for all their comment...,0,human,peerread,119753,273,4837,0.056440,-8.677832
119754,The authors introduce a semi-supervised method...,0,human,peerread,119754,37,1109,0.033363,-6.234921
119755,This paper proposes the Neural Graph Machine t...,0,human,peerread,119755,25,1176,0.021259,-1.387176


In [None]:
inputs = append_features(train_embeds, list(train_data["Special_Characters_Count"] ))
inputs = append_features(inputs, list(train_data["Text_Length"] ))
inputs = append_features(inputs, list(train_data["Log_Likelihood_t5"] ))

labels = list(train_data["label"])
dev_inputs = append_features(dev_embeds,list(dev_data["Special_Characters_Count"] ))
dev_inputs = append_features(dev_inputs, list(dev_data["Text_Length"] ))
dev_inputs = append_features(dev_inputs, list(dev_data["Log_Likelihood_t5"] ))
dev_labels = dev_data["label"]
criterion = nn.CrossEntropyLoss()
model = neuralnet(len(inputs[0]),2)
print(len(inputs[0]))
optimizer = torch.optim.SGD(model.parameters(), lr=0.007, momentum=0.9)

training(
        inputs,
        labels,
        dev_inputs,
        dev_labels,
        model,
        optimizer,
        criterion
)

771
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 387.73it/s]


[1] loss: 0.447
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 396.46it/s]


[2] loss: 0.417
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 400.67it/s]


[3] loss: 0.410
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 400.17it/s]


[4] loss: 0.406
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 403.30it/s]


[5] loss: 0.404
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 385.31it/s]


[6] loss: 0.402
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 382.49it/s]


[7] loss: 0.400
119757 5988


100%|██████████| 5988/5988 [00:16<00:00, 370.70it/s]


[8] loss: 0.399
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 398.29it/s]


[9] loss: 0.398
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 399.72it/s]


[10] loss: 0.397
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 400.42it/s]


[11] loss: 0.396
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 401.89it/s]


[12] loss: 0.395
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 400.33it/s]


[13] loss: 0.394
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 396.64it/s]


[14] loss: 0.394
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 398.92it/s]


[15] loss: 0.393
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 399.86it/s]


[16] loss: 0.393
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 396.67it/s]


[17] loss: 0.393
119757 5988


100%|██████████| 5988/5988 [00:15<00:00, 398.85it/s]


[18] loss: 0.392
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 401.03it/s]


[19] loss: 0.392
119757 5988


100%|██████████| 5988/5988 [00:14<00:00, 400.00it/s]


[20] loss: 0.391
Accuracy on development data: 0.6492
              precision    recall  f1-score   support

           0       0.95      0.59      0.73      3998
           1       0.35      0.87      0.50      1002

    accuracy                           0.65      5000
   macro avg       0.65      0.73      0.61      5000
weighted avg       0.83      0.65      0.68      5000

