In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load all data

In [None]:
import pickle
import pandas as pd

train_path = "drive/MyDrive/subtaskA_train_monolingual.jsonl"
dev_path = "drive/MyDrive/subtaskA_dev_monolingual.jsonl"

train_embeds_path = "drive/MyDrive/embeddings512.pkl"
dev_embeds_path = "drive/MyDrive/embeddings512_dev.pkl"

file_path = "drive/MyDrive/final-logLikelihoods-t5-base.pkl"
with open(file_path, 'rb') as f:
    log_likelihood = pickle.load(f)

with open(train_path, 'r') as f:
    train_data = pd.read_json(f, lines=True)

with open(train_embeds_path, 'rb') as f:
    train_embeds = pickle.load(f)

with open(dev_path, 'r') as f:
    dev_data = pd.read_json(f, lines=True)

with open(dev_embeds_path, 'rb') as f:
    dev_embeds = pickle.load(f)


In [None]:
def count_special_characters(text):
    return sum(not ch.isalnum() and not ch.isspace() for ch in text)

train_data['Special_Characters_Count'] = train_data['text'].apply(count_special_characters)
dev_data['Special_Characters_Count'] = dev_data['text'].apply(count_special_characters)

train_data['Text_Length'] = train_data['text'].apply(len)
dev_data['Text_Length'] = dev_data['text'].apply(len)


train_data['Special_Characters_Ratio'] = train_data['Special_Characters_Count'] / train_data['Text_Length']
dev_data['Special_Characters_Ratio'] = dev_data['Special_Characters_Count'] / dev_data['Text_Length']

train_data['Log_Likelihood_t5'] = log_likelihood[1]
dev_data['Log_Likelihood_t5'] =  log_likelihood[0]

train_data

Unnamed: 0,text,label,model,source,id,Special_Characters_Count,Text_Length,Special_Characters_Ratio,Log_Likelihood_t5
0,Forza Motorsport is a popular racing game that...,1,chatGPT,wikihow,0,82,2244,0.036542,-6.593756
1,Buying Virtual Console games for your Nintendo...,1,chatGPT,wikihow,1,132,3728,0.035408,-2.994959
2,Windows NT 4.0 was a popular operating system ...,1,chatGPT,wikihow,2,157,5237,0.029979,-2.987468
3,How to Make Perfume\n\nPerfume is a great way ...,1,chatGPT,wikihow,3,152,4729,0.032142,-3.862272
4,How to Convert Song Lyrics to a Song'\n\nConve...,1,chatGPT,wikihow,4,89,3095,0.028756,-4.151425
...,...,...,...,...,...,...,...,...,...
119752,"The paper is an interesting contribution, prim...",0,human,peerread,119752,14,440,0.031818,-5.399578
119753,\nWe thank the reviewers for all their comment...,0,human,peerread,119753,273,4837,0.056440,-8.677832
119754,The authors introduce a semi-supervised method...,0,human,peerread,119754,37,1109,0.033363,-6.234921
119755,This paper proposes the Neural Graph Machine t...,0,human,peerread,119755,25,1176,0.021259,-1.387176


In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np
def create_struct_features(dataset):
    column_names = ["Special_Characters_Count",	"Text_Length", "Special_Characters_Ratio"]
    struct_feature_list = dataset[column_names]
    # print(struct_feature_list)
    scaler = StandardScaler()
    struct_feature_normalized = scaler.fit_transform(struct_feature_list)
    # print(struct_feature_normalized)
    return struct_feature_normalized

## Define model and training loop

In [None]:
import torch
import torch.nn as nn
from torch.nn.functional import relu

class neuralnet(nn.Module):
    def __init__(self, embeds_size, structural_inputs_size, num_classes):
        super().__init__()

        self.hidden_embedd = nn.Linear(embeds_size, 350)
        self.hidden_struct = nn.Linear(structural_inputs_size, 4* structural_inputs_size)
        self.hidden_combined = nn.Linear(350 + 4* structural_inputs_size, num_classes)
        # self.double()
    def forward(self,embeds, structurals):
        embed_output = self.hidden_embedd(embeds)
        struct_output = self.hidden_struct(structurals)
        # print(embed_output.shape, struct_output.shape)
        combined = self.hidden_combined(torch.cat((embed_output,struct_output), dim=1))
        return torch.sigmoid(combined)




In [None]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

def training(
        train_embeds,
        train_struct_features,
        train_labels,
        dev__embeds,
        dev_struct_features,
        dev_labels,
        model,
        optimizer,
        criterion,
        epochs = 20,
        batch_size = 20
):
    data = list(zip(train_embeds,train_struct_features,train_labels))
    for epoch in range(epochs):
        random.shuffle(data)
        batches = []
        running_loss = 0.0
        for i in range(0,len(data),batch_size):
            batches.append(data[i:i+batch_size])
        print(len(data),len(batches))

        for batch in tqdm(batches):
            embeds, structural_features, labels = zip(*batch)
            inputs1 = torch.tensor(embeds) #, dtype = torch.float32
            inputs2 = torch.tensor(structural_features, dtype = torch.float32)
            labels = torch.tensor(labels)

            optimizer.zero_grad()

            outputs = model(inputs1, inputs2)
            loss = criterion(outputs,labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f'[{epoch + 1}] loss: {running_loss / len(batches):.3f}')
    predictions = model(torch.tensor(dev_embeds),torch.tensor(dev_struct_features,dtype = torch.float32)).tolist()
    predictions = list(map(np.argmax, predictions))

    accuracy = accuracy_score(predictions, dev_labels)
    report = classification_report(predictions, dev_labels)
    print(f"Accuracy on development data: {accuracy}")
    print(report)

Neural network with just embeddings

In [None]:
import random


epochs = 20
batch_size = 20

struct_features = create_struct_features(train_data)
train_inputs = [train_embeds, ]
train_labels = list(train_data["label"])
dev_struct_features = create_struct_features(dev_data)
dev_labels = dev_data["label"]
criterion = nn.CrossEntropyLoss()
model = neuralnet(len(train_embeds[0]), len(struct_features[0]), 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.003, momentum=0.9)

training(
        train_embeds,
        struct_features,
        train_labels,
        dev_embeds,
        dev_struct_features,
        dev_labels,
        model,
        optimizer,
        criterion,
        epochs = epochs,
        batch_size = batch_size
)

119757 5988


100%|██████████| 5988/5988 [00:21<00:00, 272.92it/s]


[1] loss: 0.441
119757 5988


100%|██████████| 5988/5988 [00:20<00:00, 287.11it/s]


[2] loss: 0.405
119757 5988


100%|██████████| 5988/5988 [00:21<00:00, 276.98it/s]


[3] loss: 0.398
119757 5988


100%|██████████| 5988/5988 [00:20<00:00, 292.91it/s]


[4] loss: 0.394
119757 5988


100%|██████████| 5988/5988 [00:21<00:00, 273.69it/s]


[5] loss: 0.392
119757 5988


100%|██████████| 5988/5988 [00:20<00:00, 290.21it/s]


[6] loss: 0.391
119757 5988


100%|██████████| 5988/5988 [00:22<00:00, 270.15it/s]


[7] loss: 0.390
119757 5988


100%|██████████| 5988/5988 [00:20<00:00, 287.53it/s]


[8] loss: 0.389
119757 5988


100%|██████████| 5988/5988 [00:22<00:00, 271.84it/s]


[9] loss: 0.388
119757 5988


100%|██████████| 5988/5988 [00:20<00:00, 294.35it/s]


[10] loss: 0.387
119757 5988


100%|██████████| 5988/5988 [00:21<00:00, 274.51it/s]


[11] loss: 0.388
119757 5988


100%|██████████| 5988/5988 [00:20<00:00, 288.92it/s]


[12] loss: 0.387
119757 5988


100%|██████████| 5988/5988 [00:22<00:00, 270.63it/s]


[13] loss: 0.387
119757 5988


100%|██████████| 5988/5988 [00:20<00:00, 288.72it/s]


[14] loss: 0.386
119757 5988


100%|██████████| 5988/5988 [00:22<00:00, 271.44it/s]


[15] loss: 0.386
119757 5988


100%|██████████| 5988/5988 [00:20<00:00, 292.79it/s]


[16] loss: 0.386
119757 5988


100%|██████████| 5988/5988 [00:22<00:00, 271.54it/s]


[17] loss: 0.386
119757 5988


100%|██████████| 5988/5988 [00:20<00:00, 288.86it/s]


[18] loss: 0.386
119757 5988


100%|██████████| 5988/5988 [00:22<00:00, 267.75it/s]


[19] loss: 0.385
119757 5988


100%|██████████| 5988/5988 [00:20<00:00, 295.25it/s]


[20] loss: 0.385
Accuracy on development data: 0.6798
              precision    recall  f1-score   support

           0       0.93      0.62      0.74      3767
           1       0.43      0.86      0.57      1233

    accuracy                           0.68      5000
   macro avg       0.68      0.74      0.66      5000
weighted avg       0.81      0.68      0.70      5000



In [None]:
class neuralnet2(nn.Module):
    def __init__(self, embeds_size, structural_inputs_size, num_classes):
        super().__init__()

        self.hidden_embedd = nn.Linear(embeds_size, 10)
        self.hidden_struct = nn.Linear(structural_inputs_size, 4* structural_inputs_size)
        self.hidden_combined = nn.Linear(10 + 4* structural_inputs_size, num_classes)
        # self.double()
    def forward(self,embeds, structurals):
        embed_output = self.hidden_embedd(embeds)
        struct_output = self.hidden_struct(structurals)
        # print(embed_output.shape, struct_output.shape)
        combined = self.hidden_combined(torch.cat((embed_output,struct_output), dim=1))
        return torch.sigmoid(combined)

In [None]:
criterion = nn.CrossEntropyLoss()
model = neuralnet2(len(train_embeds[0]), len(struct_features[0]), 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.003, momentum=0.9)

training(
        train_embeds,
        struct_features,
        train_labels,
        dev_embeds,
        dev_struct_features,
        dev_labels,
        model,
        optimizer,
        criterion,
        epochs = epochs,
        batch_size = batch_size
)

119757 5988


100%|██████████| 5988/5988 [00:16<00:00, 357.79it/s]


[1] loss: 0.445
119757 5988


100%|██████████| 5988/5988 [00:16<00:00, 370.17it/s]


[2] loss: 0.404
119757 5988


100%|██████████| 5988/5988 [00:16<00:00, 360.16it/s]


[3] loss: 0.398
119757 5988


100%|██████████| 5988/5988 [00:17<00:00, 344.13it/s]


[4] loss: 0.394
119757 5988


100%|██████████| 5988/5988 [00:16<00:00, 368.48it/s]


[5] loss: 0.392
119757 5988


100%|██████████| 5988/5988 [00:16<00:00, 367.27it/s]


[6] loss: 0.391
119757 5988


100%|██████████| 5988/5988 [00:16<00:00, 369.88it/s]


[7] loss: 0.390
119757 5988


100%|██████████| 5988/5988 [00:16<00:00, 370.86it/s]


[8] loss: 0.389
119757 5988


100%|██████████| 5988/5988 [00:16<00:00, 354.36it/s]


[9] loss: 0.388
119757 5988


100%|██████████| 5988/5988 [00:17<00:00, 350.19it/s]


[10] loss: 0.388
119757 5988


100%|██████████| 5988/5988 [00:16<00:00, 367.41it/s]


[11] loss: 0.387
119757 5988


100%|██████████| 5988/5988 [00:16<00:00, 365.98it/s]


[12] loss: 0.387
119757 5988


100%|██████████| 5988/5988 [00:16<00:00, 356.18it/s]


[13] loss: 0.387
119757 5988


100%|██████████| 5988/5988 [00:16<00:00, 358.08it/s]


[14] loss: 0.386
119757 5988


100%|██████████| 5988/5988 [00:17<00:00, 339.25it/s]


[15] loss: 0.386
119757 5988


100%|██████████| 5988/5988 [00:16<00:00, 366.73it/s]


[16] loss: 0.386
119757 5988


100%|██████████| 5988/5988 [00:16<00:00, 367.45it/s]


[17] loss: 0.386
119757 5988


100%|██████████| 5988/5988 [00:16<00:00, 367.62it/s]


[18] loss: 0.385
119757 5988


100%|██████████| 5988/5988 [00:16<00:00, 363.46it/s]


[19] loss: 0.385
119757 5988


100%|██████████| 5988/5988 [00:17<00:00, 337.16it/s]


[20] loss: 0.385
Accuracy on development data: 0.6566
              precision    recall  f1-score   support

           0       0.95      0.60      0.73      3979
           1       0.36      0.88      0.51      1021

    accuracy                           0.66      5000
   macro avg       0.66      0.74      0.62      5000
weighted avg       0.83      0.66      0.69      5000

