In [1]:
from transformers import XLNetTokenizer , TFXLNetModel
from tqdm import tqdm
import pandas as pd 
import numpy as np
import random
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Attention, Input, Dense, LSTM, Embedding, Bidirectional, Dropout, TimeDistributed, concatenate, MaxPooling1D, Activation, Add, Flatten, Conv1D, BatchNormalization
from tensorflow.keras.optimizers import Adam
import os
import datetime

2023-12-26 17:02:15.758801: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
train_big = pd.read_csv("Datasets/train_v2_drcat_02.csv", sep=',')
org_train = pd.read_csv('Datasets/train_essays.csv')
train_small = pd.read_csv('Datasets/argugpt.csv')

In [3]:
select = []
for text in train_small.text.values:
    if(len(text.split()) > 100):
        select.append(1)
    else:
        select.append(0)

In [4]:
select.count(1)

3896

In [5]:
train_small_1 = train_small.copy()
train_small_1['select'] = select

In [6]:
dataset_small_1 = train_small_1[train_small_1['select'] == 1].copy()
dataset_small_1['label'] = 1
dataset_small_1 = dataset_small_1[['text','label']]

del train_small_1

In [7]:
select = []
for text in train_big[train_big['label']==1].text.values:
    if(len(text.split()) > 100):
        select.append(1)
    else:
        select.append(0)

In [8]:
select.count(1)

17417

In [9]:
train_big_1 = train_big[train_big['label']==1].copy()
train_big_1['select'] = select

In [10]:
dataset_big_1 = train_big_1[train_big_1['select'] == 1].sample(n=9100,random_state=42)
dataset_big_1 = dataset_big_1[['text','label']]
del train_big_1

In [11]:
dataset_1 = pd.concat([dataset_big_1,dataset_small_1])
dataset_1.reset_index(drop=True,inplace=True)
del dataset_big_1
del dataset_small_1

In [12]:
dataset_1.rename(columns={'label':'generated'})

Unnamed: 0,text,generated
0,When it comes to having someone attempt to mak...,1
1,"Dear State Senator, \n\nI am writing to you ...",1
2,"Hey, Mrs. Johnson! Here's my essay on the cons...",1
3,"In recent years, there has been a growing move...",1
4,"Dear Senator,\n\nI am writing to you today to ...",1
...,...,...
12991,The notion that one must be forced to defend a...,1
12992,I strongly agree with the statement that menta...,1
12993,"In today’s world, where competition is highly ...",1
12994,Education is one of the most powerful tools th...,1


In [13]:
select = []
for text in org_train[org_train['generated']==0].text.values:
    if(len(text.split()) > 100):
        select.append(1)
    else:
        select.append(0)

In [14]:
select.count(1)

1375

In [15]:
train_small_0 = org_train[org_train['generated']==0].copy()
train_small_0['select'] = select

In [16]:
dataset_small_0 = train_small_0[['text','generated']].copy()
del train_small_0

In [17]:
select = []
for text in train_big[train_big['label']==0].text.values:
    if(len(text.split()) > 100):
        select.append(1)
    else:
        select.append(0)

In [18]:
select.count(1)

27371

In [19]:
train_big_0 = train_big[train_big['label']==0].copy()
train_big_0['select'] = select
train_big_0 = train_big_0[train_big_0['select']==1].sample(n=6700,random_state=42)
dataset_big_0 = train_big_0[['text','label']].copy()
dataset_big_0.rename(columns={'label':'generated'}, inplace=True)
del train_big_0

In [20]:
dataset_0 = pd.concat([dataset_big_0,dataset_small_0])
dataset_0.reset_index(drop=True, inplace=True)
del dataset_big_0
del dataset_small_0

In [21]:
dataset_1.rename(columns={'label':'generated'}, inplace=True)

In [22]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)

In [23]:
cls, sep = tokenizer.convert_tokens_to_ids(["<cls>", "<sep>"])

In [24]:
def create_data(dataset,MAX_LEN=256):
    
    print("==========Tokenizing=============")
    sentences = dataset.text.values
    labels = dataset.generated.values
    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
    
    print("==========Tokens to inputs=============")
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
    for inputs in input_ids:
        inputs[254] = sep
        inputs[255] = cls
    
    # Create attention masks
    attention_masks = []

    # Create a mask of 1s for each token followed by 0s for padding
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)
    
    print("==========Splitting=============")
        
    # Split into training and temporary set
    train_inputs, temp_inputs = train_test_split(input_ids,random_state=42, test_size=0.2)

    # Split the temporary set into test and dev sets
    validation_inputs, test_inputs = train_test_split(temp_inputs,random_state=42, test_size=0.5)

    del temp_inputs
    
    # Split into training and temporary set
    train_mask, temp_mask = train_test_split(attention_masks,random_state=42, test_size=0.2)

    # Split the temporary set into test and dev sets
    validation_mask, test_mask = train_test_split(temp_mask, random_state=42, test_size=0.5)

    del temp_mask
    
    return {'inputs':(train_inputs,validation_inputs,test_inputs), 'masks' : (train_mask, validation_mask, test_mask)}
    
    
    

In [25]:
out = create_data(dataset_1)



In [26]:
train_inputs_1,validation_inputs_1,test_inputs_1 = out['inputs']
train_mask_1, validation_mask_1, test_mask_1 = out['masks']

In [27]:
train_labels_1 , validation_labels_1 , test_labels_1 = (np.ones(shape=(train_inputs_1.shape[0])), 
                                                        np.ones(shape=(validation_inputs_1.shape[0])),
                                                        np.ones(shape=(test_inputs_1.shape[0]))
                                                       )

In [28]:
out = create_data(dataset_0)



In [29]:
train_inputs_0,validation_inputs_0,test_inputs_0 = out['inputs']
train_mask_0, validation_mask_0, test_mask_0 = out['masks']

In [30]:
train_labels_0 , validation_labels_0 , test_labels_0 = (np.zeros(shape=(train_inputs_0.shape[0])), 
                                                        np.zeros(shape=(validation_inputs_0.shape[0])),
                                                        np.zeros(shape=(test_inputs_0.shape[0]))
                                                       )

In [31]:
def concat_data(zeros,ones):
    zeros = pd.DataFrame(zeros)
    ones = pd.DataFrame(ones)
    out = pd.concat([ones,zeros])
    out.sample(frac=1).reset_index(drop=True)
    return np.array(out)
    

In [32]:
train_inputs = concat_data(train_inputs_0,train_inputs_1)
validation_inputs = concat_data(validation_inputs_0,validation_inputs_1)
test_inputs = concat_data(test_inputs_0,test_inputs_1)

In [33]:
train_labels = concat_data(train_labels_0, train_labels_1)
validation_labels = concat_data(validation_labels_0, validation_labels_1)
test_labels = concat_data(test_labels_0, test_labels_1)

In [34]:
train_mask = concat_data(train_mask_0, train_mask_1)
validation_mask = concat_data(validation_mask_0, validation_mask_1)
test_mask = concat_data(test_mask_0, test_mask_1)

In [35]:
np.array(train_inputs).shape

(16856, 256)

In [36]:
train_mask.shape

(16856, 256)

In [37]:
del train_inputs_0
del train_inputs_1
del test_inputs_0
del test_inputs_1
del validation_inputs_0
del validation_labels_1
del train_labels_0
del train_labels_1
del train_mask_0
del train_mask_1
del test_labels_0
del test_labels_1
del test_mask_0
del test_mask_1
del validation_mask_0
del validation_mask_1
del validation_inputs_1
del validation_labels_0

In [38]:
def create_model():
    input_ids = Input(shape=(256,), dtype='int64')
    attention_mask = Input(shape=(256,), dtype='int64')

    print('Loading XLNetModel')
    xlnetModel = TFXLNetModel.from_pretrained('xlnet-base-cased')
    conv1D_shared = Conv1D(64, kernel_size=(7), strides=(2),kernel_regularizer=tf.keras.regularizers.l2(0.01))
    batchN = BatchNormalization()
    activa = Activation('relu')
#     attent = Attention(use_scale=True)
    
    xlnetout = xlnetModel.transformer({"input_ids": input_ids, "attention_mask": attention_mask})
    x = conv1D_shared(xlnetout.last_hidden_state)
    x = batchN(x)
    x = activa(x)
#     x = attent([x, x])
    x = MaxPooling1D((3), strides=(2))(x)
    # x = Dense(1, activation="relu")(x)
    x = Flatten()(x)
    x = Dense(1, activation="sigmoid")(x)
    x = Dropout(0.3)(x)
    
    model = Model(inputs=[input_ids, attention_mask,], outputs=x)
    model.summary()
    adam = Adam(learning_rate=0.00001)
    model.compile(optimizer=adam, loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), metrics=['acc', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

    return model

In [39]:
    # Include the epoch in the file name (uses `str.format`)
#     checkpoint_path = "./tripletloss/training_model/cp-{epoch:04d}"
    checkpoint_path = "./model/training_model/cp-{epoch:04d}"
    checkpoint_dir = os.path.dirname(checkpoint_path)

    # Create a callback that saves the model's weights every epoch
    cp_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path,
        verbose=1,
        save_freq="epoch")

#     log_dir = "./tripletloss/logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = "./model/logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, profile_batch=0)
    # %load_ext tensorboard
    # %tensorboard --logdir logs/fit


In [40]:
model = create_model()

Loading XLNetModel


2023-12-26 17:03:39.640249: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-12-26 17:03:39.644146: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-12-26 17:03:39.646068: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 256)]                0         []                            
                                                                                                  
 input_1 (InputLayer)        [(None, 256)]                0         []                            
                                                                                                  
 transformer (TFXLNetMainLa  TFXLNetModelOutput(last_hi   1167183   ['input_2[0][0]',             
 yer)                        dden_state=(None, 256, 768   36         'input_1[0][0]']             
                             ),                                                                   
                              mems=((256, None, 768),                                         

In [42]:
batch_size = 16
model.fit(x=[train_inputs,train_mask], y=train_labels,
          validation_data=([validation_inputs,validation_mask], validation_labels),
          batch_size=batch_size,epochs=4,verbose=1,
          shuffle=True,
          callbacks=[tensorboard_callback, cp_callback]
          )

Epoch 1/4

2023-12-26 17:28:59.801521: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 5898240000 exceeds 10% of free system memory.



Epoch 1: saving model to ./model/training_model/cp-0001
INFO:tensorflow:Assets written to: ./model/training_model/cp-0001/assets


INFO:tensorflow:Assets written to: ./model/training_model/cp-0001/assets


Epoch 2/4

2023-12-26 17:53:46.100245: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 5898240000 exceeds 10% of free system memory.



Epoch 2: saving model to ./model/training_model/cp-0002
















































INFO:tensorflow:Assets written to: ./model/training_model/cp-0002/assets


INFO:tensorflow:Assets written to: ./model/training_model/cp-0002/assets


Epoch 3/4

2023-12-26 18:17:58.016324: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 5898240000 exceeds 10% of free system memory.



Epoch 3: saving model to ./model/training_model/cp-0003
















































INFO:tensorflow:Assets written to: ./model/training_model/cp-0003/assets


INFO:tensorflow:Assets written to: ./model/training_model/cp-0003/assets


Epoch 4/4

2023-12-26 18:42:10.848395: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 5898240000 exceeds 10% of free system memory.



Epoch 4: saving model to ./model/training_model/cp-0004
















































INFO:tensorflow:Assets written to: ./model/training_model/cp-0004/assets


INFO:tensorflow:Assets written to: ./model/training_model/cp-0004/assets




<keras.src.callbacks.History at 0x7f82b02b6370>

In [None]:
# preds = model.predict(x=[test_inputs,np.array(test_mask)])

In [None]:
# preds = preds.reshape(-1,)

In [47]:
model_dir = './model/training_model/cp-0002'

In [48]:
xlnetModel = tf.keras.models.load_model(model_dir)



In [49]:
preds = xlnetModel.predict(x=[test_inputs,np.array(test_mask)])



In [50]:
preds = preds.reshape(-1,)

In [51]:
from sklearn.metrics import roc_auc_score

In [52]:
auc_roc_score = roc_auc_score(test_labels,preds)
print("AUC-ROC Score:", auc_roc_score)

AUC-ROC Score: 0.9957854150799695
