In [2]:
from transformers import XLNetTokenizer , TFXLNetModel
from tqdm import tqdm
import pandas as pd 
import numpy as np
import random
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Attention, Input, Dense, LSTM, Embedding, Bidirectional, Dropout, TimeDistributed, concatenate, MaxPooling1D, Activation, Add, Flatten, Conv1D, BatchNormalization
from tensorflow.keras.optimizers import Adam
import os
import datetime

2023-12-25 05:41:56.583207: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
test = pd.read_csv('Datasets/test_essays.csv')
sub = pd.read_csv('Datasets/sample_submission.csv')
org_train = pd.read_csv('Datasets/train_essays.csv')
train = pd.read_csv("Datasets/train_v2_drcat_02.csv", sep=',')

In [4]:
train = train.drop_duplicates(subset=['text'])
train.reset_index(drop=True, inplace=True)

In [5]:
org_train.rename(columns = {'generated':'label'}, inplace=True)

In [6]:
train_data = pd.concat([train[['text','label']], org_train[['text','label']]])

In [7]:
sentences = train_data.text.values

In [8]:
sentences = [sentence + " [SEP] [CLS]" for sentence in sentences]
labels = train_data.label.values

In [9]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print ("Tokenize the first sentence:")
print (tokenized_texts[0])

Tokenize the first sentence:
['▁phones', '▁modern', '▁humans', '▁today', '▁are', '▁always', '▁on', '▁their', '▁phone', '.', '▁they', '▁are', '▁always', '▁on', '▁their', '▁phone', '▁more', '▁than', '▁5', '▁hours', '▁a', '▁day', '▁no', '▁stop', '▁', '.', 'all', '▁they', '▁do', '▁is', '▁text', '▁back', '▁and', '▁forward', '▁and', '▁just', '▁have', '▁group', '▁chat', 's', '▁on', '▁social', '▁media', '.', '▁they', '▁even', '▁do', '▁it', '▁while', '▁driving', '.', '▁they', '▁are', '▁some', '▁really', '▁bad', '▁consequences', '▁when', '▁stuff', '▁happens', '▁when', '▁it', '▁comes', '▁to', '▁a', '▁phone', '.', '▁some', '▁certain', '▁areas', '▁in', '▁the', '▁united', '▁states', '▁ban', '▁phones', '▁from', '▁class', '▁rooms', '▁just', '▁because', '▁of', '▁it', '.', '▁when', '▁people', '▁have', '▁phones', ',', '▁they', '▁know', '▁about', '▁certain', '▁apps', '▁that', '▁they', '▁have', '▁', '.', 'app', 's', '▁like', '▁face', 'book', '▁twitter', '▁in', 'sta', 'gram', '▁and', '▁snap', 'cha', 't', '.

In [10]:
avg_word_approx = 0
for text in tokenized_texts:
    avg_word_approx+= len(text)
print(avg_word_approx/len(tokenized_texts))

478.03323530683735


In [11]:
MAX_LEN = 256

In [12]:
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

In [13]:
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [14]:
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

In [15]:
# Split into training and temporary set
train_inputs, temp_inputs, train_labels, temp_labels = train_test_split(input_ids, labels, 
                                                                         random_state=42, test_size=0.2)

# Split the temporary set into test and dev sets
validation_inputs, test_inputs, validation_labels, test_labels = train_test_split(temp_inputs, temp_labels,
                                                                                  random_state=42, test_size=0.5)

del temp_inputs
del temp_labels

In [16]:
# Split into training and temporary set
train_mask, temp_mask_1 = train_test_split(attention_masks,random_state=42, test_size=0.2)

# Split the temporary set into test and dev sets
validation_mask, test_mask = train_test_split(temp_mask_1, random_state=42, test_size=0.5)

del temp_mask_1
# del temp_mask_2

In [27]:
def create_model():
    input_ids = Input(shape=(256,), dtype='int64')
    attention_mask = Input(shape=(256,), dtype='int64')

    print('Loading XLNetModel')
    xlnetModel = TFXLNetModel.from_pretrained('xlnet-base-cased')
    conv1D_shared = Conv1D(64, kernel_size=(7), strides=(2))
    batchN = BatchNormalization()
    activa = Activation('relu')
#     attent = Attention(use_scale=True)
    
    xlnetout = xlnetModel.transformer({"input_ids": input_ids, "attention_mask": attention_mask})
    x = conv1D_shared(xlnetout.last_hidden_state)
    x = batchN(x)
    x = activa(x)
#     x = attent([x, x])
    x = MaxPooling1D((3), strides=(2))(x)
    # x = Dense(1, activation="relu")(x)
    x = Flatten()(x)
    x = Dense(1, activation="sigmoid")(x)
    
    model = Model(inputs=[input_ids, attention_mask,], outputs=x)
    model.summary()
    adam = Adam(learning_rate=0.00001)
    model.compile(optimizer=adam, loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics=['acc', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

    return model

In [28]:
    # Include the epoch in the file name (uses `str.format`)
#     checkpoint_path = "./tripletloss/training_model/cp-{epoch:04d}"
    checkpoint_path = "./model/training_model/cp-{epoch:04d}"
    checkpoint_dir = os.path.dirname(checkpoint_path)

    # Create a callback that saves the model's weights every epoch
    cp_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path,
        verbose=1,
        save_freq="epoch")

#     log_dir = "./tripletloss/logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = "./model/logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, profile_batch=0)
    # %load_ext tensorboard
    # %tensorboard --logdir logs/fit


In [29]:
model = create_model()

Loading XLNetModel


Some layers from the model checkpoint at xlnet-base-cased were not used when initializing TFXLNetModel: ['lm_loss']
- This IS expected if you are initializing TFXLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFXLNetModel were initialized from the model checkpoint at xlnet-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLNetModel for predictions without further training.


Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_6 (InputLayer)        [(None, 256)]                0         []                            
                                                                                                  
 input_5 (InputLayer)        [(None, 256)]                0         []                            
                                                                                                  
 transformer (TFXLNetMainLa  TFXLNetModelOutput(last_hi   1167183   ['input_6[0][0]',             
 yer)                        dden_state=(None, 256, 768   36         'input_5[0][0]']             
                             ),                                                                   
                              mems=((256, None, 768),                                       

In [30]:
batch_size = 16
model.fit(x=[train_inputs, np.array(train_mask)], y=train_labels,
          validation_data=([validation_inputs,np.array(validation_mask)], validation_labels),
          batch_size=batch_size,epochs=2,verbose=1,
          callbacks=[tensorboard_callback, cp_callback])

Epoch 1/2

2023-12-24 22:30:22.855447: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 5898240000 exceeds 10% of free system memory.



Epoch 1: saving model to ./model/training_model/cp-0001
INFO:tensorflow:Assets written to: ./model/training_model/cp-0001/assets


INFO:tensorflow:Assets written to: ./model/training_model/cp-0001/assets


Epoch 2/2

2023-12-24 23:23:13.780470: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 5898240000 exceeds 10% of free system memory.



Epoch 2: saving model to ./model/training_model/cp-0002
















































INFO:tensorflow:Assets written to: ./model/training_model/cp-0002/assets


INFO:tensorflow:Assets written to: ./model/training_model/cp-0002/assets




<keras.src.callbacks.History at 0x7f9c30103940>

In [31]:
model.save("xlnet-cnn-try-1")

















































INFO:tensorflow:Assets written to: xlnet-cnn-try-1/assets


INFO:tensorflow:Assets written to: xlnet-cnn-try-1/assets


In [37]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_6 (InputLayer)        [(None, 256)]                0         []                            
                                                                                                  
 input_5 (InputLayer)        [(None, 256)]                0         []                            
                                                                                                  
 transformer (TFXLNetMainLa  TFXLNetModelOutput(last_hi   1167183   ['input_6[0][0]',             
 yer)                        dden_state=(None, 256, 768   36         'input_5[0][0]']             
                             ),                                                                   
                              mems=((256, None, 768),                                       

In [36]:
preds = model.predict(x=[test_inputs,np.array(test_mask)])



In [41]:
preds = preds.reshape(-1,)

In [17]:
model_dir = './model/training_model/cp-0002'

In [18]:
xlnetModel = tf.keras.models.load_model(model_dir)

2023-12-25 05:45:36.228232: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-12-25 05:45:36.231882: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-12-25 05:45:36.233095: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [22]:
preds = xlnetModel.predict(x=[test_inputs,np.array(test_mask)])

  1/145 [..............................] - ETA: 7:11

2023-12-25 05:46:47.097409: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8700




In [23]:
preds = preds.reshape(-1,)

In [25]:
from sklearn.metrics import roc_auc_score

In [26]:
auc_roc_score = roc_auc_score(test_labels,preds)
print("AUC-ROC Score:", auc_roc_score)

AUC-ROC Score: 0.999793645465137
