In [137]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

data_sets = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    
    for filename in filenames:
        print(filename)
        data_sets.append(pd.read_csv(os.path.join(dirname, filename)))
    print(f"{len(data_sets)} data sets loaded in.")
_, _, _, generated_text, sample_sub, training_prompts, test, train = data_sets

0 data sets loaded in.
train_essays_RDizzl3_seven_v2.csv
train_essays_7_prompts_v2.csv
train_essays_7_prompts.csv
train_essays_RDizzl3_seven_v1.csv
4 data sets loaded in.
sample_submission.csv
train_prompts.csv
test_essays.csv
train_essays.csv
8 data sets loaded in.


In [138]:
print("Supplementary dataset features:")
print(generated_text.dtypes)
print("\nOriginal dataset features:")
print(train.dtypes)


Supplementary dataset features:
text     object
label     int64
dtype: object

Original dataset features:
id           object
prompt_id     int64
text         object
generated     int64
dtype: object


As we can see, we need to make it so that these share a universal format so that we can combine the data together.

In [139]:
# Rename the label column to match original dataset
generated_text.rename(columns={'label': 'generated'}, inplace=True)
generated_text

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0
1,Transportation is a large necessity in most co...,0
2,"""America's love affair with it's vehicles seem...",0
3,How often do you ride in a car? Do you drive a...,0
4,Cars are a wonderful thing. They are perhaps o...,0
...,...,...
15866,"While some find the ""Face on Mars"" imaged by t...",1
15867,Limiting car usage has many benefits for moder...,1
15868,The Rise of Driverless Cars\n\nThe development...,1
15869,The Open Sea Beckons\n\nThe Seagoing Cowboys p...,1


In [140]:
# Drop the two columns we don't need for training
train.drop('id', axis='columns', inplace=True)
train.drop('prompt_id', axis='columns', inplace=True)
train

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0
1,Transportation is a large necessity in most co...,0
2,"""America's love affair with it's vehicles seem...",0
3,How often do you ride in a car? Do you drive a...,0
4,Cars are a wonderful thing. They are perhaps o...,0
...,...,...
1373,There has been a fuss about the Elector Colleg...,0
1374,Limiting car usage has many advantages. Such a...,0
1375,There's a new trend that has been developing f...,0
1376,As we all know cars are a big part of our soci...,0


In [141]:
training_set = pd.concat([generated_text, train])

In [142]:
training_set.describe()

Unnamed: 0,generated
count,17249.0
mean,0.09415
std,0.292046
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [143]:
from sklearn.model_selection import train_test_split
import keras_nlp
import keras_core as keras

import tensorflow as tf
import tensorflow_hub as hub

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import BinaryAccuracy
from tensorflow.keras.callbacks import EarlyStopping

In [144]:
X = training_set['text']
y = training_set['generated']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [145]:
max_words = 10000
padding_length = 200

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Tokenize them into vectors
# Each unique word represented by an integer
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad our data to ensure consistent length
X_train_padded = pad_sequences(X_train_sequences, maxlen=padding_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=padding_length, padding='post', truncating='post')

In [146]:
print(X_test)

259      Dear State Senate, The Electoral College is in...
13137    Dear State Senator, We should forget about the...
9249     The article Making Mona Lisa smile by Nick D'A...
15182    Introduction:\n\nHave you ever wondered what i...
4551     Being a Seagoing Cow is great way to get to kn...
                               ...                        
776      When it comes to voting citizens from all over...
15208    "The Challenge of Exploring Venus" is an inter...
15681    Limiting car usage has several advantages that...
553      To whom this may concern, The "winner takes al...
9539     In 1976, NASA's Viking 1 spacecraft snapped a ...
Name: text, Length: 3450, dtype: object


### Visualizing our preprocessing

Now we can see the three steps our data goes through


In [147]:
X_train[4]

4    Cars are a wonderful thing. They are perhaps o...
4    Cars are a wonderful thing. They are perhaps o...
Name: text, dtype: object

In [148]:
essay_number = 5

print("Step 1: Raw Text\n"+ X_train[essay_number])
print("\n\nStep 2: Tokens\n" + str(X_train_sequences[essay_number]))
print("\n\nStep 3: Padded Set\n" + str(X_train_padded[essay_number]))

print("Generated: " + str(y_train[essay_number]))

5    Step 1: Raw Text\nThe electrol college system ...
5    Step 1: Raw Text\nThe electrol college system ...
Name: text, dtype: object


Step 2: Tokens
[58, 402, 861, 888, 3, 1423, 1189, 210, 21, 80, 24, 1345, 1309, 5, 403, 7, 930, 2885, 627, 883, 13, 53, 3354, 87, 999, 6, 70, 20, 109, 60, 44, 118, 3, 118, 1622, 54, 23, 354, 170, 207, 2, 798, 3, 5, 1767, 1116, 4, 419, 280, 4, 277, 35, 64, 255, 10, 8, 573, 961, 9, 2, 125, 6, 75, 984, 178, 22, 457, 8, 275, 11, 2, 1034, 6, 2, 20, 490, 35, 2, 837, 1895, 56, 13, 2, 232, 4, 127, 21, 80, 54, 164, 9, 883, 805, 2, 119, 87, 21, 24, 12, 119, 275, 11, 2, 1034, 257, 3, 2, 133, 172, 795, 78, 237, 3, 143, 10, 637, 61, 678, 1152, 125, 773, 172, 581, 5, 656, 78, 296, 14, 40, 3, 70, 1074, 2, 143, 1, 126, 7, 2, 1959, 180, 356, 54, 2407, 757, 3010, 60, 683, 2672, 411, 773, 2, 1, 2953, 34, 501, 132, 34, 1021, 9, 1110, 2, 143, 1220, 270, 11, 2, 2290, 1558, 525, 3, 2554, 2, 296, 11, 960, 854, 612, 16, 1609, 7, 150, 133, 388, 21, 216, 118, 8, 1131, 142, 5, 2

In [149]:
# Fetch the model from TF hub
# bert_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
# bert_layer = hub.KerasLayer(bert_url, trainable=True)

# model = tf.keras.Sequential([
#     tf.keras.layers.Input(shape=(100,), dtype=tf.int32),
#     bert_layer,
#     tf.keras.layers.Flatten(),
#     tf.keras.layers.Dense(256, activation='relu'),
#     tf.keras.layers.Dense(2, activation='softmax')
# ])

In [150]:
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=200),
    LSTM(64),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=Adam(learning_rate=1e-4), loss =tf.keras.losses.BinaryCrossentropy(), metrics=[BinaryAccuracy()])

In [151]:
y_train = y_train.astype(np.float32)
y_test  = y_test .astype(np.float32)

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
history = model.fit(X_train_padded, y_train, epochs=5, validation_split=0.1, callbacks=[early_stopping])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [152]:
from sklearn.metrics import confusion_matrix, classification_report

# Predictions on the test set
y_pred = model.predict(X_test_padded)

# Convert probabilities to class labels
y_pred_labels = np.round(y_pred)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_labels)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report
class_report = classification_report(y_test, y_pred_labels)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[3116    4]
 [  10  320]]

Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      3120
         1.0       0.99      0.97      0.98       330

    accuracy                           1.00      3450
   macro avg       0.99      0.98      0.99      3450
weighted avg       1.00      1.00      1.00      3450

