In [18]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

data_sets = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    
    for filename in filenames:
        print(filename)
        data_sets.append(pd.read_csv(os.path.join(dirname, filename)))
    print(f"{len(data_sets)} data sets loaded in.")
sample_sub, training_prompts, test, train, _, _, _, generated_text = data_sets

0 data sets loaded in.
sample_submission.csv
train_prompts.csv
test_essays.csv
train_essays.csv
4 data sets loaded in.
train_essays_RDizzl3_seven_v2.csv
train_essays_7_prompts_v2.csv
train_essays_7_prompts.csv
train_essays_RDizzl3_seven_v1.csv
8 data sets loaded in.


In [19]:
print("Supplementary dataset features:")
print(generated_text.dtypes)
print("\nOriginal dataset features:")
print(train.dtypes)


Supplementary dataset features:
text     object
label     int64
dtype: object

Original dataset features:
id           object
prompt_id     int64
text         object
generated     int64
dtype: object


As we can see, we need to make it so that these share a universal format so that we can combine the data together.

In [20]:
# Rename the label column to match original dataset
generated_text.rename(columns={'label': 'generated'}, inplace=True)
generated_text

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0
1,Transportation is a large necessity in most co...,0
2,"""America's love affair with it's vehicles seem...",0
3,How often do you ride in a car? Do you drive a...,0
4,Cars are a wonderful thing. They are perhaps o...,0
...,...,...
15866,"While some find the ""Face on Mars"" imaged by t...",1
15867,Limiting car usage has many benefits for moder...,1
15868,The Rise of Driverless Cars\n\nThe development...,1
15869,The Open Sea Beckons\n\nThe Seagoing Cowboys p...,1


In [21]:
# Drop the two columns we don't need for training
train.drop('id', axis='columns', inplace=True)
train.drop('prompt_id', axis='columns', inplace=True)
train

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0
1,Transportation is a large necessity in most co...,0
2,"""America's love affair with it's vehicles seem...",0
3,How often do you ride in a car? Do you drive a...,0
4,Cars are a wonderful thing. They are perhaps o...,0
...,...,...
1373,There has been a fuss about the Elector Colleg...,0
1374,Limiting car usage has many advantages. Such a...,0
1375,There's a new trend that has been developing f...,0
1376,As we all know cars are a big part of our soci...,0


In [22]:
training_set = pd.concat([generated_text, train])

In [23]:
training_set.describe()

Unnamed: 0,generated
count,17249.0
mean,0.09415
std,0.292046
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [24]:
from sklearn.model_selection import train_test_split
import keras_nlp
import keras_core as keras

import tensorflow as tf
import tensorflow_hub as hub

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
   

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import BinaryAccuracy
from tensorflow.keras.callbacks import EarlyStopping

In [40]:
isRealTest = True

if isRealTest:
    X_train = training_set['text']
    X_test = test['text']
    X_testIDs = test['id']
    y_train = training_set['text']
    
else:
    X = training_set['text']
    y = training_set['generated']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [44]:
max_words = 10000
padding_length = 200

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Tokenize them into vectors
# Each unique word represented by an integer
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad our data to ensure consistent length
X_train_padded = pad_sequences(X_train_sequences, maxlen=padding_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=padding_length, padding='post', truncating='post')

In [45]:
print(X_test)

0    Aaa bbb ccc.
1    Bbb ccc ddd.
2    CCC ddd eee.
Name: text, dtype: object


### Visualizing our preprocessing

Now we can see the three steps our data goes through


In [46]:
X_train[4]

4    Cars are a wonderful thing. They are perhaps o...
4    Cars are a wonderful thing. They are perhaps o...
Name: text, dtype: object

In [29]:
essay_number = 5

print("Step 1: Raw Text\n"+ X_train[essay_number])
print("\n\nStep 2: Tokens\n" + str(X_train_sequences[essay_number]))
print("\n\nStep 3: Padded Set\n" + str(X_train_padded[essay_number]))

print("Generated: " + str(y_train[essay_number]))

5    Step 1: Raw Text\nThe electrol college system ...
5    Step 1: Raw Text\nThe electrol college system ...
Name: text, dtype: object


Step 2: Tokens
[224, 615, 654, 485, 1399, 1, 76, 31, 203, 163, 549, 192, 163, 9, 25, 17, 532, 106, 53, 145, 15, 56, 752, 4, 117, 101, 140, 3, 17, 50, 25, 309, 219, 5, 244, 9, 935, 2004, 30, 101, 314, 599, 29, 942, 66, 389, 935, 152, 7247, 61, 16, 13, 283, 379, 50, 18, 96, 61, 3, 276, 3, 106, 3, 17, 4678, 4482, 7, 14, 963, 27, 344, 140, 3, 1095, 61, 66, 5565, 8, 273, 6, 645, 60, 61, 388, 24, 288, 135, 487, 389, 526, 115, 7, 2, 169, 3531, 52, 43, 4, 3635, 16, 8309, 15, 3, 63, 67, 640, 7, 2, 169, 24, 1, 67, 163, 33, 181, 2, 778, 3, 244, 71, 8020, 66, 6, 33, 1, 67, 66, 2, 169, 24, 240, 9, 67, 19, 167, 5, 112, 117, 29, 167, 5, 264, 29, 51, 346, 24, 12, 599, 39, 9, 82, 16, 186, 1, 63, 16, 118, 67, 1, 666, 61, 16, 186, 549, 16, 8309, 15, 150, 27, 344, 140, 3, 240, 16, 136, 509, 7, 2, 1858, 224, 1, 654, 485, 18, 1, 645, 60, 61, 388, 24, 288, 67, 1, 33, 1, 67

In [31]:
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=200),
    LSTM(64),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=Adam(learning_rate=1e-4), loss =tf.keras.losses.BinaryCrossentropy(), metrics=[BinaryAccuracy()])

In [32]:
y_train = y_train.astype(np.float32)
y_test  = y_test .astype(np.float32)

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
history = model.fit(X_train_padded, y_train, epochs=5, validation_split=0.1, callbacks=[early_stopping])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [48]:
if not isRealTest:
    # Evaluate the model on the test set
    loss, accuracy = model.evaluate(X_test_padded, y_test)
    print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

In [79]:
from sklearn.metrics import confusion_matrix, classification_report

# Predictions on the test set
y_pred = model.predict(X_test_padded)

# Convert probabilities to class labels
y_pred_labels = np.round(y_pred)



In [87]:
print(y_pred[:])

[[0.0031958]
 [0.0031958]
 [0.0031958]]


In [88]:



if not isRealTest:
    # Confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred_labels)
    print("Confusion Matrix:")
    print(conf_matrix)

    # Classification report
    class_report = classification_report(y_test, y_pred_labels)
    print("\nClassification Report:")
    print(class_report)

if isRealTest:
    submission = pd.DataFrame({'id': X_testIDs[:][1], 'generated': y_pred.flatten()})

In [89]:
submission

Unnamed: 0,id,generated
0,1111bbbb,0.003196
1,1111bbbb,0.003196
2,1111bbbb,0.003196
