In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

# Walk the Kaggle input directory and load every file found there as a CSV.
# Each loaded frame is collected in `data_sets`; the three frames the rest of
# the notebook works with are additionally bound to `test`, `train` and
# `generated_text`.
data_sets = []
for dirname, _, filenames in os.walk('/kaggle/input'):

    for filename in filenames:
        print(filename)
        # Parse each file exactly once. Previously the named files were read
        # from disk twice: once for the variable and once for `data_sets`.
        frame = pd.read_csv(os.path.join(dirname, filename))
        data_sets.append(frame)

        # The named frames get their own copy so that changes made to them
        # later cannot affect the entries stored in `data_sets`.
        if filename == "test_essays.csv":
            test = frame.copy()
        elif filename == "train_essays.csv":
            train = frame.copy()
        elif filename == "train_essays_RDizzl3_seven_v2.csv":
            generated_text = frame.copy()
    # Running total, reported once per directory visited by os.walk.
    print(f"{len(data_sets)} data sets loaded in.")

0 data sets loaded in.
train_essays_RDizzl3_seven_v2.csv
train_essays_7_prompts_v2.csv
train_essays_7_prompts.csv
train_essays_RDizzl3_seven_v1.csv
4 data sets loaded in.
sample_submission.csv
train_prompts.csv
test_essays.csv
train_essays.csv
8 data sets loaded in.


In [2]:
# Print the column dtypes of both datasets, one after the other, so their
# schemas can be compared side by side.
for heading, frame in (
    ("Supplementary dataset features:", generated_text),
    ("\nOriginal dataset features:", train),
):
    print(heading)
    print(frame.dtypes)


Supplementary dataset features:
text     object
label     int64
dtype: object

Original dataset features:
id           object
prompt_id     int64
text         object
generated     int64
dtype: object


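As we can see, the two datasets do not share the same columns: the supplementary data calls its target `label`, while the original data calls it `generated` and also carries `id` and `prompt_id`. We need to bring both to a common format so that we can combine them.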

In [3]:
# Display the frame loaded from test_essays.csv.
test

Unnamed: 0,id,prompt_id,text
0,0000aaaa,2,Aaa bbb ccc.
1,1111bbbb,3,Bbb ccc ddd.
2,2222cccc,4,CCC ddd eee.


In [4]:
# Align the supplementary data with the original dataset by naming its
# target column 'generated' instead of 'label', then display the result.
generated_text = generated_text.rename(columns={'label': 'generated'})
generated_text

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0
1,Transportation is a large necessity in most co...,0
2,"""America's love affair with it's vehicles seem...",0
3,How often do you ride in a car? Do you drive a...,0
4,Cars are a wonderful thing. They are perhaps o...,0
...,...,...
17246,"Dear Senator,\n\nI am writing to you today to ...",1
17247,"Dear Senator,\n\nI am writing to you today to ...",1
17248,"Dear Senator,\n\nI am writing to you today to ...",1
17249,"Dear Senator,\n\nI am writing to you today to ...",1


In [5]:
# Keep only what training needs by removing the 'id' and 'prompt_id' columns.
# Reassigning the result (instead of dropping in place) together with
# errors='ignore' lets this cell be re-run without raising a KeyError once
# the columns are already gone.
train = train.drop(columns=['id', 'prompt_id'], errors='ignore')
train

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0
1,Transportation is a large necessity in most co...,0
2,"""America's love affair with it's vehicles seem...",0
3,How often do you ride in a car? Do you drive a...,0
4,Cars are a wonderful thing. They are perhaps o...,0
...,...,...
1373,There has been a fuss about the Elector Colleg...,0
1374,Limiting car usage has many advantages. Such a...,0
1375,There's a new trend that has been developing f...,0
1376,As we all know cars are a big part of our soci...,0


In [6]:
# Stack the supplementary essays on top of the original training essays.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of both inputs are kept and therefore repeat in the result.
training_set = pd.concat([generated_text, train], ignore_index=True)

In [7]:
# Count how many essays carry each value of the 'generated' column.
training_set['generated'].value_counts()

generated
0    15622
1     3007
Name: count, dtype: int64

## Undersample to get an even distribution

In [8]:
# Balance the two classes by randomly undersampling whichever one is larger.
pos = training_set[training_set['generated']==1]
neg = training_set[training_set['generated']==0]

# Only the larger class is sampled down to the size of the smaller one.
# Sampling a fixed class would fail if it happened to be the smaller one,
# because more rows would be requested than it contains.
if len(neg) >= len(pos):
    neg = neg.sample(n=len(pos), random_state = 21)
else:
    pos = pos.sample(n=len(neg), random_state = 21)

# The result is ordered: all generated==1 rows first, then all generated==0.
training_set = pd.concat([pos, neg])

In [9]:
# Re-count the 'generated' values to confirm the classes are now balanced.
training_set['generated'].value_counts()

generated
1    3007
0    3007
Name: count, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split
import keras_nlp
import keras_core as keras

import tensorflow as tf
import tensorflow_hub as hub

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
   

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import BinaryAccuracy
from tensorflow.keras.callbacks import EarlyStopping

Using TensorFlow backend


In [11]:
# Toggle between the two run modes:
#   True  -> train on all of training_set and predict the loaded test essays
#   False -> hold out 20% of training_set to evaluate the model locally
isRealTest = True

if isRealTest:
    X_train = training_set['text'].values
    y_train = training_set['generated'].values

    X_test = test['text'].values
    X_testIDs = test['id']

else:
    # Use NumPy arrays here as well, so that positional indexing such as
    # X_train[60] behaves the same in both modes.
    X = training_set['text'].values
    y = training_set['generated'].values

    # A fixed seed makes the hold-out reproducible; stratifying keeps the
    # class ratio identical in the train and test parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = 0.2, random_state = 21, stratify = y
    )

In [12]:
# Inspect the raw text of the training essay at index 60.
X_train[60]

"Advantages of Limiting Car Usage\n\nThere are numerous advantages to limiting car usage, as highlighted in the passages provided. One advantage is the reduction in greenhouse gas emissions, which contribute to climate change. The passage states that passenger cars are responsible for 12 percent of greenhouse gas emissions in Europe and up to 50 percent in some car-intensive areas in the United States. By limiting car usage, especially in suburban areas, where automobile dependency is high, we can significantly reduce these emissions.\n\nAnother advantage is the promotion of alternative modes of transportation, such as walking, cycling, and public transportation. The passages mention examples of car-free communities, like Vauban in Germany and car-free days in Bogota, Colombia. In these communities, residents have the option to walk or cycle to their destinations, reducing traffic congestion and improving air quality. Additionally, public transportation becomes more accessible and effi

In [13]:
# Vocabulary size given to the tokenizer and the fixed length of every
# encoded essay.
max_words = 10000
padding_length = 200

# Build the word -> integer vocabulary from the training essays only.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Encode every essay as a list of integer word ids.
X_train_sequences, X_test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad or truncate at the end so every sequence has exactly padding_length ids.
pad_options = dict(maxlen=padding_length, padding='post', truncating='post')
X_train_padded = pad_sequences(X_train_sequences, **pad_options)
X_test_padded = pad_sequences(X_test_sequences, **pad_options)

In [14]:
# Print the raw test texts.
print(X_test)

['Aaa bbb ccc.' 'Bbb ccc ddd.' 'CCC ddd eee.']


### Visualizing our preprocessing

Now we can see the three steps our data goes through: the raw text, the integer tokens, and the padded sequence.


In [15]:
# Pick one training essay and show it at each stage of preprocessing.
essay_number = 13

raw_text = X_train[essay_number]
token_ids = X_train_sequences[essay_number]
padded_ids = X_train_padded[essay_number]
is_generated = bool(y_train[essay_number])

print("Step 1: Raw Text\n" + raw_text)
print("\n\nStep 2: Tokens\n" + str(token_ids))
print("\n\nStep 3: Padded Set\n" + str(padded_ids))

# Label of the same essay, shown as a boolean.
print("Generated? " + str(is_generated))

Step 1: Raw Text
It is becoming increasingly evident that there are numerous advantages to limiting car usage in modern society. The passage set provides several examples of communities and cities that are taking steps to reduce dependence on cars, and the positive outcomes they have experienced. Limiting car usage not only proves to be beneficial for the environment but also for the overall well-being of individuals and communities.

One key advantage of limiting car usage is the reduction of greenhouse gas emissions. According to the passages, passenger cars are responsible for a significant percentage of greenhouse gas emissions in both Europe and the United States. By reducing the number of cars on the road, we can effectively decrease these emissions and work towards combating climate change. The passage on Paris's driving ban due to smog highlights the positive impact of such measures. The ban resulted in a significant decrease in congestion and smog levels, improving the air qua

In [16]:
# Binary classifier: embedding -> LSTM -> single sigmoid output.
# The embedding's vocabulary size and sequence length are taken from
# max_words and padding_length, the same constants used by the tokenizer and
# the padding step, so the model cannot drift out of sync with preprocessing.
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=padding_length),
    LSTM(64),
    Dense(1, activation='sigmoid')
])

# Use the BinaryCrossentropy class that is already imported instead of
# reaching for it through tf.keras.losses again.
model.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss=BinaryCrossentropy(),
    metrics=[BinaryAccuracy()],
)

In [17]:
# Labels as float32 NumPy arrays.
y_train = np.asarray(y_train, dtype=np.float32)
if not isRealTest:
    y_test = np.asarray(y_test, dtype=np.float32)

# Stop once val_loss has not improved for 3 epochs and restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Hold out 10% for validation with an explicit, stratified and seeded split.
# validation_split=0.1 would take the LAST 10% of the rows before shuffling;
# in real-test mode the rows are ordered (all generated==1 first, then all
# generated==0), so the validation set would contain a single class.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_padded, y_train, test_size=0.1, stratify=y_train, random_state=21
)

# Train the model
history = model.fit(
    X_fit,
    y_fit,
    epochs=5,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [18]:
if not isRealTest:
    # Evaluate the model on the held-out test set. The second return value
    # must be bound to `accuracy`, the name used in the message below; a
    # misspelled name here previously caused a NameError.
    loss, accuracy = model.evaluate(X_test_padded, y_test)
    print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

In [19]:
from sklearn.metrics import confusion_matrix, classification_report

# Run the model on the padded test essays.
y_pred = model.predict(X_test_padded)

# Round each predicted value to obtain a class label.
y_pred_labels = np.around(y_pred, decimals=0)



In [20]:
if isRealTest:
    # Real run: pair every test id with its predicted value.
    test_ids = list(X_testIDs)
    probabilities = y_pred.flatten()
    submission = pd.DataFrame({'id': test_ids, 'generated': probabilities})
else:
    # Validation run: compare the rounded predictions with the held-out labels.
    conf_matrix = confusion_matrix(y_test, y_pred_labels)
    print("Confusion Matrix:", conf_matrix, sep="\n")

    class_report = classification_report(y_test, y_pred_labels)
    print("\nClassification Report:", class_report, sep="\n")

In [21]:
# Write the predictions to the Kaggle working directory, leaving out the
# DataFrame index. Note: `submission` is only created when isRealTest is True.
submission.to_csv('/kaggle/working/submission.csv', index=False)

In [22]:
# Display the submission frame.
submission

Unnamed: 0,id,generated
0,0000aaaa,0.007432
1,1111bbbb,0.007432
2,2222cccc,0.007432
