In [1]:
import os
# Set before the TensorFlow import in the next cell so the variable is already
# in the environment when TensorFlow loads. '3' is presumably meant to quiet
# its native log output — NOTE(review): E/W lines still appear in the outputs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [2]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
import time

E0000 00:00:1737769611.748407    5205 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737769611.768906    5205 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Load the saved character -> vector table. np.load on an .npz archive returns
# a lazy NpzFile that keeps the file open until it is closed, so every entry is
# read inside a `with` block and only plain in-memory arrays are kept afterwards.
with np.load('encoder.npz') as loaded_encoder:
    # One entry per archive key; encode()/decode() look characters up by key.
    one_hot_encoder = {key: loaded_encoder[key] for key in loaded_encoder}

# Human-written texts

In [4]:
# Number of texts sampled from each human-written corpus; also the split point
# that decides how many prompts go to the first generator.
NUM_PER_CLASS: int = 3000

In [5]:
# First human-written corpus: three ';'-separated CSV files.
df_train = pd.read_csv("dataset/train.csv", sep=";")
df_test = pd.read_csv("dataset/test.csv", sep=";")
df_eval = pd.read_csv("dataset/evaluation.csv", sep=";")
# Stack the three frames, drop rows with any missing value, then draw a
# fixed-seed sample of the "text" column, renumbered from 0.
texts1 = (
    pd.concat([df_train, df_eval, df_test])
    .dropna()["text"]
    .sample(n=NUM_PER_CLASS, random_state=42, ignore_index=True)
)

In [6]:
# Second human-written corpus: two CSV files read with the default separator.
df_true = pd.read_csv("dataset/True.csv")
df_fake = pd.read_csv("dataset/Fake.csv")
# Stack both frames, drop rows with any missing value, then draw a fixed-seed
# sample of the "text" column, renumbered from 0.
texts2 = (
    pd.concat([df_true, df_fake])
    .dropna()["text"]
    .sample(n=NUM_PER_CLASS, random_state=42, ignore_index=True)
)

In [7]:
# Put both samples into a single-column frame and give every row the
# label 0.0 (stored as float32).
human_texts = (
    pd.concat([texts1, texts2])
    .to_frame(name="text")
    .assign(label=np.float32(0.0))
)

In [8]:
# Both samples were numbered from 0, so the stacked frame repeats row labels;
# replace them with a fresh 0..n-1 index and discard the old one.
human_texts = human_texts.reset_index(drop=True)

In [9]:
# Show the assembled frame (the notebook renders a truncated preview).
human_texts

Unnamed: 0,text,label
0,NEW YORK (Reuters) - Donald Trump’s support ha...,0.0
1,JERUSALEM (Reuters) - The Israeli military sai...,0.0
2,Tune in to the Alternate Current Radio Network...,0.0
3,We know that the BP Deepwater Horizon oil spil...,0.0
4,The trend of young Muslim men targeting and at...,0.0
...,...,...
5995,"HABUR BORDER CROSSING, Turkey (Reuters) - Turk...",0.0
5996,WASHINGTON (Reuters) - Republican Donald Trump...,0.0
5997,"21st Century Wire says Earlier this week, the ...",0.0
5998,The Walt Disney Company the same group that ...,0.0


In [10]:
# Characters kept by filter_chars(): printable ASCII except < > \ ^ ` | ~,
# plus the two curly double quotes.
filter_vocab = set(
    " !\"#$%&'()*+,-./"
    "0123456789"
    ":;=?@"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "[]_"
    "abcdefghijklmnopqrstuvwxyz"
    "{}“”"
)
# The padding token is stored as one multi-character entry, so it can never
# match a single character during filtering.
filter_vocab.add("<pad>")

In [11]:
def filter_chars(text: str) -> str:
    """Return ``text`` with every character missing from ``filter_vocab`` removed.

    The order of the remaining characters is unchanged.
    """
    return "".join(ch for ch in text if ch in filter_vocab)

In [12]:
# Character budget: shrink_text() cuts human texts against it and
# encode_generate_sequence() produces this many characters per prompt.
MAX_TEXT_LEN: int = 150

In [13]:
def shrink_text(text: str, max_len: "int | None" = None) -> str:
    """Cut ``text`` at a word boundary so that it fits the length budget.

    Words (split on single spaces) are added one at a time. As soon as the
    next word would push the running text, including its leading space, past
    ``max_len - 1`` characters, the text collected so far is returned with a
    trailing ``"."``.

    A text that fits entirely is returned stripped and without the added
    period. Previously this case fell off the end of the function and
    returned ``None``.

    Args:
        text: Text to shorten.
        max_len: Length budget in characters. Defaults to the module-level
            ``MAX_TEXT_LEN`` when omitted.

    Returns:
        The shortened (or unchanged) text. If the very first word alone
        exceeds the budget, the result is just ``"."``.
    """
    limit = MAX_TEXT_LEN if max_len is None else max_len
    kept = ""
    for word in text.split(" "):
        candidate = kept + " " + word
        if len(candidate) > limit - 1:
            # Budget exceeded: stop before this word and mark the cut.
            return kept.strip() + "."
        kept = candidate
    # Every word fit, so there is nothing to cut.
    return kept.strip()

In [14]:
# Keep only vocabulary characters, then cut each text against the length budget.
human_texts["text"] = human_texts["text"].apply(filter_chars).apply(shrink_text)

# AI-generated texts

## Prepare first words

In [15]:
def encode(src: str) -> np.ndarray:
    """Look up each character of ``src`` in ``one_hot_encoder`` and stack the results.

    Raises:
        KeyError: If a character has no entry in ``one_hot_encoder``.
    """
    return np.array([one_hot_encoder[ch] for ch in src])

In [16]:
def decode(src: np.ndarray) -> str:
    """Turn a sequence of encoded rows back into text.

    Each row is compared against the entries of ``one_hot_encoder`` in
    insertion order, and the key of the first exact match is emitted. Rows
    that match no entry contribute nothing to the result.
    """
    symbols = []
    for row in src:
        hit = next(
            (sym for sym, vec in one_hot_encoder.items() if (row == vec).all()),
            None,
        )
        if hit is not None:
            symbols.append(hit)
    return "".join(symbols)

In [17]:
def encode_generate_sequence(model, start_sequence):
    """Extend ``start_sequence`` by ``MAX_TEXT_LEN`` characters picked by ``model``.

    The prompt is encoded once and used as a fixed-length sliding window. At
    every step the model is called on the window, the highest-scoring index of
    the last position's output is turned into a one-hot row, the window drops
    its oldest row and gains that new row, and the decoded character is
    appended to the text.

    Args:
        model: Object exposing ``predict(batch, verbose=0)``; the result is
            indexed as ``[0, -1, :]``.
        start_sequence: Prompt text. Every character must be encodable.

    Returns:
        ``start_sequence`` followed by the generated characters.
    """
    window = np.copy(encode(start_sequence))
    text = start_sequence
    for _step in range(MAX_TEXT_LEN):
        scores = model.predict(window[np.newaxis, ...], verbose=0)
        best_idx = np.argmax(scores[0, -1, :])
        one_hot = np.zeros_like(window[0])
        one_hot[best_idx] = 1
        # Slide the window: drop the oldest row, append the new prediction.
        window = np.concatenate((window[1:], one_hot[np.newaxis, ...]), axis=0)
        text = text + decode(one_hot[np.newaxis, ...])
    return text

In [18]:
def get_first_word(text: str) -> str:
    """Return the first whitespace-delimited word of ``text`` followed by one space.

    Raises:
        IndexError: If ``text`` is empty or contains only whitespace.
    """
    # Only the first token is needed, so stop splitting after it.
    head = text.split(maxsplit=1)[0]
    return f"{head} "

In [19]:
# Prompt source 1: the non-blank texts of the first corpus, sampled with a
# fixed seed and turned into a plain list.
texts1_list = (
    pd.concat([df_train, df_eval, df_test])
    .dropna()["text"]
    .loc[lambda col: col.str.strip() != ""]
    .sample(NUM_PER_CLASS, random_state=42)
    .tolist()
)

In [20]:
# Prompt source 2: the non-blank texts of the second corpus, sampled with a
# fixed seed and turned into a plain list.
texts2_list = (
    pd.concat([df_true, df_fake])
    .dropna()["text"]
    .loc[lambda col: col.str.strip() != ""]
    .sample(NUM_PER_CLASS, random_state=42)
    .tolist()
)

In [21]:
# Join both prompt sources into one 1-D array and show its size.
texts_list = np.concatenate((texts1_list, texts2_list), axis=0)
texts_list.shape

(6000,)

In [22]:
# Apply filter_chars to every text so that prompts only contain vocabulary characters.
texts_list = np.vectorize(filter_chars)(texts_list)

In [23]:
# Take the first word (plus a trailing space) of every text as a generation prompt.
# NOTE(review): get_first_word raises IndexError for a text that filtering left
# empty or whitespace-only; the blank check above runs before filtering.
first_words = np.vectorize(get_first_word)(texts_list)

In [24]:
# Shuffle the prompts in place so each generator receives a mix of both
# corpora. A seeded generator is used (same seed as the sampling above) so the
# prompt split, and therefore the generated dataset, is reproducible on re-run.
np.random.default_rng(42).shuffle(first_words)

## First generator

In [25]:
# Load the first text generator from its saved .keras file.
model_first = load_model("generator_dropout_complex.keras")

W0000 00:00:1737769617.250864    5205 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [26]:
# Print the layer overview of the first generator.
model_first.summary()

In [27]:
# Generate one text per prompt with the first model, for the first
# NUM_PER_CLASS prompts, and record wall-clock start/end times.
s1 = time.time()
generated_first = np.vectorize(
    lambda prompt: encode_generate_sequence(model_first, prompt)
)(first_words[:NUM_PER_CLASS])
e1 = time.time()

In [28]:
# Report how long the first generation run took (message typo "secods" fixed).
print(f"Took: {e1 - s1} seconds")

Took: 17104.259920597076 secods


In [29]:
# Persist the first model's texts; the recorded run above took about 17,000 s.
np.save("dataset/generated_first.npy", generated_first)

## Second generator

In [30]:
# Load the second text generator from its saved .keras file and print its layer overview.
model_second = load_model("generator_dropout_complex_shrink_seq_other.keras")
model_second.summary()

In [31]:
# Generate one text per prompt with the second model, for the prompts after
# the first NUM_PER_CLASS, and record wall-clock start/end times.
s2 = time.time()
generated_second = np.vectorize(
    lambda prompt: encode_generate_sequence(model_second, prompt)
)(first_words[NUM_PER_CLASS:])
e2 = time.time()

In [32]:
# Report how long the second generation run took (message typo "secods" fixed).
print(f"Took: {e2 - s2} seconds")

Took: 17461.15230846405 secods


In [33]:
# Persist the second model's texts; the recorded run above took about 17,500 s.
np.save("dataset/generated_second.npy", generated_second)

## Final dataset processing 

In [36]:
# One "text" column holding the first model's outputs followed by the second's.
df_gen = pd.DataFrame(
    data=np.concatenate((generated_first, generated_second)),
    columns=["text"],
)

In [39]:
# Give every generated row the label 1.0 (stored as float32).
df_gen = df_gen.assign(label=np.float32(1.0))

In [40]:
# Preview the first generated rows.
df_gen.head()

Unnamed: 0,text,label
0,CARACAS STARE SHOULD TRUMP SUPPORTER SHOCKING ...,1.0
1,The U.S. Senato testitil anti-to realinto test...,1.0
2,Just Senate three state of the U.S. state of t...,1.0
3,We ex-prose the mary top the mary top the mary...,1.0
4,WARSAW AND CHARLESS THE SEAN Senate candidate ...,1.0


In [41]:
# Stack human (label 0.0) and generated (label 1.0) rows. Each input keeps its
# own row labels, so labels repeat in the result.
df_dataset = pd.concat([human_texts, df_gen], axis=0)

In [48]:
# Remove a leftover "index" column if an earlier reset_index() created one, and
# renumber rows 0..n-1. No visible cell creates that column, so the original
# unconditional drop raised KeyError on a fresh Restart & Run All;
# errors="ignore" makes the cell work either way, and reset_index(drop=True)
# removes the duplicated row labels left by the concat.
df_dataset = (
    df_dataset
    .drop(columns=["index"], errors="ignore")
    .reset_index(drop=True)
)

In [50]:
# Write the combined dataset as a ';'-separated CSV. index=False is not passed,
# so the row index is written as an extra first column.
df_dataset.to_csv("dataset/human-ai-gen-news.csv", sep=";")