In [1]:
import pandas as pd
import json
from glob import glob
from sklearn.model_selection import train_test_split

In [2]:
def jsonl_to_dataframe(file_path):
    return pd.read_json(file_path, lines=True)

def merge_jsonl_to_dataframe(file_pattern):
    """Fonction pour fusionner plusieurs fichiers JSONL en un seul DataFrame pandas"""
    # Récupérer la liste des fichiers correspondant au modèle fourni
    files = glob(file_pattern)
    
    
    # Lire chaque fichier JSONL et stocker les DataFrames individuels dans une liste
    dfs = [jsonl_to_dataframe(file) for file in files]
    # Concaténer les DataFrames
    merged_df = pd.concat(dfs, ignore_index=True)
    
    return merged_df

# Modèle de chemin vers les fichiers JSONL à fusionner
file_pattern = "data/*.jsonl"

# Appel de la fonction pour fusionner les fichiers JSONL en un DataFrame
df = merge_jsonl_to_dataframe(file_pattern)

# Afficher le DataFrame fusionné


In [16]:
# Creating the answers_df
human_df = pd.DataFrame({'answers': df['human_answers'], 'generated': 1})
gpt_df = pd.DataFrame({'answers': df['chatgpt_answers'], 'generated': 0})
answers_df = pd.concat([human_df, gpt_df], axis=0, ignore_index=True)

# Checking the len
answers_df['len'] = answers_df['answers'].apply(len)
answers_df = answers_df.explode('answers', ignore_index=True)
answers_df = answers_df.dropna(subset=['answers'])

answers_df

Unnamed: 0,answers,generated,len
0,There is most likely an error in the WSJ's dat...,1,1
1,I know this question has a lot of answers alre...,1,1
2,If you pay it off before the cycle closes it w...,1,1
3,It is the first time I encounter redemption pr...,1,1
4,Why Investors Buy Platinum is an old (1995) ar...,1,1
...,...,...,...
171803,It's not uncommon for blood pressure to fluctu...,0,1
171804,There are several possible causes of a painles...,0,1
171805,It is not appropriate for me to recommend a sp...,0,1
171806,It is not uncommon for people with rheumatoid ...,0,1


In [19]:
def Clean(text):
    text = tf_text.normalize_utf8(text, 'NFKD')
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, '[^ a-z.?!,¿]', '')
    text = tf.strings.regex_replace(text, '[.?!,¿]', r' \0 ')
    text = tf.strings.strip(text)
    text = tf.strings.regex_replace(text, '\.\.\.', ' ')
    text = tf.strings.join(['',text, ''], separator=' ')
    return text
def clean_text(text):
    # Remove Twitter handles starting with '@'
    text = re.sub(r'@\w+', '', text)
    # Remove non-alphanumeric characters and extra whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert multiple whitespace characters to a single space
    text = re.sub(r'\s+', ' ', text)
    # Convert the text to lowercase
    text = text.lower()
    return text

In [22]:
import tensorflow as tf
import re
import tensorflow_text as tf_text

max_features = 75000
embedding_dim = 64
sequence_length = 512*2
vectorize_layer = tf.keras.layers.TextVectorization(
    standardize=Clean ,
    max_tokens=max_features,
    ngrams = (3,5),
    output_mode="int",
    output_sequence_length=sequence_length,
    pad_to_max_tokens=True
)
vectorize_layer.adapt(answers_df['answers'])
Text = vectorize_layer(answers_df['answers']).numpy()
Text

array([[    1,  8352,     1, ...,     0,     0,     0],
       [12078,     1, 70851, ...,     0,     0,     0],
       [ 7783, 48099,  7530, ...,     0,     0,     0],
       ...,
       [   29,  5367, 10065, ...,     0,     0,     0],
       [   29,  1738,  1084, ...,     0,     0,     0],
       [   29,  1738, 33504, ...,     0,     0,     0]])

In [26]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=44)
Text,labels= sm.fit_resample(Text,answers_df['generated'])

In [29]:
#X = answers_df['answers']
#y = answers_df['is_human']

X_train, X_test, y_train, y_test = train_test_split(Text, labels, test_size=0.2, random_state=42) 
print('X_train shape is ' , X_train.shape)
print('X_test shape is ' , X_test.shape)
print('y_train shape is ' , y_train.shape)
print('y_test shape is ' , y_test.shape)

X_train shape is  (187347, 1024)
X_test shape is  (46837, 1024)
y_train shape is  (187347,)
y_test shape is  (46837,)


In [35]:
from tensorflow.keras import Model, Input, layers
from tensorflow.keras.layers import TextVectorization, Embedding, Bidirectional, LSTM, Conv1D, GlobalMaxPooling1D, Dense, Dropout

class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(ff_dim, activation="relu"), tf.keras.layers.Dense(embed_dim),]
        )
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
inputs = Input(shape=(sequence_length,), dtype="int64")
x = Embedding(max_features, embedding_dim)(inputs)
x = Bidirectional(LSTM(32, return_sequences=True))(x)
transformer_block = TransformerBlock(embedding_dim, 2, 32)
x = transformer_block(x, training=True)
x = Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation="relu")(x)
x = Dropout(0.5)(x)
predictions = Dense(1, activation="sigmoid", name="predictions")(x)

model = Model(inputs=inputs, outputs=predictions)
model.summary()

In [39]:
from keras.callbacks import EarlyStopping,ModelCheckpoint

checkpoint_cb =ModelCheckpoint("model.keras", save_best_only=True)
early_stopping_cb =EarlyStopping(patience=3, restore_best_weights=True)
model.compile(optimizer ='adam', loss='binary_crossentropy', metrics=['accuracy'])
hist = model.fit(X_train,y_train, epochs=10, validation_split=.1, callbacks=[checkpoint_cb, early_stopping_cb])

Epoch 1/10
[1m  92/5270[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:48:37[0m 1s/step - accuracy: 0.6628 - loss: 0.6349

KeyboardInterrupt: 

In [40]:
hist_=pd.DataFrame(hist.history)
hist_

NameError: name 'hist' is not defined

In [41]:
score, acc= model.evaluate(X_test,y_test)
print('Test Loss =', score)
print('Test Accuracy =', acc)

[1m  26/1464[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m12:39[0m 528ms/step - accuracy: 0.8993 - loss: 0.2701

KeyboardInterrupt: 

In [42]:
predictions = model.predict(X_test)
y_pred = np.where(predictions>=.5,1,0)
y_test_1d = np.ravel(y_test)
y_pred_1d = np.ravel(y_pred)
df = pd.DataFrame({'Actual': y_test_1d, 'Prediction': y_pred_1d})
df

[1m  11/1464[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m13:32[0m 559ms/step

KeyboardInterrupt: 

In [46]:
from sklearn.metrics import confusion_matrix
CM = confusion_matrix(y_test_1d,y_pred_1d)
CM_percent = CM.astype('float') / CM.sum(axis=1)[:, np.newaxis]
sns.heatmap(CM_percent,fmt='g',center = True,cbar=False,annot=True,cmap='Blues')
CM

NameError: name 'y_test_1d' is not defined

In [45]:
ClassificationReport = classification_report(y_test_1d,y_pred_1d)
print('Classification Report is : ', ClassificationReport )

NameError: name 'classification_report' is not defined