# Using BERT!

In [1]:
import numpy as np
import pandas as pd
import os
import random
os.environ["TF_USE_LEGACY_KERAS"] = "True"


def set_seed(seed: int) -> None:
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module and NumPy's global generator, exports
    ``PYTHONHASHSEED``, and, when TensorFlow is importable, seeds
    TensorFlow's global generator as well so that model training is covered.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)  # Python's built-in generator
    np.random.seed(seed)  # NumPy global generator, the one scikit-learn uses
    # Only child processes pick this up: the running interpreter already
    # fixed its own hash seed when it started.
    os.environ["PYTHONHASHSEED"] = str(seed)

    # The model is trained with TensorFlow/Keras, so its generator must be
    # seeded too. The import is local and guarded so that the helper still
    # works in environments without TensorFlow.
    try:
        import tensorflow as tf
    except ImportError:
        return
    tf.random.set_seed(seed)


set_seed(25)

In [4]:
import numpy as np
import pandas as pd
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Dropout, Input, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
import ktrain
from ktrain import text


# Load the data
csv_path = '../../datasets/human_or_ai_dataset_small_research_only.csv'  # Point this at your copy of the dataset
df = pd.read_csv(csv_path)

# Sanity check: confirm the size and the column names of what was loaded
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns}")

# Hold out 20% of the rows (80% train, 20% test); random_state is fixed so
# the same split is produced on every run
df_train, df_test = train_test_split(df, random_state=25, test_size=0.2)

Dataset shape: (5051, 2)
Columns: Index(['text', 'source'], dtype='object')


In [5]:
# text.texts_from_df returns three values: the training tuple, the validation
# tuple and the preprocessor.
# maxlen: how much of each text is kept; anything beyond it is truncated.
# preprocess_mode: how the text corpus is tokenized and transformed; 'bert'
# selects the preprocessing for the BERT model.
# NOTE(review): df_test is passed as val_df, so the held-out split doubles as
# the validation set.
(X_train, y_train), (X_test, y_test), preproc = text.texts_from_df(
    train_df=df_train,
    val_df=df_test,
    text_column='text',
    label_columns='source',
    preprocess_mode='bert',
    maxlen=500,
)

['ai', 'human']
       ai  human
2238  1.0    0.0
1971  0.0    1.0
1882  1.0    0.0
429   1.0    0.0
2693  0.0    1.0
['ai', 'human']
       ai  human
4832  1.0    0.0
483   1.0    0.0
2011  0.0    1.0
2823  0.0    1.0
2394  1.0    0.0
downloading pretrained BERT model (uncased_L-12_H-768_A-12.zip)...
[██████████████████████████████████████████████████]
extracting pretrained BERT model...
done.

cleanup downloaded zip...
done.

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


In [10]:
# Build the BERT classifier from the preprocessed training data
train_pair = (X_train, y_train)
model = text.text_classifier(name='bert', train_data=train_pair, preproc=preproc)

# A batch size of 6 is used because the documentation recommends it for maxlen = 500
learner = ktrain.get_learner(
    model=model,
    train_data=train_pair,
    val_data=(X_test, y_test),
    batch_size=6,
)

Is Multi-Label? False
maxlen is 500
done.


In [11]:
# learner.fit is a very basic training loop, whereas fit_onecycle trains with
# the one-cycle policy callback
learner.fit_onecycle(epochs=10, lr=2e-5)

# Bundle the trained model with its preprocessor and save it under 'bert'
predictor = ktrain.get_predictor(learner.model, preproc)
predictor.save('bert')



begin training using onecycle policy with max lr of 2e-05...
Epoch 1/10
 23/674 [>.............................] - ETA: 1:38:47 - loss: 0.6985 - accuracy: 0.5072

KeyboardInterrupt: 