In [17]:
import pandas as pd
import json
from glob import glob
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import xgboost as xgb


def jsonl_to_dataframe(file_path):
    """Load a single JSON Lines file into a DataFrame.

    Parameters
    ----------
    file_path
        Location of the file, forwarded unchanged to ``pandas.read_json``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pandas.read_json`` produces with ``lines=True``, i.e. the
        file parsed as one JSON document per line.
    """
    frame = pd.read_json(file_path, lines=True)
    return frame

def merge_jsonl_to_dataframe(file_pattern):
    """Read every JSON Lines file matching a glob pattern into one DataFrame.

    Parameters
    ----------
    file_pattern : str
        Glob pattern (e.g. ``"data/*.jsonl"``) selecting the files to load.

    Returns
    -------
    pandas.DataFrame
        The row-wise concatenation of all matched files, with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If no file matches ``file_pattern``.
    """
    # glob() yields matches in arbitrary, filesystem-dependent order; sorting
    # keeps the row order (and any seeded split derived from it) reproducible.
    files = sorted(glob(file_pattern))
    if not files:
        # pd.concat([]) would also raise ValueError, but with a message that
        # says nothing about the pattern that failed to match.
        raise ValueError(f"No files match pattern {file_pattern!r}")

    dfs = [jsonl_to_dataframe(file) for file in files]
    return pd.concat(dfs, ignore_index=True)

# Load every question/answer record found under data/.
file_pattern = "data/*.jsonl"
df = merge_jsonl_to_dataframe(file_pattern)

# One frame per answer source, sharing the same two columns; `is_human` is
# the binary label (1 = human-written, 0 = ChatGPT).
human_df = (
    df[['human_answers']]
    .rename(columns={'human_answers': 'answers'})
    .assign(is_human=1)
)
gpt_df = (
    df[['chatgpt_answers']]
    .rename(columns={'chatgpt_answers': 'answers'})
    .assign(is_human=0)
)

# Stack both sources, record how many items each `answers` cell holds, then
# flatten to one row per individual answer and drop rows left without one.
# NOTE(review): `len` is taken before explode, so it counts the items in each
# cell rather than characters of a single answer -- confirm that is intended.
answers_df = (
    pd.concat([human_df, gpt_df], axis=0, ignore_index=True)
    .assign(len=lambda frame: frame['answers'].apply(len))
    .explode('answers', ignore_index=True)
    .dropna(subset=['answers'])
)

In [18]:
import nltk
# `stopwords` was referenced below without ever being imported, which raises
# NameError on a fresh kernel (Restart & Run All).
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Make sure the corpora used below are available locally.
nltk.download('stopwords')
nltk.download('wordnet')

# English stop-word list and the lemmatizer shared by `preprocess`.
stop_words = stopwords.words('english')
wnl = WordNetLemmatizer()

def preprocess(text_column, stop_word_list=None, lemmatizer=None):
    """Normalise an iterable of texts into space-joined, lemmatized tokens.

    Each item is converted with ``str()``, lower-cased, stripped of the
    patterns matched by the cleaning regex (replaced by a space), split on
    whitespace, filtered against the stop words and passed token by token
    through ``lemmatizer.lemmatize``.

    Parameters
    ----------
    text_column : iterable
        Texts to clean (e.g. a pandas Series). Items are passed through
        ``str()``, so non-string values are cleaned as their string form.
    stop_word_list : iterable of str, optional
        Tokens to discard. Defaults to the module-level ``stop_words``.
    lemmatizer : object with a ``lemmatize(str) -> str`` method, optional
        Defaults to the module-level ``wnl``.

    Returns
    -------
    list of str
        One cleaned string per input item, in input order.
    """
    if stop_word_list is None:
        stop_word_list = stop_words
    if lemmatizer is None:
        lemmatizer = wnl

    # Set membership is O(1); the stop-word list would be scanned per token.
    excluded = set(stop_word_list)

    # Raw string: "\S" in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    # NOTE(review): the `http?:\S` alternative only adds matches for the
    # literal "htt:" (everything else is already covered by `https?:\S+`);
    # it looks like a typo but is kept so tokenisation is unchanged.
    noise = re.compile(r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+")

    new_review = []
    for review in text_column:
        text = noise.sub(' ', str(review).lower())
        # split() (rather than split(' ')) discards the empty tokens produced
        # by leading, trailing or consecutive replacement spaces.
        tokens = [lemmatizer.lemmatize(tok) for tok in text.split() if tok not in excluded]
        new_review.append(' '.join(tokens))
    return new_review

# Replace the raw answers with their cleaned, lemmatized form.
answers_df = answers_df.assign(answers=preprocess(answers_df["answers"]))

[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# Features are the cleaned answer texts; the target is the human/ChatGPT label.
X = answers_df["answers"]
y = answers_df["is_human"]

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Binary bag-of-words (presence/absence of each token).
# The vocabulary is learned from the training split only: fitting on the full
# data set, as was done before, lets information from the test rows leak into
# the feature space used for training.
cv = CountVectorizer(binary=True)
X_train = cv.fit_transform(X_train)
X_test = cv.transform(X_test)

In [27]:
import multiprocessing
from sklearn.model_selection import GridSearchCV

# Base estimator; single-threaded so parallelism is controlled by the search.
model = xgb.XGBClassifier(n_jobs=1)

# Candidate tree counts: 1, 5, 10, 20, then 40..200 in steps of 20,
# then 250..500 in steps of 50.
n_estimators_list = [1, 5, 10, 20]
n_estimators_list += list(range(40, 201, 20))
n_estimators_list += list(range(250, 501, 50))

param_grid = dict(
    max_depth=[2, 4, 6],
    n_estimators=n_estimators_list,
)

# Exhaustive search over the grid, running two candidate fits in parallel.
clf = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=2, verbose=1)

In [28]:
# Run the grid search on the training split, then predict the held-out rows
# with the fitted search object.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Held-out accuracy, displayed as the cell's output.
accuracy_score(y_true=y_test, y_pred=y_pred)

Fitting 5 folds for each of 57 candidates, totalling 285 fits




0.9866588648332358

In [29]:
import pickle

filename = "XGB.pickle"

# Persist the fitted grid-search object. The context manager guarantees the
# file handle is flushed and closed; the bare open() call used before was
# never closed.
# NOTE(review): no cell visible here saves the fitted vectorizer `cv`, which
# is needed to turn new text into features for this model -- consider
# persisting it alongside the classifier.
with open(filename, "wb") as model_file:
    pickle.dump(clf, model_file)