In [34]:
import pandas as pd
import json
from glob import glob
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import xgboost as xgb


def jsonl_to_dataframe(file_path):
    return pd.read_json(file_path, lines=True)

def merge_jsonl_to_dataframe(file_pattern):
    files = glob(file_pattern)    
    
    dfs = [jsonl_to_dataframe(file) for file in files]
    merged_df = pd.concat(dfs, ignore_index=True)
    
    return merged_df

file_pattern = "data/*.jsonl"

df = merge_jsonl_to_dataframe(file_pattern)

human_df = pd.DataFrame({'answers': df['human_answers'], 'is_human': 1})
gpt_df = pd.DataFrame({'answers': df['chatgpt_answers'], 'is_human': 0})
answers_df = pd.concat([human_df, gpt_df], axis=0, ignore_index=True)

answers_df['len'] = answers_df['answers'].apply(len)
answers_df = answers_df.explode('answers', ignore_index=True)
answers_df = answers_df.dropna(subset=['answers'])

In [35]:
import nltk
from nltk.stem import WordNetLemmatizer
import re

nltk.download('stopwords')
nltk.download('wordnet')
stop_words = stopwords.words('english')
wnl = WordNetLemmatizer()

def preprocess(text_column):
    new_review = []
    for review in text_column:
        text = re.sub("@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+", ' ', str(review).lower()).strip()
        text = [wnl.lemmatize(i) for i in text.split(' ') if i not in stop_words]
        new_review.append(' '.join(text))
    return new_review

answers_df["answers"] = preprocess(answers_df["answers"])

[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [36]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

X = answers_df["answers"]
y = answers_df["is_human"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

cv = TfidfVectorizer()
cv.fit(answers_df["answers"])
X_train = cv.transform(X_train)
X_test = cv.transform(X_test)

In [44]:
import multiprocessing
from sklearn.model_selection import GridSearchCV


model = xgb.XGBClassifier(n_jobs=1, max_depth=6, n_estimators=500)

"""
n_estimators_list = [1, 5, 10, 20] + [20 * i for i in range(2, 11)] + [50 * i for i in range(5, 11)]
param_grid = {
    "max_depth": [6],
    "n_estimators": [500],
}

clf = GridSearchCV(
        model,
        param_grid,
        verbose=1,
        n_jobs=2,
    )
"""

'\nn_estimators_list = [1, 5, 10, 20] + [20 * i for i in range(2, 11)] + [50 * i for i in range(5, 11)]\nparam_grid = {\n    "max_depth": [6],\n    "n_estimators": [500],\n}\n\nclf = GridSearchCV(\n        model,\n        param_grid,\n        verbose=1,\n        n_jobs=2,\n    )\n'

In [45]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


0.9903452311293154

In [40]:
import pickle

filename = "XGB2.pickle"
pickle.dump(clf, open(filename, "wb"))

In [46]:
print(clf.best_params_)
clf.predict("Hello it is the IA")

{'max_depth': 6, 'n_estimators': 500}


XGBoostError: [18:00:41] /home/conda/feedstock_root/build_artifacts/xgboost-split_1712072639327/work/src/data/file_iterator.cc:24: Check failed: name_args.size() == 2 (1 vs. 2) : URI parameter `format` is required for loading text data: filename?format=csv
Stack trace:
  [bt] (0) /opt/mamba/lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x6e) [0x7fb98b8582ee]
  [bt] (1) /opt/mamba/lib/libxgboost.so(xgboost::data::ValidateFileFormat(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x971) [0x7fb98ba7db11]
  [bt] (2) /opt/mamba/lib/libxgboost.so(xgboost::DMatrix::Load(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool, xgboost::DataSplitMode)+0x821) [0x7fb98ba32ba1]
  [bt] (3) /opt/mamba/lib/libxgboost.so(XGDMatrixCreateFromURI+0xf1) [0x7fb98b8408a1]
  [bt] (4) /opt/mamba/lib/python3.11/lib-dynload/../../libffi.so.8(+0x6a4a) [0x7fbc281cfa4a]
  [bt] (5) /opt/mamba/lib/python3.11/lib-dynload/../../libffi.so.8(+0x5fea) [0x7fbc281cefea]
  [bt] (6) /opt/mamba/lib/python3.11/lib-dynload/_ctypes.cpython-311-x86_64-linux-gnu.so(+0x12529) [0x7fbc281e7529]
  [bt] (7) /opt/mamba/lib/python3.11/lib-dynload/_ctypes.cpython-311-x86_64-linux-gnu.so(+0x8862) [0x7fbc281dd862]
  [bt] (8) /opt/mamba/bin/python(_PyObject_MakeTpCall+0x253) [0x56209aababd3]



In [47]:
test_answers = [

    ('Who is the strongest between Itachi and Jiraya?', 
    'Itachi is stronger by far an I can prove it ', 
    """It's hard to determine conclusively who is stronger between Itachi and Jiraiya.
     Both have unique strengths and weaknesses. Itachi excels in Sharingan and genjutsu mastery,
      while Jiraiya is a proficient user of ninjutsu and senjutsu. Their relative power depends on various factors, 
      including their respective skills, combat strategies, and physical/mental condition during battle."""
    )
]

def human_or_gpt(n):
    label = "HUMAN" if n == 1 else 'GPT'
    return Fore.BLUE + label + Style.RESET_ALL


def testsuite(model, tests):
    for test in tests:
        (qst, human, gpt) = test
        print(f"Question : {qst}")
        print(f"Human Answer: [{human[:25]}] the model thinks it was written by a {human_or_gpt(model.predict([human]))} ")
        print(f"GPT Answer: [{gpt[:25]}] the model thinks it was written by a {human_or_gpt(model.predict([gpt]))} ")

testsuite(clf, test_answers)

Question : Who is the strongest between Itachi and Jiraya?


XGBoostError: [18:00:47] /home/conda/feedstock_root/build_artifacts/xgboost-split_1712072639327/work/src/c_api/../data/array_interface.h:492: Unicode-4 is not supported.
Stack trace:
  [bt] (0) /opt/mamba/lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x6e) [0x7fb98b8582ee]
  [bt] (1) /opt/mamba/lib/libxgboost.so(xgboost::ArrayInterface<2, false>::Initialize(std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, xgboost::Json, std::less<void>, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, xgboost::Json> > > const&)+0x1f6) [0x7fb98b86de26]
  [bt] (2) /opt/mamba/lib/libxgboost.so(xgboost::data::DMatrixProxy::SetArrayData(xgboost::StringView)+0x117) [0x7fb98bacd267]
  [bt] (3) /opt/mamba/lib/libxgboost.so(XGBoosterPredictFromDense+0x2bb) [0x7fb98b8446db]
  [bt] (4) /opt/mamba/lib/python3.11/lib-dynload/../../libffi.so.8(+0x6a4a) [0x7fbc281cfa4a]
  [bt] (5) /opt/mamba/lib/python3.11/lib-dynload/../../libffi.so.8(+0x5fea) [0x7fbc281cefea]
  [bt] (6) /opt/mamba/lib/python3.11/lib-dynload/_ctypes.cpython-311-x86_64-linux-gnu.so(+0x12529) [0x7fbc281e7529]
  [bt] (7) /opt/mamba/lib/python3.11/lib-dynload/_ctypes.cpython-311-x86_64-linux-gnu.so(+0x8862) [0x7fbc281dd862]
  [bt] (8) /opt/mamba/bin/python(_PyObject_MakeTpCall+0x253) [0x56209aababd3]

