In [4]:
import pandas as pd
import json
from glob import glob
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import xgboost as xgb


def jsonl_to_dataframe(file_path):
    return pd.read_json(file_path, lines=True)

def merge_jsonl_to_dataframe(file_pattern):
    files = glob(file_pattern)    
    
    dfs = [jsonl_to_dataframe(file) for file in files]
    merged_df = pd.concat(dfs, ignore_index=True)
    
    return merged_df

file_pattern = "data/*.jsonl"

df = merge_jsonl_to_dataframe(file_pattern)

human_df = pd.DataFrame({'answers': df['human_answers'], 'is_human': 1})
gpt_df = pd.DataFrame({'answers': df['chatgpt_answers'], 'is_human': 0})
answers_df = pd.concat([human_df, gpt_df], axis=0, ignore_index=True)

answers_df['len'] = answers_df['answers'].apply(len)
answers_df = answers_df.explode('answers', ignore_index=True)
answers_df = answers_df.dropna(subset=['answers'])

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

nltk.download('wordnet')
stop_words = stopwords.words('english')
wnl = WordNetLemmatizer()

def preprocess(text_column):
    new_review = []
    for review in text_column:
        text = re.sub("@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+", ' ', str(review).lower()).strip()
        text = [wnl.lemmatize(i) for i in text.split(' ') if i not in stop_words]
        new_review.append(' '.join(text))
    return new_review

answers_df["answers"] = preprocess(answers_df["answers"])

[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


ok


In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

X = answers_df["answers"]
y = answers_df["is_human"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

cv = TfidfVectorizer()
cv.fit(answers_df["answers"])
X_train = cv.transform(X_train)
X_test = cv.transform(X_test)

In [25]:
import multiprocessing
from sklearn.model_selection import GridSearchCV


model = xgb.XGBClassifier(n_jobs=1, max_depth=6, n_estimators=500)

"""
n_estimators_list = [1, 5, 10, 20] + [20 * i for i in range(2, 11)] + [50 * i for i in range(5, 11)]
param_grid = {
    "max_depth": [6],
    "n_estimators": [500],
}

clf = GridSearchCV(
        model,
        param_grid,
        verbose=1,
        n_jobs=2,
    )
"""

ModuleNotFoundError: No module named 'concrete'

In [13]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.9903452311293154

In [None]:
import pickle

filename = "XGB2.pickle"
pickle.dump(model, open(filename, "wb"))

In [17]:
#print(model.best_params_)
test = cv.transform(["Hello it is the IA"])
model.predict(test)

array([1])

In [23]:
test_answers = [

    ('Who is the strongest between Itachi and Jiraya?', 
    'Itachi is stronger by far an I can prove it ', 
    """It's hard to determine conclusively who is stronger between Itachi and Jiraiya.
     Both have unique strengths and weaknesses. Itachi excels in Sharingan and genjutsu mastery,
      while Jiraiya is a proficient user of ninjutsu and senjutsu. Their relative power depends on various factors, 
      including their respective skills, combat strategies, and physical/mental condition during battle."""
    )
]

def human_or_gpt(n):
    label = "HUMAN" if n == 1 else 'GPT'
    return label


def testsuite(model, tests):
    for test in tests:
        (qst, human, gpt) = test
        print(f"Question : {qst}")
        print(f"Human Answer: [{human[:25]}] the model thinks it was written by a {human_or_gpt(model.predict(cv.transform([human])))} ")
        print(f"GPT Answer: [{gpt[:25]}] the model thinks it was written by a {human_or_gpt(model.predict(cv.transform([gpt])))} ")

testsuite(model, test_answers)

Question : Who is the strongest between Itachi and Jiraya?
Human Answer: [Itachi is stronger by far] the model thinks it was written by a HUMAN 
GPT Answer: [It's hard to determine co] the model thinks it was written by a HUMAN 
