In [23]:
import pandas as pd
import json
from glob import glob
from sklearn.model_selection import train_test_split

from colorama import init, Fore, Style

In [24]:
def jsonl_to_dataframe(file_path):
    """Load a single JSON Lines file into a pandas DataFrame.

    Args:
        file_path: Path of the JSONL file; each line holds one JSON record.

    Returns:
        A DataFrame with one row per line of the file.
    """
    frame = pd.read_json(file_path, lines=True)
    return frame

def merge_jsonl_to_dataframe(file_pattern):
    """Merge every JSONL file matching ``file_pattern`` into one pandas DataFrame.

    Matching files are read in sorted path order so that the row order of the
    result does not depend on the order in which the filesystem lists them.

    Args:
        file_pattern: Glob pattern (e.g. ``"data/*.jsonl"``) selecting the files.

    Returns:
        A DataFrame holding the rows of all matched files, concatenated with a
        fresh 0..n-1 index.

    Raises:
        ValueError: If no file matches ``file_pattern``.
    """
    # glob() returns matches in arbitrary order; sort them for reproducible output.
    files = sorted(glob(file_pattern))
    if not files:
        # pd.concat([]) would otherwise fail with a generic
        # "No objects to concatenate" message that hides the real cause.
        raise ValueError(f"No JSONL files match pattern: '{file_pattern}'")

    # Read each file into its own DataFrame, then stack them into one.
    dfs = [jsonl_to_dataframe(file) for file in files]
    return pd.concat(dfs, ignore_index=True)

# Glob pattern selecting the JSONL files to merge
file_pattern = "data/*.jsonl"

# Merge all matching JSONL files into a single DataFrame
df = merge_jsonl_to_dataframe(file_pattern)

# Display the merged DataFrame
# NOTE(review): no display statement follows this comment — add e.g. `df.head()` if a preview is wanted.


# Set up the DataFrame for the predictive models

In [25]:
# Build one frame per answer source: same questions, different answer column,
# labelled 1 for human-written answers and 0 for ChatGPT answers.
human_df = pd.DataFrame({
    'questions': df['question'],
    'answers': df['human_answers'],
    'is_human': 1,
})
gpt_df = pd.DataFrame({
    'questions': df['question'],
    'answers': df['chatgpt_answers'],
    'is_human': 0,
})

# Stack both sources, give every element of an 'answers' list its own row,
# then drop the rows that end up without an answer.
answers_df = (
    pd.concat([human_df, gpt_df], axis=0, ignore_index=True)
    .explode('answers', ignore_index=True)
    .dropna(subset=['answers'], ignore_index=True)
)

answers_df

Unnamed: 0,questions,answers,is_human
0,Does Primolut N taken during pregnancy affect ...,"Hi, Thanks for the query. I understand you...",1
1,Bloating and pain on right lower abdomen. Shou...,"Hello,Thanks for the query to H.C.M. Forum.Pai...",1
2,Is chest pain related to intake of clindamycin...,"Hello, The use of Clindamycin can cause stomac...",1
3,Q. Noticed a yellowish sag in the gums of my 1...,Hello. Revert back with the photos to a dentis...,1
4,"Suggest remedy for low grade fever, hot and co...","Hi Dear,Welcome to Healthcaremagic Team.Unders...",1
...,...,...,...
170893,what are layers of the ionosphere,The ionosphere is a layer of the Earth's upper...,0
170894,what are layers of the ionosphere,The ionosphere is a layer of Earth's atmospher...,0
170895,what state is new orleans in,New Orleans is a city located in the state of ...,0
170896,what state is new orleans in,New Orleans is a city located in the state of ...,0


In [28]:
# Add a per-answer spelling-mistake count as a new column of answers_df.
# NOTE(review): within the visible notebook, count_spelling_errors is only defined
# in a cell further down, so a fresh top-to-bottom run would raise NameError here
# unless that cell has been executed first — consider moving the definition above.
answers_df['nb_spelling_mistakes'] = answers_df['answers'].apply(count_spelling_errors)

In [26]:
# Features are the answer texts; the target is the human (1) / ChatGPT (0) flag.
X, y = answers_df['answers'], answers_df['is_human']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Small test suite

In [7]:
# Hand-written smoke-test cases for testsuite().
# Each entry is a (question, human answer, GPT answer) tuple, unpacked in that
# order by testsuite(); the third element is presumably ChatGPT-generated text.
test_answers = [

    ('Who is the strongest between Itachi and Jiraya?', 
    'Itachi is stronger by far an I can prove it ', 
    """It's hard to determine conclusively who is stronger between Itachi and Jiraiya.
     Both have unique strengths and weaknesses. Itachi excels in Sharingan and genjutsu mastery,
      while Jiraiya is a proficient user of ninjutsu and senjutsu. Their relative power depends on various factors, 
      including their respective skills, combat strategies, and physical/mental condition during battle."""
    )
]

def human_or_gpt(n):
    """Return the class label for prediction ``n`` as blue-coloured text.

    Args:
        n: Predicted class; a value equal to 1 means "HUMAN", anything else "GPT".

    Returns:
        The label wrapped in the colorama blue foreground code and a style reset.
    """
    if n == 1:
        label = "HUMAN"
    else:
        label = 'GPT'
    return f"{Fore.BLUE}{label}{Style.RESET_ALL}"


def testsuite(model, tests):
    """Print the model's human/GPT verdict for each hand-written test case.

    Args:
        model: Fitted classifier whose ``predict`` accepts a list of raw answer
            strings and returns one label per string (scikit-learn convention).
        tests: Iterable of ``(question, human_answer, gpt_answer)`` tuples.

    Returns:
        None. The verdicts are printed, one line per answer, showing the first
        25 characters of the answer next to the predicted label.
    """
    for qst, human, gpt in tests:
        print(f"Question : {qst}")
        for source, answer in (("Human", human), ("GPT", gpt)):
            # predict() receives a one-element batch; take that single label
            # instead of relying on the truthiness of the whole result when it
            # is compared with 1 inside human_or_gpt (a plain list would
            # otherwise always be reported as GPT).
            predicted = model.predict([answer])[0]
            print(f"{source} Answer: [{answer[:25]}] the model thinks it was written by a {human_or_gpt(predicted)} ")


In [19]:
from spellchecker import SpellChecker

import string

# Shared SpellChecker instance, created lazily by _get_spell_checker().
_SPELL_CHECKER = None


def _get_spell_checker():
    """Return the shared English SpellChecker, building it on first use."""
    global _SPELL_CHECKER
    if _SPELL_CHECKER is None:
        _SPELL_CHECKER = SpellChecker(language='en')
    return _SPELL_CHECKER


def count_spelling_errors(text, spell=None):
    """Count the words of ``text`` that are not in the spell checker's dictionary.

    The text is split on whitespace and surrounding ASCII punctuation is
    stripped from every token, so that e.g. ``"Hello,"`` is checked as
    ``"Hello"`` rather than being reported as a misspelling. Tokens made only
    of punctuation are ignored. Every occurrence of an unknown word counts.

    A word is considered misspelled when ``spell.unknown([word])`` is
    non-empty; this is a dictionary lookup and avoids computing a suggested
    correction for every word.

    Args:
        text: The string to analyse.
        spell: Optional checker object exposing ``unknown(words)``. Defaults to
            a shared English ``SpellChecker`` that is created once and reused
            across calls instead of being rebuilt for every text.

    Returns:
        The number of misspelled word occurrences (0 for empty text).
    """
    if spell is None:
        spell = _get_spell_checker()

    # Split the text into words and drop leading/trailing punctuation.
    tokens = [
        stripped
        for stripped in (word.strip(string.punctuation) for word in text.split())
        if stripped
    ]

    # Look each distinct token up once, then count all of its occurrences.
    misspelled = {token for token in set(tokens) if spell.unknown([token])}
    return sum(token in misspelled for token in tokens)

In [21]:
# Quick sanity check: run the counter on a single correctly spelled word
# and print the resulting number of spelling errors.
text = "This"
errors = count_spelling_errors(text)
print("Nombre de fautes d'orthographe en anglais :", errors)


Nombre de fautes d'orthographe en anglais : 0
