In [42]:
import pandas as pd
import json
from glob import glob
from sklearn.model_selection import train_test_split

from colorama import init, Fore, Style

In [43]:
def jsonl_to_dataframe(file_path):
    """Load a single JSON Lines source into a pandas DataFrame.

    Args:
        file_path: Path or file-like object handed straight to
            ``pd.read_json``; each line is parsed as one JSON record.

    Returns:
        The DataFrame built by ``pd.read_json(..., lines=True)``.
    """
    frame = pd.read_json(file_path, lines=True)
    return frame

def merge_jsonl_to_dataframe(file_pattern):
    """Merge every JSONL file matching a glob pattern into one DataFrame.

    Args:
        file_pattern: Glob pattern (e.g. ``"data/*.jsonl"``) selecting the
            JSONL files to load.

    Returns:
        A single DataFrame holding the rows of all matched files, with a
        fresh 0..n-1 index.

    Raises:
        ValueError: If no file matches ``file_pattern``. The exception type
            is the one ``pd.concat`` raises for an empty list, but the
            message names the offending pattern.
    """
    # glob() yields paths in arbitrary, platform-dependent order; sorting keeps
    # the merged row order (and any seeded split built on it) reproducible.
    files = sorted(glob(file_pattern))
    if not files:
        raise ValueError(f"No JSONL files match pattern: {file_pattern!r}")

    # Read each JSONL file into its own DataFrame, then stack them.
    dfs = [jsonl_to_dataframe(file) for file in files]
    merged_df = pd.concat(dfs, ignore_index=True)

    return merged_df

# Glob pattern selecting the JSONL files to merge
file_pattern = "data/*.jsonl"

# Merge all matching JSONL files into a single DataFrame
df = merge_jsonl_to_dataframe(file_pattern)


# Set up the DataFrame for the predictive models

In [55]:
# Build the labelled answers table from the merged JSONL data:
# human answers are flagged is_human=1, ChatGPT answers is_human=0.
human_df = (
    df[['question', 'human_answers']]
    .rename(columns={'question': 'questions', 'human_answers': 'answers'})
    .assign(is_human=1)
)
gpt_df = (
    df[['question', 'chatgpt_answers']]
    .rename(columns={'question': 'questions', 'chatgpt_answers': 'answers'})
    .assign(is_human=0)
)
answers_df = pd.concat([human_df, gpt_df], axis=0, ignore_index=True)

# Extra corpus from CSV: align its column names with answers_df and drop the
# `src` column. It has no `questions` column, so those cells end up as NaN.
# NOTE(review): `label` is mapped directly onto `is_human` — confirm that the
# CSV uses 1 for human-written text.
csv = pd.read_csv("data/hack_train.csv")
csvFull = csv.rename(columns={'text': 'answers', "label" : "is_human"})
csv = csvFull.drop("src", axis=1)

# Stack both sources, expand list-valued `answers` cells into one row per
# answer, then discard rows left without an answer.
answers_df = (
    pd.concat([answers_df, csv])
    .explode('answers', ignore_index=True)
    .dropna(subset=['answers'], ignore_index=True)
)

answers_df

Unnamed: 0,questions,answers,is_human
0,"Why is every book I hear about a "" NY Times # ...","Basically there are many categories of "" Best ...",1
1,"Why is every book I hear about a "" NY Times # ...","If you 're hearing about it , it 's because it...",1
2,"Why is every book I hear about a "" NY Times # ...","One reason is lots of catagories . However , h...",1
3,"If salt is so bad for cars , why do we use it ...",salt is good for not dying in car crashes and ...,1
4,"If salt is so bad for cars , why do we use it ...","In Minnesota and North Dakota , they tend to u...",1
...,...,...,...
227712,,We consider the recovery of a source term f (x...,1
227713,,"Self-supervised learning (SlfSL), aiming at le...",1
227714,,Recurrent neural networks (RNNs) have achieved...,1
227715,,Deep reinforcement learning (DRL) is a booming...,1


In [45]:
# Add a per-answer spelling-mistake count as a new feature column.
# NOTE: count_spelling_errors is defined in a later cell of this notebook; that
# cell has to be executed first, otherwise this line raises NameError.
answers_df['nb_spelling_mistakes'] = answers_df['answers'].apply(count_spelling_errors)

NameError: name 'count_spelling_errors' is not defined

In [57]:
# Features are the raw answer texts; the target is the human/GPT flag.
X, y = answers_df['answers'], answers_df['is_human']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

227717


# Small test suite

In [46]:
# Hand-written sanity cases. Each entry is a 3-tuple shaped for the
# (qst, human, gpt) unpacking done in `testsuite`:
# question, human-written answer, ChatGPT-style answer.
test_answers = [

    ('Who is the strongest between Itachi and Jiraya?', 
    'Itachi is stronger by far an I can prove it ', 
    """It's hard to determine conclusively who is stronger between Itachi and Jiraiya.
     Both have unique strengths and weaknesses. Itachi excels in Sharingan and genjutsu mastery,
      while Jiraiya is a proficient user of ninjutsu and senjutsu. Their relative power depends on various factors, 
      including their respective skills, combat strategies, and physical/mental condition during battle."""
    )
]

def human_or_gpt(n):
    """Turn a predicted class into a blue-coloured terminal label.

    Args:
        n: Predicted class; a value equal to 1 means "human", anything else
            is reported as GPT.

    Returns:
        ``"HUMAN"`` or ``"GPT"`` wrapped in the blue foreground code and the
        style-reset code.
    """
    if n == 1:
        verdict = "HUMAN"
    else:
        verdict = 'GPT'
    return "".join((Fore.BLUE, verdict, Style.RESET_ALL))


def testsuite(model, tests):
    """Print a model's verdict on each hand-written test case.

    Args:
        model: Object exposing ``predict``; it is called with a one-element
            list holding the answer text.
        tests: Iterable of ``(question, human_answer, gpt_answer)`` tuples.

    For every case the question is printed, followed by one line per answer
    showing its first 25 characters and the predicted author.
    """
    for qst, human, gpt in tests:
        print(f"Question : {qst}")
        # Same report line for both answers; only the prefix and text differ.
        for origin, answer in (("Human", human), ("GPT", gpt)):
            print(f"{origin} Answer: [{answer[:25]}] the model thinks it was written by a {human_or_gpt(model.predict([answer]))} ")


In [47]:
from spellchecker import SpellChecker

# Shared SpellChecker instance, created on first use by _get_spell_checker().
_SPELL_CHECKER = None


def _get_spell_checker():
    """Return the shared English SpellChecker, building it on first use."""
    global _SPELL_CHECKER
    if _SPELL_CHECKER is None:
        # Building a SpellChecker loads its word list; doing that once instead
        # of once per call matters when this runs via .apply over many rows.
        _SPELL_CHECKER = SpellChecker(language='en')
    return _SPELL_CHECKER


def count_spelling_errors(text):
    """Count the words in ``text`` that the English spell checker does not know.

    The text is split on whitespace and leading/trailing punctuation is
    trimmed from each token, so ``"hello,"`` is checked as ``"hello"`` rather
    than being reported as a misspelling. Every occurrence of an unknown word
    is counted.

    NOTE(review): the API used here (``SpellChecker(language=...)``,
    ``unknown``) matches the ``pyspellchecker`` distribution; an import error
    mentioning ``indexer`` suggests a different ``spellchecker`` package is
    installed — confirm the environment.

    Args:
        text: The text to analyse.

    Returns:
        The number of tokens flagged as unknown; 0 for non-string input and
        for text without any word characters.
    """
    import string  # local import: not provided by the notebook's import cell

    if not isinstance(text, str):
        return 0

    # Split the text into words and drop surrounding punctuation.
    words = [token.strip(string.punctuation) for token in text.split()]
    words = [word for word in words if word]
    if not words:
        return 0

    spell = _get_spell_checker()
    # unknown() is a dictionary lookup; correction() would search candidate
    # corrections for every unknown word only to be compared with the input.
    return sum(1 for word in words if spell.unknown([word]))

ModuleNotFoundError: No module named 'indexer'

In [None]:
# Smoke test: a single correctly spelled word is expected to give 0 errors.
text = "This"
errors = count_spelling_errors(text)
print("Nombre de fautes d'orthographe en anglais :", errors)


Nombre de fautes d'orthographe en anglais : 0
