> `Text preprocessing script`
> ===


Importing the packages the script needs


In [1]:
import json
import os
import string
from typing import Callable

>## Étapes que nous suivrons 
>- Choisir une langue pour laquelle nous souhaitons nettoyer les données
>- Charger les ID des chapitres et des livres de cette langue
>- Parcourir les dossiers des livres, ouvrir le dossier de chaque livre et prétraiter ses fichiers
>-  ...

Defining constants of the script


In [16]:
# Numeric identifier of the language whose scraped texts are preprocessed.
LANG = 63

# Root folder of the raw (not yet processed) data for this language.
DATA_IN_PATH = os.path.join(os.getcwd(), f'data/{LANG}')

# Journal file in which the chapters that have not been scraped are recorded.
NOT_SCRAPED_CHAPTERS = 'not_scraped_chapters.json'

# Root folder of the processed data for this language.
DATA_OUT_PATH = os.path.join(os.getcwd(), f'preprocessed_data/{LANG}')
# exist_ok=True makes the creation idempotent without a separate (racy)
# existence check.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# JSON file where all the words of the language will be stored as a list.
VOCABULARY_FILE_PATH = os.path.join(DATA_OUT_PATH, 'meta/vocabulary.json')

# Preprocessing sessions file; initialised with an empty JSON object when it
# does not exist yet so that it can always be parsed.
PREPROCESSING_SESSIONS_PATH = os.path.join(os.getcwd(), 'preprocessing_sessions.json')
if not os.path.isfile(PREPROCESSING_SESSIONS_PATH):
    with open(PREPROCESSING_SESSIONS_PATH, 'w', encoding='utf-8') as sessions_file:
        sessions_file.write('{}')

# Sessions file read to find out which books have been scraped.
SCRAPING_SESSIONS_PATH = os.path.join(os.getcwd(), 'sessions.json')
# Per-language JSON file describing the chapters of each book.
CHAPTERS_FILE_PATH = os.path.join(os.getcwd(), f'utils/chapitres_id/{LANG}.json')



Defining utility functions

In [21]:

# Folder that will hold the per-book folders for this language
def get_lang_folder_path(root: str = DATA_OUT_PATH) -> str:
    """Return the language folder path, creating the folder when missing.

    Args:
        root: Folder path, joined onto the current working directory
            (an absolute ``root`` is returned unchanged by ``os.path.join``).

    Returns:
        The resulting folder path.
    """
    folder = os.path.join(os.getcwd(), root)
    if os.path.exists(folder):
        return folder
    os.makedirs(folder)
    return folder


# Folder that will hold the txt file of each chapter of the given book
def get_book_folder_path_for_lang(book: str, root: str = DATA_OUT_PATH) -> str:
    """Return the folder of ``book`` under ``root``, creating it when missing.

    Backslashes in the resulting path are replaced by forward slashes.

    Args:
        book: Name of the book, used as the sub-folder name.
        root: Folder under which the book folder lives.

    Returns:
        The normalised path of the book folder.
    """
    joined = os.path.join(os.getcwd(), f"{root}/{book}")
    folder = joined.replace('\\', '/')
    if os.path.exists(folder):
        return folder
    # The folder is announced first, then created.
    print(f'Path to saves folder created : {folder} ')
    os.makedirs(folder)
    return folder


def update_json_file(data,  file_path: str, type: type = list, lang: int = LANG):
    """Merge ``data`` into a JSON file, creating the file when needed.

    The way ``data`` is merged depends on ``type``:

    * ``list``: the file holds a JSON array and ``data`` is appended to it.
    * ``dict[list]``: the file holds a JSON object and ``data`` is appended
      to the list stored under the ``str(lang)`` key (the list is created
      when the key is missing).
    * anything else: the file holds a JSON object and ``data`` replaces the
      value stored under the ``str(lang)`` key.

    A missing or blank file is treated as an empty array/object.

    Args:
        data: JSON-serialisable value to store.
        file_path: Path of the JSON file, joined onto the current working
            directory.
        type: Selects the merge strategy described above.
        lang: Language identifier used as the key for object-shaped files.

    Raises:
        json.JSONDecodeError: If the file holds non-blank invalid JSON.
    """
    path = os.path.join(os.getcwd(), file_path)

    # Read the whole content once and parse the text itself: parsing the
    # file object after a read() would start at end-of-file and always fail.
    try:
        with open(path, 'r', encoding='utf-8') as file:
            raw_content = file.read()
    except FileNotFoundError:
        raw_content = ''

    if raw_content.strip():
        file_data = json.loads(raw_content)
    else:
        file_data = [] if type == list else {}

    if type == list:  # list-shaped file
        file_data.append(data)
    elif type == dict[list]:  # object whose values are lists
        # setdefault avoids a KeyError the first time a language is seen.
        file_data.setdefault(f'{lang}', []).append(data)
    else:  # plain object
        file_data[f'{lang}'] = data

    # Write the final data back to the file.
    with open(path, 'w', encoding='utf-8') as file:
        json.dump(file_data, file, indent=4)


In [20]:
# Scratch check: read the not-scraped-chapters journal and keep its content,
# stripped of surrounding whitespace, in `data`.
with open(NOT_SCRAPED_CHAPTERS, 'r', encoding='utf-8') as file:
    data = file.read().strip()

# Bare expression: presumably shown as the notebook cell output.
data

''

In [4]:
def save_current_session(book: str, chapter: str, completed_books: list[str], path=PREPROCESSING_SESSIONS_PATH, lang: int = LANG):
    """Store the current preprocessing progress of a language.

    The session is written under the ``str(lang)`` key of the JSON object
    held in ``path``, replacing any session stored there before.

    Args:
        book: Last book that was handled.
        chapter: Last chapter that was handled.
        completed_books: Books that are entirely done.
        path: Sessions file to update.
        lang: Language identifier the session belongs to.
    """
    session = {
        "last_book": book,
        "last_chapter": chapter,
        "completed_books": completed_books
    }
    # Forward `lang` explicitly; otherwise update_json_file falls back to its
    # own default and the caller's language is silently ignored.
    update_json_file(session, path, dict, lang=lang)


def load_last_session(lang: int = LANG, path: str = PREPROCESSING_SESSIONS_PATH):
    """Return the session stored for ``lang`` in the sessions file.

    When the file does not exist it is created holding an empty session for
    ``lang`` and that empty session is returned. When the file exists but
    has no entry for ``lang``, the empty session is returned and the file is
    left untouched.

    Args:
        lang: Language identifier; its string form is the lookup key.
        path: Sessions file to read.

    Returns:
        A dict with the ``last_book``, ``last_chapter`` and
        ``completed_books`` keys.
    """
    empty_session = {
        "last_book": '',
        "last_chapter": '',
        "completed_books": []
    }

    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as handle:
            all_sessions = json.load(handle)
        try:
            return all_sessions[f'{lang}']
        except KeyError:
            return empty_session

    # First use: initialise the file with an empty session for this language.
    with open(path, 'w', encoding='utf-8') as handle:
        json.dump({lang: empty_session}, handle, indent=4)
    return empty_session


def load_scrapped_books(lang: int = LANG):
    """Return the books recorded as completed in the scraping sessions file.

    The process exits after printing a message when no book is available,
    since there is nothing to preprocess in that case.

    Args:
        lang: Language identifier whose scraping session is read.

    Returns:
        The ``completed_books`` list of the scraping session of ``lang``.
    """
    # Forward `lang` so that a non-default language reads its own session.
    last_scraping_session = load_last_session(lang=lang, path=SCRAPING_SESSIONS_PATH)
    books = last_scraping_session['completed_books']
    if not books:
        print("Aucun livre à priorit disponible pour cette langue, veuillez lancer le script de scraping pour cela avant de revenir au preprocessing")
        exit()
    return books



def load_all_chapters(lang: str = LANG):
    """Return the parsed JSON content of the chapters file.

    NOTE(review): ``lang`` is accepted but not used here; the file read is
    always ``CHAPTERS_FILE_PATH``.
    """
    with open(CHAPTERS_FILE_PATH, 'r', encoding='utf-8') as source:
        return json.load(source)
    
def json_print(message, json_doc):
    """Print ``message`` followed by ``json_doc`` rendered as indented JSON."""
    print(message)
    rendered = json.dumps(json_doc, indent=4, separators=(',', ': '))
    print(rendered)


In [9]:
# Load all scraped books; load_scrapped_books() exits the application when
# there is none.
scraped_books = load_scrapped_books()
# Load the chapters of each existing book of the language.
books_chapters: dict[str, list] = load_all_chapters()
# Load the last preprocessing session (a dict holding last_book,
# last_chapter and completed_books).
last_session: dict = load_last_session()


Now we can start 

In [25]:
# The chain of transformations applied to our texts
def ponctuation_transformers(text: str) -> str:
    """Isolate punctuation marks and normalise the spacing of ``text``.

    Every character of ``string.punctuation`` is surrounded by spaces,
    leading/trailing whitespace is removed and runs of whitespace are
    collapsed to a single space. A non-empty result that does not end with
    a punctuation mark gets a final `` .`` appended.

    Args:
        text: The sentence (typically one line of a chapter file).

    Returns:
        The normalised sentence; an empty string when ``text`` is empty or
        only whitespace.
    """
    # Surround each punctuation mark with spaces in a single pass.
    isolation_table = str.maketrans(
        {mark: f' {mark} ' for mark in string.punctuation})
    spaced = text.translate(isolation_table)

    # split() + join trims both ends and collapses whitespace runs of any
    # length, which a single replace('  ', ' ') cannot do.
    result = ' '.join(spaced.split())

    # Make sure the sentence ends with a punctuation mark. Blank input is
    # returned as-is instead of failing on result[-1].
    if result and result[-1] not in string.punctuation:
        result += ' .'
    return result
# Intended to collect and store metadata such as:
#   the length (number of words) of the longest string
#   the words dictionary
def words_transformers(text: str) -> str:
    """Placeholder transformer: not implemented yet, always returns None."""
    return None
    

### Application des transformers préalablement définis à tout le contenu
------
Pour l'utiliser, il suffit d'avoir des `transformers` (les fonctions qui traiteront nos données), la liste des chapitres par livre `books_chapters`, la liste des livres scrapés pour la langue `scraped_books` et enfin la dernière session de mise en forme des données pour la langue courante `last_session`.

Il sera question de parcourir les fichiers des chapitres scrapés et, à chaque ligne de chaque fichier, on appliquera le ou les transformers passés en paramètre, puis on sauvegardera le résultat dans un nouveau fichier de sortie qui sera stocké dans le répertoire `./preprocessed_data` selon la même structure que dans le dossier `./data`.


In [10]:
def apply_transformers(transformers: list[Callable[[str], str]], books_chapters: dict[str, list[str]], scraped_books: list[str], last_session: dict):
    """Apply the transformers to every line of every scraped chapter.

    For each book that has been scraped and is not listed as completed in
    ``last_session``, each chapter file ``<DATA_IN_PATH>/<book>/<chapter>.txt``
    is read line by line, every transformer is applied in order to the line
    (without its line terminator) and the result is written to the file with
    the same name under ``DATA_OUT_PATH``. Chapters whose input file is
    missing are recorded in the ``NOT_SCRAPED_CHAPTERS`` journal and skipped.

    Args:
        transformers: Functions taking a string and returning the
            transformed string, applied in list order.
        books_chapters: Maps each book name to the list of its chapters.
        scraped_books: Names of the books that have been scraped.
        last_session: Previous session; its ``last_book``, ``last_chapter``
            and ``completed_books`` keys are read to resume the work.

    NOTE(review): the progress is not saved back to the sessions file here;
    confirm whether a save_current_session() call is expected.
    """
    # The last book of the previous session
    last_book: str = last_session['last_book']
    # The last chapter of the previous session
    last_chapter: str = last_session['last_chapter']
    # Sets give constant-time membership tests inside the loop.
    completed_books = set(last_session['completed_books'])
    scraped = set(scraped_books)

    for book_name, book in books_chapters.items():
        if book_name not in scraped or book_name in completed_books:
            continue

        book_folder_in = get_book_folder_path_for_lang(
            book_name, root=DATA_IN_PATH)
        book_folder_out = get_book_folder_path_for_lang(
            book_name, root=DATA_OUT_PATH)

        # Resume after the last handled chapter of the last handled book.
        # An unknown last chapter restarts the book instead of raising
        # ValueError from list.index().
        first_pending = 0
        if book_name == last_book and last_chapter in book:
            first_pending = book.index(last_chapter) + 1

        for chapter in book[first_pending:]:
            input_path = f'{book_folder_in}/{chapter}.txt'
            output_path = f'{book_folder_out}/{chapter}.txt'

            if not os.path.isfile(input_path):
                # Record the missing chapter in the journal of this language.
                update_json_file(
                    chapter, file_path=NOT_SCRAPED_CHAPTERS, type=dict[list])
                print(
                    f'The chapter {chapter} has not been scrapped, it will be added in journal')
                continue

            # Open both the scraped chapter (input) and the transformed text
            # (output) and do the job.
            with open(input_path, 'r', encoding='utf-8') as input_file, open(output_path, 'w', encoding='utf-8') as output_file:
                for line in input_file:
                    # Drop the line terminator: it is added back on write.
                    txt = line.rstrip('\n')
                    for transformer in transformers:
                        txt = transformer(txt)
                    output_file.write(txt + '\n')
