In [1]:
import re, os

In [3]:
class Extragatorinatorul:
    def __init__(self, DIACRITICS_THRESHOLD, allowedCharacters):
        self.DIACRITICS_THRESHOLD = DIACRITICS_THRESHOLD
        self.allowedCharacters = allowedCharacters
        self.romanian_count = 0
        self.english_count = 0
        self.cut_before_count = 0
        self.cut_after_count = 0
    
    def is_romanian(self, text):
        romanian_chars = 'ăâîșțĂÂÎȘȚ'
        count = sum(1 for char in text if char in romanian_chars)
        # print('Count:', count)
        return count > self.DIACRITICS_THRESHOLD

    def remove_not_allowed_chars(self, text):
        return ''.join(char for char in text if char in self.allowed_chars)
    
    def cut_before_introduction(self, text):
        match = re.search(r'\*\*introducere\*\*', text, flags=re.IGNORECASE) 
        match = match or re.search(r'# introducere', text, flags=re.IGNORECASE)
        match = match or re.search(r'\*\*argument', text, flags=re.IGNORECASE)
        match = match or re.search(r'# argument', text, flags=re.IGNORECASE)
        
        if match:
            self.cut_before_count += 1
            return text[match.start() - 1:]
        return text
    
    def cut_after_bibliography(self, text):
        match = re.search(r'\*\*bibliografie[:]*\*\*', text, flags=re.IGNORECASE) 
        match = match or re.search(r'[#]+ bibliografie', text, flags=re.IGNORECASE)
        match = match or re.search(r'\*\*REFERINȚE BIBLIOGRAFICE[:]*\*\*', text, flags=re.IGNORECASE)
        match = match or re.search(r'[#]+ REFERINȚE BIBLIOGRAFICE', text, flags=re.IGNORECASE)
            
        if match:
            self.cut_after_count += 1
            return text[:match.start()]
        return text
    
    def replace_white_chars_with_space(self, text):
        text = re.sub(r'\s+', ' ', text)
        return text
    
    def remove_page_separators(self, text):
        pattern = r"\n*\d*\n*-----\n"
        cleaned_text = re.sub(pattern, "\n", text)
        return cleaned_text
    
    
    def replace_similar_chars(self, text):
        text = text.replace('ş', 'ș')
        text = text.replace('ţ', 'ț')
        text = text.replace('Ş', 'Ș')
        text = text.replace('Ţ', 'Ț')
        return text
    
    def remove_table_captions(self, text):
        # Tabelul 2.1: Caracteristici ale materialelor piezoelectrice [5]

        pattern = r'\n\*?(?:Tabelul|Tabel|Table|Tab)[^\n]+\n'
        cleaned_text = re.sub(pattern, '\n', text, flags=re.IGNORECASE)
        
        # Remove any double newlines that might result from removal
        cleaned_text = re.sub(r'\n\n+', '\n\n', cleaned_text)
        
        return cleaned_text
    
    def remove_figure_captions(self, text):
        # Fig. 1.9. [title] or Figura 1.9: [title] or Grafic ...
        pattern = r'\n\*?(?:Fig|Figura|Grafic|Graphic|Legenda|Ex|Sursa)[^\n]+\n'
        cleaned_text = re.sub(pattern, '\n', text, flags=re.IGNORECASE)
        
        
        # Remove any double newlines that might result from removal
        cleaned_text = re.sub(r'\n\n+', '\n\n', cleaned_text)
        
        return cleaned_text
    
    def remove_sup_tags(self, text):
        
        # Remove <sup> at the end of the page
        text = re.sub(r'[\n][ ]?<sup>[^<]*</sup>[^\n]*\n', '', text)
        
        # Remove <sup> tags from texts
        text = re.sub(r'([^\n])<sup>[^<]*</sup>', r'\1', text)
            
        return text
    
    def remove_span_tags(self, text):
        # Remove <span> tags from texts
        cleaned_text = re.sub(r'<span[^<]*</span>', '', text)
            
        return cleaned_text
    
    # # def remove_titles_markdown(self, text):
    #     # Remove titles that are formatted with #
    #     cleaned_text = re.sub(r'^[#]+[ ]*', '', text, flags=re.MULTILINE)
        
    #     # Remove titles that are formatted with **
    #     cleaned_text = re.sub(r'\*\*', '', cleaned_text)
        
    #     return cleaned_text
    
    def remove_markdown(self, text):
        # Remove markdown formatting
        cleaned_text = re.sub(r'\*\*', '', text)
        cleaned_text = re.sub(r'\*', '', cleaned_text)
        
        # Remove digits in square brackets (e.g., [1], [2], [12,13] etc.)
        cleaned_text = re.sub(r' *\[[\d, \\\(\)\[\].-]+\]', '', cleaned_text)
        
        # Remove square brackets but keep the content (e.g., [link text] -> link text)
        cleaned_text = re.sub(r'\[(.*?)\]', r'\1', cleaned_text)
        
        return cleaned_text
    
    def remove_page_references(self, text):
        text = re.sub(r'\(#page-\d+-\d+\)', '', text)
        
        return text
    
    def remove_bad_break_lines(self, text):
        # Remove pattern not punctuation sign new line new line
        pattern = r'([^\s\.\!\?])\n\n([^\s\.\!\?])'
        cleaned_text = re.sub(pattern, r'\1 \2', text)
        return cleaned_text
    
    def remove_neclasificat(self, text):
        # remove "\nNECLASIFICAT [digits] din [digits]\n"
        pattern = r'\nNECLASIFICAT[ ]*\d+[ ]*din[ ]*\d+\n'
        cleaned_text = re.sub(pattern, '\n', text)
        # Remove any double newlines that might result from removal
        cleaned_text = re.sub(r'\n\n+', '\n\n', cleaned_text)
        return cleaned_text
    
    def remove_bad_page_numbers(self, text):
        #remove '\n'number'\n'
        text = re.sub(r'\n\d+\n', '\n', text)
    
        return text
        
    def parse(self, text):
        text = self.replace_similar_chars(text)
        
        if self.is_romanian(text):
            self.romanian_count += 1
        else:
            self.english_count += 1
            return ""
            
        # print('INITIAL:\n', text[:250], end='\n--\n')
        text = self.cut_before_introduction(text)
        # print('CUT BEFORE:\n', text[:250],end='\n--\n')
        # text = self.cut_after_bibliography(text)
        
        text = self.remove_page_references(text)
        text = self.remove_markdown(text)
        
        text = self.remove_span_tags(text)
        text = self.remove_sup_tags(text)
        text = self.remove_figure_captions(text)
        text = self.remove_table_captions(text)
        
        # SRI doctoral thesis have "Neclasificat" on every page
        text = self.remove_neclasificat(text)
        text = self.remove_bad_page_numbers(text)
        # text = self.remove_bad_break_lines(text)
        
        # print('CUT AFTER:\n', text[len(text)-250:],end='\n--\n')
        
        # text = self.remove_page_separators(text)
        
        # text = self.replace_alpha_newline_alpha_with_space(text)
        
        # text = self.replace_white_chars_with_space(text)
        # print('AFTER CLEAN=', text[:250],end='\n--\n')
        return text
    
    def print_counters(self):
        print('Romanian:', self.romanian_count)
        print('English:', self.english_count)
        print('Cut before:', self.cut_before_count)
        print('Cut after:', self.cut_after_count)

In [4]:
# Printable ASCII (code points 32..126) plus the Romanian diacritics.
allowed_chars = ''.join(map(chr, range(32, 127))) + 'ăâîșțĂÂÎȘȚ'
# Passed to Extragatorinatorul as its diacritics threshold.
DIACRITICS_THRESHOLD = 1250

### Single-file run of the script, for testing

In [None]:
# file_path = '../mds_marker_0_500/2/2.md'
# with open(file_path, 'r', encoding='utf-8') as file:
#     text = file.read()
    
# extragtor = Extragatorinatorul(DIACRITICS_THRESHOLD, allowed_chars)
# text = extragtor.parse(text)

# if text == "":
#     print('Text is not in Romanian')
# else:
#     print('Text is in Romanian')
#     extragtor.print_counters()
#     print(text)

Text is in Romanian
Romanian: 1
English: 0
Cut before: 0
Cut after: 0
### MINISTERUL EDUCAȚIEI UNIVERSITATEA ,,VALAHIA" din TARGOVISTE IOSUD – ȘCOALA DOCTORALĂ DE ȘTIINȚE ECONOMICE ȘI UMANISTE DOMENIUL FUNDAMENTAL ȘTIINȚE ECONOMICE DOMENIUL CONTABILITATE

Digitally signed by Rodica-Mariana Ion Date: 2024.10.23 12:07:46 +03'00'

# TEZĂ DE DOCTORAT

## CONDUCĂTOR DE DOCTORAT, Conf. univ. dr. habil. Dan Marius COMAN

Doctorand,

Anca Daniela ȘENCHEA

TÂRGOVIȘTE 2024

# POSIBILITĂȚI DE UTILIZARE A INTELIGENȚEI ARTIFICIALE ÎN ACTIVITATEA DE AUDIT INTERN DIN CADRUL INSTITUȚIILOR PUBLICE

CONDUCĂTOR DE DOCTORAT, Conf. univ. dr. habil. Dan Marius COMAN

> Doctorand, Anca Daniela ȘENCHEA

TÂRGOVIȘTE 2024

## 1. CUPRINS

## 1. INTRODUCERE

În contextul evoluției rapide a tehnologiei și a schimbărilor continue din mediul de afaceri și din instituții publice, auditul intern deține un rol semnificativ în asigurarea transparenței, integrității și eficienței operaționale a acestora. Cu toate acestea,

### Folder extraction: fill in the input and output folder paths

In [7]:
def extract_folder(folder_path_input, folder_path_output):
    """Clean every document found in a folder and write the results out.

    Each entry of *folder_path_input* is parsed with ``Extragatorinatorul``.
    Texts for which ``parse`` returns ``""`` are skipped; the others are
    written to *folder_path_output* under the entry name, with a ``.md``
    suffix appended when missing. The extractor's counters are printed at
    the end.

    When the input path contains ``'marker'``, every entry is treated as a
    directory holding a file named ``<entry>.md``.

    Args:
        folder_path_input: folder whose entries are processed.
        folder_path_output: folder receiving the cleaned ``.md`` files; it is
            created if it does not exist.
    """
    try:
        file_names = os.listdir(folder_path_input)
    except OSError as e:
        # Best effort: report the problem and carry on with nothing to do.
        print(f"An error occurred: {e}")
        file_names = []

    print("Total file names=", len(file_names))
    extractor = Extragatorinatorul(DIACRITICS_THRESHOLD, allowed_chars)

    # Without this the first write fails when the output folder is missing.
    if file_names and folder_path_output:
        os.makedirs(folder_path_output, exist_ok=True)

    is_marker_layout = 'marker' in folder_path_input

    for file_name in file_names:
        file_path_input = os.path.join(folder_path_input, file_name)
        if is_marker_layout:
            file_path_input = os.path.join(file_path_input, file_name + '.md')

        # isfile (not exists): a directory entry would make open() raise.
        if not os.path.isfile(file_path_input):
            print(f"File not found: {file_path_input}")
            continue

        with open(file_path_input, 'r', encoding='utf-8') as file:
            text = file.read()

        extracted_text = extractor.parse(text)
        if not extracted_text:
            continue

        file_path_output = os.path.join(folder_path_output, file_name)
        if not file_path_output.endswith('.md'):
            file_path_output += '.md'

        with open(file_path_output, 'w', encoding='utf-8') as file:
            file.write(extracted_text)

    extractor.print_counters()

In [8]:
extract_folder('../mds_marker_0_500', '../mds_marker_0_500_clean')

Total file names= 490
Romanian: 367
English: 123
Cut before: 287
Cut after: 0


# Old code (kept for reference)

In [None]:
def extract_folder(folder_path_input, folder_path_output):
    """Parse every entry of the input folder and save the non-empty results.

    Each input path is printed before it is read. An entry whose parsed text
    is the empty string produces no output file. The output file keeps the
    same name as the input entry. The extractor's counters are printed once
    all entries have been handled.
    """
    entries = []
    try:
        entries = list(os.listdir(folder_path_input))
    except Exception as e:
        print(f"An error occurred: {e}")

    print("Total file names=", len(entries))
    cleaner = Extragatorinatorul(DIACRITICS_THRESHOLD, allowed_chars)

    for entry in entries:
        source_path = os.path.join(folder_path_input, entry)
        print(source_path)

        with open(source_path, 'r', encoding='utf-8') as source:
            raw_text = source.read()

        cleaned = cleaner.parse(raw_text)

        # Only texts that survived parsing are written out.
        if cleaned != "":
            target_path = os.path.join(folder_path_output, entry)
            with open(target_path, 'w', encoding='utf-8') as target:
                target.write(cleaned)

    cleaner.print_counters()

In [None]:
# import re

# DIACRITICS_THRESHOLD = 500
# extraction_cut_begin_success_count = 0
# extraction_cut_end_success_count = 0
# romanian_count = 0

# def is_romanian(text):
#     romanian_chars = 'ăâîșțĂÂÎȘȚ'
#     count = sum(1 for char in text if char in romanian_chars)
#     print('romanian characters: ', count)
#     return count > DIACRITICS_THRESHOLD

# def clean_text(text):
#     allowed_chars = ''.join(chr(i) for i in range(32, 127)) + 'ăâîșțĂÂÎȘȚ'
#     return ''.join(char for char in text if char in allowed_chars)

# def extract_text(file_path, min_length=0):
#     global extraction_cut_begin_success_count, extraction_cut_end_success_count, romanian_count
    
#     with open(file_path, 'r', encoding='utf-8') as file:
#         text = file.read()
    
    
#     if is_romanian(text):
#         start_word = r'introducere\s'
#         end_word = r'bibliografie\s'
#         print("Detected language: Romanian")
#         romanian_count += 1
#     else:
#         start_word = r'introduction\s'
#         end_word = r'bibliography\s'
#         print("Detected language: English")

#     end_split = re.split(end_word, text, flags=re.IGNORECASE)
#     if len(end_split) > 1:
#         # print("End cut position:", text.rfind(end_split[-1]))
#         text = text[:text.rfind(end_split[-1])]
#         extraction_cut_end_success_count += 1
    
#     start_split = re.split(start_word, text, flags=re.IGNORECASE)
#     if len(start_split) > 1:
#         # print("Start cut position:", text.find(start_split[1]))
#         text = start_split[1]
#         extraction_cut_begin_success_count += 1
        
#     text = clean_text(text)

#     return text

# # # Example usage
# file_path = '../testdoc1/11050_teza.txt'
# text = extract_text(file_path)
# print(text)

# # with open(file_path[:-4] + '_output.txt', 'w', encoding='utf-8') as file:
# #     file.write(text)



In [None]:
# import os

# # extract text for the files from a directory
# def extract_folder(folder_path_input, folder_path_output):
#     global extraction_cut_begin_success_count, extraction_cut_end_success_count, romanian_count
#     extraction_cut_begin_success_count = 0
#     extraction_cut_end_success_count = 0
#     romanian_count = 0
#     try:
#         file_names = [f for f in os.listdir(folder_path_input)]
#     except Exception as e:
#         print(f"An error occurred: {e}")

    
#     print("Total file names=", len(file_names))
    
#     for file_name in file_names:
#         file_path_input = os.path.join(folder_path_input, file_name)
#         print(file_path_input)
#         extracted_text = extract_text(file_path_input)
        
#         file_path_output = os.path.join(folder_path_output, file_name)
#         with open(file_path_output, 'w',  encoding='utf-8') as file:
#             file.write(extracted_text)
        
#     print("Total file names=", len(file_names))
#     print("extraction_cut_begin_success_count=", extraction_cut_begin_success_count)
#     print("extraction_cut_end_success_count=", extraction_cut_end_success_count)
#     print("romanian_count = ", romanian_count)
    
# extract_folder('../testdoc1', '../testdoc1_clean')

