## Esta parte trata da conversão das entradas do Dataset original para uma série de linhas salvas em um grupo de arquivos .txt localizados na mesma pasta.

In [1]:
# Basic script variables

# Index of the output file currently being written (suffix of the file name).
file_index = 0
# Number of lines already written to the current output file.
line_index = 0
# Lines with at most this many words are treated as headers / list entries.
header_word_limit = 7
# Maximum number of lines stored in a single output file.
line_limit_per_file = 2000000
# NOTE(review): not referenced in the visible part of the script.
checkpoint = 'bert_base_cased'
# Output files are named <base_file_name><file_index>.txt.
base_file_name = "wikipedia_line_file"
# Folder receiving the output files. A forward slash is used as the trailing
# separator because it is accepted on both Windows and POSIX systems, whereas
# a backslash is only a separator on Windows.
dataset_folder_path = 'wikipedia_lines_files/'

In [2]:
import gc
import os
import random
import re
import shutil
from datasets import load_dataset
from tqdm.auto import tqdm

In [3]:
# Load the "20231101.en" configuration of the "wikimedia/wikipedia" dataset.
wikipedia_dataset = load_dataset("wikimedia/wikipedia", "20231101.en")
# Number of entries in the 'train' split; the conversion loop iterates up to it.
max_dataset_index = len(wikipedia_dataset['train'])

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]

In [4]:
def step3_pre_process_lines(lines):
    """Filter `lines` through three stateful rules and return the survivors.

    1. A line that starts with a space is dropped.
    2. After a line ending with ":", every following line is dropped until an
       empty line is reached after at least one non-empty line was seen; that
       empty line is dropped as well. The ":" line itself still goes through
       rule 3.
    3. A non-empty line with at most `header_word_limit` words is treated as
       a header and kept. Empty lines right after it are dropped and the
       following non-empty lines are buffered. If one of the buffered lines
       is itself short (a list entry), the whole buffer is discarded together
       with the empty line that ends it; otherwise the buffer is emitted when
       the next empty line (or the end of the input) is reached.

    Args:
        lines: sequence of strings, one per text line.

    Returns:
        A new list with the lines that were kept, in their original order.
    """
    
    step3_lines = []
    
    # State of rule 2 (block introduced by a line ending with ":").
    colon_line_flag = False
    text_after_colon_flag = False

    # State of rule 3 (block that follows a header line).
    header_found = False
    post_header_text_found = False
    header_list_entry_found = False
    possible_lines = []
    
    for line in lines:
        
        # 1st case: indented line, always dropped. It still counts as text
        # seen after a ":" line.
        if(len(line) > 0 and line[0] == " "):
            if(colon_line_flag):
                text_after_colon_flag = True
            continue

        # 2nd case: inside a ":" block, drop everything until the empty line
        # that follows the block's text.
        if(colon_line_flag):
            if(text_after_colon_flag):
                if(len(line) == 0):
                    colon_line_flag = False
                    text_after_colon_flag = False
            elif(len(line) > 0):
                text_after_colon_flag = True
            continue
        if(len(line) > 0 and line[-1] == ":"):
            colon_line_flag = True
        
        # 3rd case: lines that follow a header.
        if(header_found):
            if(not header_list_entry_found):
                if(post_header_text_found):
                    if(len(line) == 0):
                        # Only long lines were buffered: emit them.
                        step3_lines += possible_lines
                        # Reset the state of the 3rd case; the empty line
                        # itself is appended below.
                        header_found = False
                        post_header_text_found = False
                        header_list_entry_found = False
                        possible_lines = []
                    else:
                        header_list_entry_found = add_possible_header_entry_to_list(possible_lines, line)
                        continue
                else:
                    # Skip the empty lines between the header and its text.
                    post_header_text_found = len(line) > 0
                    if(post_header_text_found):
                        header_list_entry_found = add_possible_header_entry_to_list(possible_lines, line)
                    continue
            else:
                # A short entry was found: drop the rest of the block and the
                # buffer once the terminating empty line is reached.
                if(len(line) == 0):
                    # Reset the state of the 3rd case.
                    header_found = False
                    post_header_text_found = False
                    header_list_entry_found = False
                    possible_lines = []
                continue
        else:
            header_found = len(line) > 0 and (not does_line_exceed_word_limit(line, header_word_limit))
        
        step3_lines.append(line)

    # The input may end without a trailing empty line; emit the buffered
    # prose instead of silently losing the last paragraph.
    if(header_found and post_header_text_found and not header_list_entry_found):
        step3_lines += possible_lines

    return step3_lines

def add_possible_header_entry_to_list(target_list, line):
    """Append `line` to `target_list` and report whether it is a short entry.

    Returns:
        True when `line` does not exceed `header_word_limit` words,
        False otherwise.
    """
    target_list.append(line)
    is_too_long = does_line_exceed_word_limit(line, header_word_limit)
    return not is_too_long

In [5]:
def does_line_exceed_word_limit(line, word_limit):
    """Return True when `line` holds more than `word_limit` words.

    Words are separated by any run of whitespace, so consecutive, leading or
    trailing spaces do not add empty "words" to the count, and tabs separate
    words just like spaces do.

    Args:
        line: the text line to inspect.
        word_limit: the largest word count that is still within the limit.

    Returns:
        True if the number of words is strictly greater than `word_limit`.
    """
    return len(line.split()) > word_limit

In [6]:
def get_entry_lines(entry_index):
    """Return the 'text' field of a 'train' entry as a list of lines.

    The text is split on newline characters; empty lines are kept as
    empty strings.
    """
    entry = wikipedia_dataset['train'][entry_index]
    return entry['text'].split('\n')

def pre_process_lines(lines):
    """Run the four pre-processing steps over `lines` and return the result.

    Step 1 cuts the input at the first line containing "References",
    "See also" or "External links". Step 2 drops lines starting with "|",
    "{" or "!". Step 3 delegates to `step3_pre_process_lines`. Step 4 keeps
    only the lines that exceed `header_word_limit` words.
    """
    stop_markers = ("References", "See also", "External links")
    markup_prefixes = ("|", "{", "!")

    # Step 1: keep everything before the first line mentioning a stop marker.
    before_stop = []
    for current in lines:
        if any(marker in current for marker in stop_markers):
            break
        before_stop.append(current)

    # Step 2: discard lines that begin with one of the markup characters.
    without_markup = [
        current for current in before_stop
        if not current.startswith(markup_prefixes)
    ]

    # Step 3: stateful filtering of indented, colon and header blocks.
    filtered = step3_pre_process_lines(without_markup)

    # Step 4: keep only the lines longer than the header word limit.
    return [
        current for current in filtered
        if does_line_exceed_word_limit(current, header_word_limit)
    ]

In [7]:
def write_lines_to_file(lines, base_file_name, file_index = 0, line_index = 0, line_limit_per_file=160000):
    """Append `lines` to numbered text files, rolling over at a line limit.

    Lines are appended, one per row, to `<base_file_name><file_index>.txt`
    (UTF-8). Whenever the running line count reaches `line_limit_per_file`
    the current file is closed, the count restarts at zero and writing
    continues in the file with the next index.

    Args:
        lines: iterable of strings to write.
        base_file_name: path prefix of the output files.
        file_index: index of the file to start writing to.
        line_index: number of lines already present in that file.
        line_limit_per_file: number of lines after which a new file is started.

    Returns:
        A `(line_index, file_index)` tuple describing where the next call
        should resume.
    """
    file = open(base_file_name + str(file_index) + ".txt", "a", encoding='utf-8')
    # try/finally guarantees the current handle is released even if a write
    # (or opening the next file) raises.
    try:
        for line in lines:
            file.write(line)
            file.write("\n")
            line_index += 1
            if line_index >= line_limit_per_file:
                # Current file is full: move on to the next one.
                line_index = 0
                file.close()
                file_index += 1
                file = open(base_file_name + str(file_index) + ".txt", "a", encoding='utf-8')
    finally:
        file.close()
    return line_index, file_index

In [8]:
def clear_dataset_folder(folder_path):
    """Remove every entry found directly inside `folder_path`.

    Regular files and symbolic links are unlinked; directories are removed
    recursively. A failure on one entry is printed and does not stop the
    remaining entries from being processed. The folder itself is kept.
    """
    for entry_name in os.listdir(folder_path):
        entry_path = os.path.join(folder_path, entry_name)
        try:
            # Links are checked together with files so that a link to a
            # directory is unlinked rather than followed.
            is_file_like = os.path.isfile(entry_path) or os.path.islink(entry_path)
            if is_file_like:
                os.unlink(entry_path)
                continue
            if os.path.isdir(entry_path):
                shutil.rmtree(entry_path)
        except Exception as error:
            print('Deleção de objeto falhou: %s. Mensagem de erro: %s' % (entry_path, error))

In [9]:
# Make sure the output folder exists (a first run would otherwise fail when
# listing it and when opening the output files), then empty it so that the
# conversion does not append to files left over from a previous run.
os.makedirs(dataset_folder_path, exist_ok=True)
clear_dataset_folder(dataset_folder_path)

In [None]:
# Convert every dataset entry into pre-processed lines and append them to the
# output files. Iterating the progress bar itself keeps it in step with the
# loop and lets tqdm close it when the loop ends or is interrupted.
progress_bar = tqdm(range(max_dataset_index))
for dataset_index in progress_bar:
    # line_index / file_index carry the write position from one entry to the next.
    line_index, file_index = write_lines_to_file(
        pre_process_lines(get_entry_lines(dataset_index)),
        dataset_folder_path + base_file_name,
        file_index = file_index,
        line_index = line_index,
        line_limit_per_file = line_limit_per_file)