In [44]:
import os
import zipfile
import glob
import tarfile
import lzma
from tqdm import tqdm
import concurrent.futures
import random


In [9]:
def unzip_file(zip_filepath, dest_dir):
    """Extract every member of a zip archive into ``dest_dir``.

    This is a best-effort helper: nothing is raised to the caller. A one-line
    status message is printed instead, both on success and on failure.

    Args:
        zip_filepath: Path of the ``.zip`` archive to read.
        dest_dir: Directory the archive members are extracted into.

    Returns:
        None.
    """
    try:
        with zipfile.ZipFile(zip_filepath, mode='r') as archive:
            archive.extractall(path=dest_dir)
    except Exception as err:
        # Checked from most specific to most general, mirroring the order in
        # which dedicated handlers would be tried.
        if isinstance(err, FileNotFoundError):
            status = f"The file {zip_filepath} does not exist."
        elif isinstance(err, PermissionError):
            status = f"Permission denied for reading the file {zip_filepath} or writing to the directory {dest_dir}."
        elif isinstance(err, zipfile.BadZipFile):
            status = f"The file {zip_filepath} is not a zip file."
        else:
            status = f"An unexpected error occurred: {err}"
    else:
        status = f"Files extracted successfully to {dest_dir}"
    print(status)

# Usage
# unzip_file('../compressed/subsets-20240309T104346Z-002.zip', '../uncompressed')


In [11]:
# Extract every downloaded zip archive into ../uncompressed.
# glob already returns a path that can be opened as-is, so it is passed through
# unchanged. Splitting on "\\" and rebuilding the path only worked with Windows
# separators; sorting keeps the extraction order stable between runs.
file_list = sorted(glob.glob('../compressed/*.zip'))
for file in file_list:
    unzip_file(file, '../uncompressed')

Files extracted successfully to ../uncompressed
Files extracted successfully to ../uncompressed
Files extracted successfully to ../uncompressed
Files extracted successfully to ../uncompressed
Files extracted successfully to ../uncompressed
Files extracted successfully to ../uncompressed
Files extracted successfully to ../uncompressed


In [14]:
def _reject_unsafe_tar_members(tar_ref, dest_dir):
    """Raise ``tarfile.TarError`` if a member or link target escapes ``dest_dir``.

    Fallback check for interpreters that predate tarfile extraction filters.
    """
    root = os.path.realpath(dest_dir)
    for member in tar_ref.getmembers():
        target = os.path.realpath(os.path.join(root, member.name))
        if os.path.commonpath([root, target]) != root:
            raise tarfile.TarError(
                f"member {member.name!r} would be extracted outside {dest_dir}"
            )
        if member.issym():
            # Symlink targets are resolved relative to the link's own directory.
            link_target = os.path.join(os.path.dirname(target), member.linkname)
        elif member.islnk():
            # Hard-link targets are resolved relative to the archive root.
            link_target = os.path.join(root, member.linkname)
        else:
            continue
        if os.path.commonpath([root, os.path.realpath(link_target)]) != root:
            raise tarfile.TarError(
                f"link {member.name!r} points outside {dest_dir}"
            )


def extract_tar_file(tar_filepath, dest_dir):
    """Extract a tar archive into ``dest_dir`` without letting members escape it.

    This is a best-effort helper: nothing is raised to the caller. A one-line
    status message is printed instead, both on success and on failure.

    Args:
        tar_filepath: Path of the tar archive to read (compression is
            auto-detected by ``tarfile.open`` in ``'r'`` mode).
        dest_dir: Directory the archive members are extracted into.

    Returns:
        None.
    """
    try:
        with tarfile.open(tar_filepath, 'r') as tar_ref:
            if hasattr(tarfile, 'data_filter'):
                # The 'data' filter refuses absolute paths, '..' traversal,
                # links pointing outside the destination and special files.
                tar_ref.extractall(dest_dir, filter='data')
            else:
                # Older interpreters: validate the members ourselves first.
                _reject_unsafe_tar_members(tar_ref, dest_dir)
                tar_ref.extractall(dest_dir)
        print(f"Files extracted successfully to {dest_dir}")
    except FileNotFoundError:
        print(f"The file {tar_filepath} does not exist.")
    except PermissionError:
        print(f"Permission denied for reading the file {tar_filepath} or writing to the directory {dest_dir}.")
    except tarfile.ReadError:
        print(f"The file {tar_filepath} is not a tar file.")
    except tarfile.TarError as e:
        # Raised when a member is rejected as unsafe (or the archive is otherwise invalid).
        print(f"Extraction of {tar_filepath} was aborted: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# Usage
# extract_tar_file('path_to_your_tar_file.tar', 'destination_directory')


In [16]:
# Unpack every subset tarball produced by the zip step into ../data.
# The path returned by glob is used directly. Splitting on "\\" only isolates
# the file name with Windows separators; elsewhere the rebuilt path pointed at
# a location that does not exist.
file_list = sorted(glob.glob('../uncompressed/subsets/*.tar'))
for file in file_list:
    f_name = os.path.basename(file)  # portable way to get just the file name
    print(f_name)
    extract_tar_file(file, '../data')

urlsf_subset00.tar
Files extracted successfully to ../data
urlsf_subset01.tar
Files extracted successfully to ../data
urlsf_subset02.tar
Files extracted successfully to ../data
urlsf_subset03.tar
Files extracted successfully to ../data
urlsf_subset04.tar
Files extracted successfully to ../data
urlsf_subset05.tar
Files extracted successfully to ../data
urlsf_subset06.tar
Files extracted successfully to ../data
urlsf_subset07.tar
Files extracted successfully to ../data
urlsf_subset08.tar
Files extracted successfully to ../data
urlsf_subset09.tar
Files extracted successfully to ../data
urlsf_subset10.tar
Files extracted successfully to ../data
urlsf_subset11.tar
Files extracted successfully to ../data
urlsf_subset12.tar
Files extracted successfully to ../data
urlsf_subset13.tar
Files extracted successfully to ../data
urlsf_subset14.tar
Files extracted successfully to ../data
urlsf_subset15.tar
Files extracted successfully to ../data
urlsf_subset16.tar
Files extracted successfully to ../da

In [18]:
# def decompress_xz_file(xz_filepath, dest_filepath):
#     try:
#         with lzma.open(xz_filepath, 'rb') as f_in, open(dest_filepath, 'wb') as f_out:
#             f_out.write(f_in.read())
#         print(f"File decompressed successfully to {dest_filepath}")
#     except FileNotFoundError:
#         print(f"The file {xz_filepath} does not exist.")
#     except PermissionError:
#         print(f"Permission denied for reading the file {xz_filepath} or writing to the file {dest_filepath}.")
#     except lzma.LZMAError:
#         print(f"The file {xz_filepath} is not a xz file.")
#     except Exception as e:
#         print(f"An unexpected error occurred: {e}")
# 
# # Usage
# # decompress_xz_file('path_to_your_xz_file.xz', 'destination_file')


In [48]:
# Directory that is scanned for .xz files further down in the notebook.
folder_path = "../data/openwebtext"
# Text file that receives the decompressed text of the sampled training files.
output_file_train = "../data/output_train.txt"
# Text file that receives the decompressed text of the sampled validation files.
output_file_val = "../data/output_val.txt"
# Destination for the character vocabulary, written one character per line.
vocab_file = "../data/vocab.txt"

In [49]:
def xz_files_in_dir(directory):
    """Return the names of the regular ``.xz`` files directly inside ``directory``.

    The names are returned in sorted order. ``os.listdir`` yields entries in
    arbitrary order, so anything that splits this list by position would
    otherwise differ between runs and machines.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A sorted list of file names (not full paths) ending in ``.xz``.
        Directories whose name ends in ``.xz`` are excluded.
    """
    return sorted(
        filename
        for filename in os.listdir(directory)
        if filename.endswith(".xz") and os.path.isfile(os.path.join(directory, filename))
    )

In [55]:
# Sort the listing so the positional 90/10 split below is the same on every
# run and machine (directory listing order is not guaranteed).
files = sorted(xz_files_in_dir(folder_path))
total_files = len(files)
split_index = int(total_files * 0.9)  # 90% for training
files_train = files[:split_index]
files_val = files[split_index:]

# Sampling a hundredth of the files for each split.
sample_rate = 0.01
sample_seed = 42  # fixed seed so the same files are picked on every run
rng = random.Random(sample_seed)  # local generator; leaves the global random state alone

# Take at least one file per split, but never more than the split holds.
# Asking for 1 item from an empty split would raise ValueError.
files_train_sampled = rng.sample(
    files_train, min(len(files_train), max(1, int(len(files_train) * sample_rate)))
)
files_val_sampled = rng.sample(
    files_val, min(len(files_val), max(1, int(len(files_val) * sample_rate)))
)

# Ensure output files are empty before appending
for output_path in (output_file_train, output_file_val):
    with open(output_path, 'w'):
        pass



In [56]:
# def process_file(args):
#     print(1)
#     directory, filename, output_file, vocab = args
#     print(2)
#     file_path = os.path.join(directory, filename)
#     with lzma.open(file_path, "rt", encoding="utf-8") as infile:
#         text = infile.read()
#     with open(output_file, "a", encoding="utf-8") as outfile:
#         outfile.write(text)
#     characters = set(text)
#     return characters


def process_files_in_parallel(files, folder_path, output_file):
    """Decompress ``.xz`` text files, append them to one file, collect their characters.

    Despite the name, the files are processed one after another in the calling
    process. Each file is read fully into memory as UTF-8 text and appended to
    ``output_file``, which is opened in append mode and never truncated here.

    Args:
        files: Iterable of file names located inside ``folder_path``.
        folder_path: Directory that contains the ``.xz`` files.
        output_file: Path of the text file the decompressed text is appended to.

    Returns:
        The set of distinct characters seen across all processed files
        (an empty set when ``files`` is empty).
    """
    seen_chars = set()
    for name in files:
        source_path = os.path.join(folder_path, name)
        with lzma.open(source_path, "rt", encoding="utf-8") as compressed:
            contents = compressed.read()
        with open(output_file, "a", encoding="utf-8") as sink:
            sink.write(contents)
        seen_chars |= set(contents)
    return seen_chars

In [57]:
# Build each split's text file and collect the characters it contains:
# first the sampled training files, then the sampled validation files.
vocab_train = process_files_in_parallel(files_train_sampled, folder_path, output_file_train)
vocab_val = process_files_in_parallel(files_val_sampled, folder_path, output_file_val)

# Merge both character sets and write them to vocab.txt in sorted order,
# one character followed by a newline per entry.
vocab = vocab_train | vocab_val
with open(vocab_file, "w", encoding="utf-8") as vfile:
    vfile.writelines(char + '\n' for char in sorted(vocab))