In [1]:
import gzip
import os
import shutil
import tarfile
import tempfile

def compress_gzip(in_file_path, out_file_path, compresslevel=5):
    """Gzip-compress a single file.

    Args:
        in_file_path: Path of the file to read (opened in binary mode).
        out_file_path: Path of the ``.gz`` file to write; an existing file is
            overwritten.
        compresslevel: Compression level handed to ``gzip.open``. The default
            of 5 was chosen by the original author as "a good balance".
    """
    # The data is streamed through copyfileobj, so the input never has to fit
    # in memory as a whole.
    with open(in_file_path, 'rb') as source, \
            gzip.open(out_file_path, 'wb', compresslevel=compresslevel) as sink:
        shutil.copyfileobj(source, sink)

def compress_targz(in_file_path, out_file_path):
    """Pack one path into a gzip-compressed tar archive.

    Args:
        in_file_path: Path to add to the archive.
        out_file_path: Path of the ``.tar.gz`` archive to create.

    The entry is stored under its base name only, so the directory part of
    ``in_file_path`` does not end up inside the archive.
    """
    with tarfile.open(name=out_file_path, mode="w:gz") as archive:
        archive.add(name=in_file_path, arcname=os.path.basename(in_file_path))

def decompress_gzip(in_file_path, out_file_path):
    """Decompress a gzip file to a plain file.

    Args:
        in_file_path: Path of the ``.gz`` file to read.
        out_file_path: Path of the decompressed file to write; an existing
            file is overwritten.
    """
    # Streamed copy: the decompressed data is never held in memory at once.
    with gzip.open(in_file_path, 'rb') as packed, \
            open(out_file_path, 'wb') as plain:
        shutil.copyfileobj(packed, plain)

def decompress_targz(in_file_path, out_file_path):
    """Extract a ``.tar.gz`` archive and report where its members landed.

    Args:
        in_file_path: Path of the gzip-compressed tar archive.
        out_file_path: Despite the name, the *directory* to extract into.

    Returns:
        A list with one path per archive member, in archive order, each formed
        by joining ``out_file_path`` with the member name. Every member is
        listed, whatever its type.

    NOTE(review): ``extractall`` is called without any member filtering, so
    this should only be pointed at archives from a trusted source.
    """
    with tarfile.open(in_file_path, "r:gz") as archive:
        archive.extractall(path=out_file_path)
        extracted_paths = [
            os.path.join(out_file_path, member.name)
            for member in archive.getmembers()
        ]

    return extracted_paths

def get_multiprocessing_pool(num_processes=None, gb_per_process=None):
    """Create a ``multiprocessing.Pool`` sized by the caller or by the machine.

    Args:
        num_processes: Number of worker processes. Any falsy value (``None``
            or ``0``) means the size is derived from the CPU count and, if
            given, the ``gb_per_process`` limit.
        gb_per_process: Expected amount of RAM, in GB, one process needs. Any
            falsy value means no memory limit is applied.

    Returns:
        The new pool. It is returned open; closing it is left to the caller.

    The sizing decision is reported on stdout. ``multiprocessing`` and
    ``virtual_memory`` (from ``psutil``) are looked up as module-level names,
    so they must have been imported before this function is called.
    """
    core_count = multiprocessing.cpu_count()
    print("Number of cpu : ", core_count)
    total_mem_gb = virtual_memory().total * 1e-9  # total system memory, bytes -> GB
    print("System memory: ", total_mem_gb)

    if num_processes:
        print('Num processes defined by user')
    else:
        # Default to one process per CPU; a memory budget can only lower that.
        limit_message = 'Num processes limited by cpu count'
        num_processes = core_count
        if gb_per_process:
            # Always allow at least one process, even if it exceeds the budget.
            mem_bound = max(int(total_mem_gb / float(gb_per_process)), 1)
            if mem_bound < core_count:
                limit_message = 'Num processes limited by memory'
                num_processes = mem_bound
        print(limit_message)

    print("Num processes:", num_processes)

    return multiprocessing.Pool(processes=num_processes)

def run_apply_async_multiprocessing(func, argument_list, num_processes=None, gb_per_process=None):
    """Run ``func`` once per entry of ``argument_list`` in a process pool.

    Adapted from: https://leimao.github.io/blog/Python-tqdm-Multiprocessing/

    Args:
        func: Callable executed in the worker processes.
        argument_list: Iterable of arguments. A tuple entry is unpacked into
            positional arguments; any other entry is passed as the single
            argument.
        num_processes: Forwarded to ``get_multiprocessing_pool``.
        gb_per_process: Forwarded to ``get_multiprocessing_pool``.

    Returns:
        List of results in the same order as ``argument_list``.

    Raises:
        Whatever a job raised; it is re-raised by ``job.get()`` for the first
        failing job in submission order.

    Progress is shown with ``tqdm``, which must be available as a
    module-level name.
    """
    pool = get_multiprocessing_pool(num_processes=num_processes, gb_per_process=gb_per_process)
    try:
        jobs = [
            pool.apply_async(
                func=func,
                args=tuple(argument) if isinstance(argument, tuple) else (argument,),
            )
            for argument in argument_list
        ]
        # Nothing else will be submitted; workers exit once the tasks are done.
        pool.close()
        result_list = [job.get() for job in tqdm(jobs)]
    except BaseException:
        # A failed job or an interrupt must not leave worker processes behind.
        pool.terminate()
        raise
    finally:
        # Reap the workers; join() is valid after either close() or terminate().
        pool.join()

    return result_list


In [2]:
def targz_to_gzip(filepath, run_dir):
    """Replace a single-file ``.tar.gz`` archive by a plain ``.gz`` of its content.

    The archive is extracted into a private scratch directory below
    ``run_dir``, the contained file is gzip-compressed (level 5) next to the
    original archive as ``<member name>.gz``, and the archive is then deleted.

    Args:
        filepath: Path of the ``.tar.gz`` archive to convert.
        run_dir: Directory for temporary extraction; created if missing.

    Raises:
        ValueError: If the archive does not contain exactly one regular file.
            Nothing is written next to the archive and the archive itself is
            left in place in that case.
    """
    os.makedirs(run_dir, exist_ok=True)
    # Each call gets its own scratch directory so that parallel workers cannot
    # overwrite each other's files when archives contain same-named members.
    scratch_dir = tempfile.mkdtemp(dir=run_dir)
    try:
        extracted_paths = decompress_targz(in_file_path=filepath, out_file_path=scratch_dir)
        payload_paths = [path for path in extracted_paths if os.path.isfile(path)]
        if len(payload_paths) != 1:
            # Converting only one member and then deleting the archive would
            # silently drop the others, so refuse instead.
            raise ValueError(
                "Expected exactly one file in {}, found {}; archive left untouched".format(
                    filepath, len(payload_paths)))
        payload_path = payload_paths[0]

        gz_file_path = os.path.join(os.path.dirname(filepath), os.path.basename(payload_path) + '.gz')
        compress_gzip(payload_path, gz_file_path, compresslevel=5)
    finally:
        # Remove the extracted data whether or not the conversion succeeded.
        shutil.rmtree(scratch_dir, ignore_errors=True)

    # Remove the original archive, unless the new .gz was written to the very
    # same path (archive "X.tar.gz" containing "X.tar"), which would delete
    # the result that was just produced.
    if os.path.abspath(gz_file_path) != os.path.abspath(filepath):
        os.remove(filepath)

In [3]:
# Root directory that is searched recursively for .tar.gz archives.
data_dir = '/home/sam/Documents/ESA/data/sim/simput'

# Alternative root for a small test run:
# data_dir = '/home/sam/Documents/ESA/data/sim/test_dataset'

# Full path of every file below data_dir whose name ends in '.tar.gz'.
tar_gz_files = [
    os.path.join(folder, entry)
    for folder, _subfolders, entries in os.walk(data_dir)
    for entry in entries
    if entry.endswith('.tar.gz')
]

In [4]:
# Scratch directory the archives are extracted into while being converted.
run_dir = '/home/sam/Documents/ESA/data/sim/tmp_gzip'
# exist_ok keeps the cell safe to re-run and avoids the check-then-create race.
os.makedirs(run_dir, exist_ok=True)

In [5]:
from tqdm import tqdm
import multiprocessing
from psutil import virtual_memory

# One (archive path, scratch directory) argument tuple per archive to convert.
argument_list = [(filepath, run_dir) for filepath in tar_gz_files]

# num_processes=0 is falsy, so the pool size is derived from the CPU count and
# the 0.1 GB-per-process memory budget instead of being fixed by hand.
run_apply_async_multiprocessing(func=targz_to_gzip, argument_list=argument_list, num_processes=0, gb_per_process=0.1)

print("Done")

Number of cpu :  12
System memory:  16.520716288000003
Num processes limited by cpu count
Num processes: 12


0it [00:00, ?it/s]

Done





In [6]:
# Remove the scratch directory run_dir, including anything still inside it.
shutil.rmtree(run_dir)