In [1]:
import os
import shutil
from pathlib import Path
import tarfile
from tqdm import tqdm


In [2]:
def copy_gif_files_from_subdirs(main_directory, dest_directory):
    """Copy every ``.gif`` under each first-level subdirectory into a flat mirror.

    For each directory ``main_directory/<name>`` a directory
    ``dest_directory/<name>`` is created (even when no ``.gif`` is found) and
    every ``.gif`` located anywhere below ``main_directory/<name>`` is copied
    directly into it. The nested layout is flattened and the folder name is
    kept unchanged. Plain files sitting directly in ``main_directory`` are
    ignored.

    Args:
        main_directory: Directory whose immediate subdirectories are scanned.
        dest_directory: Directory that receives one folder per scanned
            subdirectory; it is created on demand.

    Raises:
        OSError: If ``main_directory`` cannot be listed or a copy fails.

    Note:
        Because the tree is flattened, two ``.gif`` files with the same name
        in different nested folders map to the same destination file; the one
        visited last by ``os.walk`` overwrites the earlier one.
    """
    for entry_name in os.listdir(main_directory):
        subdir_path = os.path.join(main_directory, entry_name)

        # Only first-level directories are mirrored; loose files are skipped.
        if not os.path.isdir(subdir_path):
            continue

        subdir_gif_dest = os.path.join(dest_directory, entry_name)
        # exist_ok avoids the check-then-create race and keeps re-runs idempotent.
        os.makedirs(subdir_gif_dest, exist_ok=True)

        # Walk the whole subtree; every match lands directly in subdir_gif_dest.
        for root, _dirs, files in os.walk(subdir_path):
            for file in files:
                if file.endswith('.gif'):
                    src_path = os.path.join(root, file)
                    dest_path = os.path.join(subdir_gif_dest, file)
                    shutil.copy2(src_path, dest_path)




In [3]:
def process_tarball(tarball_path, extract_temp_dir, destination_directory):
    """Extract one ``.tar.gz`` archive and copy its ``.gif`` files to a mirror tree.

    The archive is unpacked into ``extract_temp_dir``. The folder it produces
    is expected to be named after the last ``_``-separated token of the
    archive file name with ``.tar.gz`` removed (``foo_disc1.tar.gz`` ->
    ``disc1``). Every ``.gif`` below that folder is copied into
    ``destination_directory`` under the same path it has relative to
    ``extract_temp_dir`` (folder names are kept unchanged). The extracted
    folder is removed afterwards, also when copying fails.

    Args:
        tarball_path: Path to the gzip-compressed tar archive.
        extract_temp_dir: Scratch directory the archive is unpacked into.
        destination_directory: Root of the tree that receives the ``.gif`` files.

    Raises:
        tarfile.TarError: If the archive cannot be read or extracted.
        OSError: If copying fails or the expected extracted folder is missing.
    """
    with tarfile.open(tarball_path, 'r:gz') as tar:
        # The archive is external data: use the 'data' extraction filter when
        # this Python provides it so members cannot be written outside
        # extract_temp_dir. Older interpreters fall back to the plain call.
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(path=extract_temp_dir, filter='data')
        else:
            tar.extractall(path=extract_temp_dir)

    # Extracted folder is based on the "discX" part of the filename
    extracted_dir_name = os.path.basename(tarball_path).split('_')[-1].replace('.tar.gz', '')
    extracted_dir_path = os.path.join(extract_temp_dir, extracted_dir_name)

    try:
        # Walk once and collect the matches up front, so a single progress
        # bar with a known total covers the whole copy.
        gif_files = [
            (root, file)
            for root, _dirs, files in os.walk(extracted_dir_path)
            for file in files
            if file.endswith('.gif')
        ]

        for root, file in tqdm(gif_files, desc="Copying .gif files"):
            # Mirror the path relative to the scratch directory.
            relative_path = os.path.relpath(root, extract_temp_dir)
            dest_dir = os.path.join(destination_directory, relative_path)
            os.makedirs(dest_dir, exist_ok=True)

            shutil.copy2(os.path.join(root, file), os.path.join(dest_dir, file))
    finally:
        # Always drop the extracted tree so a failed copy does not leave it behind.
        shutil.rmtree(extracted_dir_path)

def process_all_tarballs(source_directory, destination_directory):
    """Extract every ``.tar.gz`` in ``source_directory`` and collect its ``.gif`` files.

    Archives are handled one at a time, in sorted name order. Each one is
    unpacked directly into ``source_directory``; the folder it produces is
    expected to be named after the last ``_``-separated token of the archive
    file name with ``.tar.gz`` removed (``foo_disc1.tar.gz`` -> ``disc1``).
    That folder is handed to ``copy_gif_files_from_subdirs`` with
    ``destination_directory/<folder>`` as target and is deleted afterwards,
    also when extraction or copying fails.

    Args:
        source_directory: Directory containing the ``.tar.gz`` archives; also
            used as the extraction location.
        destination_directory: Root directory that receives one folder per archive.

    Raises:
        tarfile.TarError: If an archive cannot be read or extracted.
        OSError: If listing, copying, or cleanup fails.

    Note:
        A pre-existing ``source_directory/<folder>`` with the same name as an
        archive's extracted folder is removed together with the extracted data.
    """
    # os.listdir order is arbitrary; sort so runs are reproducible.
    tarballs = sorted(f for f in os.listdir(source_directory) if f.endswith('.tar.gz'))

    # Process each tarball one by one with progress bar
    for tarball in tqdm(tarballs, desc="Processing tarballs"):
        tarball_path = os.path.join(source_directory, tarball)

        # Extracted folder is based on the "discX" part of the filename
        extracted_dir_name = tarball.split('_')[-1].replace('.tar.gz', '')
        disk_path = os.path.join(source_directory, extracted_dir_name)
        processed_path = os.path.join(destination_directory, extracted_dir_name)

        try:
            with tarfile.open(tarball_path, 'r:gz') as tar:
                members = tqdm(tar.getmembers(), desc="Extracting files")
                # extractall applies directory attributes only after all
                # members are written, which per-member extract() does not;
                # passing the wrapped member list keeps the progress bar.
                # The 'data' filter (when this Python has it) stops members
                # from being written outside source_directory.
                if hasattr(tarfile, 'data_filter'):
                    tar.extractall(path=source_directory, members=members, filter='data')
                else:
                    tar.extractall(path=source_directory, members=members)

            copy_gif_files_from_subdirs(disk_path, processed_path)
        finally:
            # Remove the extracted tree even on failure; it may be missing if
            # extraction never got as far as creating it.
            if os.path.isdir(disk_path):
                shutil.rmtree(disk_path)

In [4]:
# Paths. The defaults are the original machine-specific locations; set
# OASIS_SOURCE_DIR / OASIS_DEST_DIR in the environment to run elsewhere
# without editing this cell.
source_directory = os.environ.get(
    "OASIS_SOURCE_DIR", "/home/rohitb/projects/ml/Project/assets"
)
# Default destination is "<source>/OASIS-1/processed".
destination_directory = os.environ.get(
    "OASIS_DEST_DIR", os.path.join(source_directory, "OASIS-1", "processed")
)

# Extract every tarball in source_directory and collect its .gif files.
# This deletes each extracted "discX" folder from source_directory when done.
process_all_tarballs(source_directory, destination_directory)

Processing tarballs:   0%|          | 0/12 [00:00<?, ?it/s]

Extracting files: 100%|██████████| 1596/1596 [00:51<00:00, 31.05it/s]
Extracting files: 100%|██████████| 1576/1576 [01:29<00:00, 17.61it/s]
Extracting files: 100%|██████████| 1673/1673 [01:45<00:00, 15.80it/s]
Extracting files: 100%|██████████| 1641/1641 [01:56<00:00, 14.11it/s]
Extracting files: 100%|██████████| 1555/1555 [01:47<00:00, 14.42it/s]
Extracting files: 100%|██████████| 1608/1608 [01:45<00:00, 15.30it/s]
Extracting files: 100%|██████████| 1724/1724 [01:27<00:00, 19.80it/s]
Extracting files: 100%|██████████| 1596/1596 [01:17<00:00, 20.59it/s]
Extracting files: 100%|██████████| 1734/1734 [01:52<00:00, 15.48it/s]
Extracting files: 100%|██████████| 1739/1739 [01:11<00:00, 24.23it/s]
Extracting files: 100%|██████████| 1596/1596 [01:19<00:00, 19.97it/s]
Extracting files: 100%|██████████| 1755/1755 [01:51<00:00, 15.72it/s]
Processing tarballs: 100%|██████████| 12/12 [23:24<00:00, 117.05s/it]
