In [1]:
from hdfs import InsecureClient

# Connection settings for the WebHDFS endpoint, named so they are easy to
# find and change in one place.
HDFS_URL = "http://localhost:9870"
HDFS_USER = "root"

# Client used by the listing code in this cell.
hdfs_client = InsecureClient(HDFS_URL, user=HDFS_USER)

# Section header for the recursive directory listing printed below.
print("=== HDFS Directory Structure ===", end="\n\n")

def list_hdfs_tree(path, indent=0):
    """Recursively print the HDFS directory tree rooted at ``path``.

    Each entry is printed on its own line, indented by ``indent`` spaces;
    directories are tagged with ``[DIR]``. Entries are looked up through the
    module-level ``hdfs_client``.

    Args:
        path: HDFS directory to list. A trailing slash is tolerated.
        indent: Number of leading spaces for the entries of ``path``. It also
            acts as the depth counter: every level adds 4 and a directory is
            only expanded while ``indent < 6``, so entries up to three levels
            below the starting directory are printed.

    Returns:
        None. Output goes to stdout. A failure to list ``path`` is reported
        as an ``Error listing ...`` line instead of being raised.
    """
    try:
        contents = hdfs_client.list(path)
    except Exception as e:
        print(f"Error listing {path}: {e}")
        return

    # Drop any trailing slash so children are always "<parent>/<name>";
    # for the root "/" this leaves an empty prefix and yields "/<name>".
    parent = path.rstrip("/")
    pad = " " * indent

    for item in contents:
        full_path = f"{parent}/{item}"

        # Only the status lookup is guarded, and only against Exception, so
        # Ctrl-C (KeyboardInterrupt) can still stop a long walk.
        try:
            is_dir = hdfs_client.status(full_path)['type'] == 'DIRECTORY'
        except Exception as e:
            # Best effort: still show the entry, but say why it has no type.
            print(pad + "├── " + item + f" [status unavailable: {e}]")
            continue

        prefix = "├── [DIR] " if is_dir else "├── "
        print(pad + prefix + item)

        # Recursively list subdirectories (limit depth)
        if is_dir and indent < 6:
            list_hdfs_tree(full_path, indent + 4)

# Start from root
list_hdfs_tree("/")

# Get detailed info about /bigdata: one line per direct child with its type
# and size in bytes.
print("\n=== /bigdata Contents ===")
try:
    contents = hdfs_client.list("/bigdata")
    for item in contents:
        full_path = f"/bigdata/{item}"
        status = hdfs_client.status(full_path)
        file_type = "DIR" if status['type'] == 'DIRECTORY' else "FILE"
        # WebHDFS FileStatus reports the byte count under "length"; it has no
        # "size" key, so looking up 'size' always fell back to 0.
        size = status.get('length', 0)
        print(f"{item:30} [{file_type:5}] Size: {size:,} bytes")
except Exception as e:
    # Keep the notebook running if /bigdata is missing or HDFS is unreachable.
    print(f"Error: {e}")

=== HDFS Directory Structure ===

├── [DIR] bigdata
    ├── [DIR] Datapack
        ├── [DIR] Delivery
        ├── [DIR] Inventory
        ├── [DIR] PickUp
        ├── [DIR] Roadmap
    ├── [DIR] output
        ├── combined_all_data.csv

=== /bigdata Contents ===
Datapack                       [DIR  ] Size: 0 bytes
output                         [DIR  ] Size: 0 bytes


In [2]:

# Display the uploaded HDFS structure with accurate file sizes
import subprocess
import json

# Section header, followed by one blank line, for the sized tree printed below.
print("=== HDFS Directory Structure ===", end="\n\n")

def parse_hdfs_ls_output(path):
    """Run ``hdfs dfs -ls -R`` on ``path`` and index every entry by full path.

    Args:
        path: HDFS path to list recursively.

    Returns:
        dict mapping each listed path to ``{'size': int, 'is_dir': bool}``.
        An empty dict is returned, after printing an error, when the command
        cannot be run or exits with a non-zero status.
    """
    try:
        result = subprocess.run(
            ['hdfs', 'dfs', '-ls', '-R', path],
            capture_output=True,
            text=True
        )
    except Exception as e:
        # e.g. the `hdfs` executable is not on PATH.
        print(f"Error parsing HDFS output: {e}")
        return {}

    if result.returncode != 0:
        print(f"Error: {result.stderr}")
        return {}

    file_info = {}
    for line in result.stdout.splitlines():
        # Entry lines have 8 columns: permissions, replication, owner, group,
        # size, date, time, path. maxsplit=7 keeps a path that contains
        # spaces intact in the last field.
        parts = line.split(None, 7)
        if len(parts) < 8:
            # Banner such as "Found N items", blank line, or other noise.
            # Entries are recognised by shape rather than by searching for
            # the word "total", which also matched real file names.
            continue

        try:
            size = int(parts[4])
        except ValueError:
            # Not an entry line; skip it instead of discarding the whole parse.
            continue

        full_path = parts[7]
        file_info[full_path] = {
            'size': size,
            'is_dir': parts[0].startswith('d')
        }

    return file_info

def _format_size(size):
    """Render a byte count as bytes, KB, MB or GB using 1024-based units."""
    if size >= 1024**3:
        return f"{size / (1024**3):.2f} GB"
    if size >= 1024**2:
        return f"{size / (1024**2):.2f} MB"
    if size >= 1024:
        return f"{size / 1024:.2f} KB"
    return f"{size} bytes"


def print_tree_with_sizes(path, indent=0, file_info=None):
    """Print the HDFS tree under ``path`` with human-readable file sizes.

    Runs ``hdfs dfs -ls`` on ``path`` and prints one line per entry.
    Directories are tagged ``[DIR]`` and expanded recursively; files are
    prefixed with their size.

    Args:
        path: HDFS directory to list.
        indent: Nesting level of ``path``'s entries; each level is rendered
            as four spaces. Directories are only expanded while
            ``indent < 3``, so entries up to four levels below the starting
            directory are printed.
        file_info: Not read by this function; it is only forwarded to the
            recursive calls. Kept so existing callers keep working.

    Returns:
        None. Output goes to stdout. Failures to run or complete the listing
        are printed as ``Error...`` lines instead of being raised.
    """
    if file_info is None:
        file_info = {}

    try:
        result = subprocess.run(
            ['hdfs', 'dfs', '-ls', path],
            capture_output=True,
            text=True
        )
    except Exception as e:
        # e.g. the `hdfs` executable is not on PATH.
        print(f"Error: {e}")
        return

    if result.returncode != 0:
        # Report the failure instead of silently printing nothing for `path`.
        print(f"Error listing {path}: {result.stderr.strip()}")
        return

    pad = "    " * indent

    # Entries are visited in lexicographic order of the raw `ls` line (not of
    # the file name), which is the order this function has always produced.
    for line in sorted(l for l in result.stdout.splitlines() if l.strip()):
        # Entry lines have 8 columns: permissions, replication, owner, group,
        # size, date, time, path. maxsplit=7 keeps a path that contains
        # spaces intact in the last field.
        parts = line.split(None, 7)
        if len(parts) < 8:
            # Banner such as "Found N items" or other noise. Entries are
            # recognised by shape rather than by searching for the word
            # "total", which also matched real file names.
            continue

        try:
            size = int(parts[4])
        except ValueError:
            # Not an entry line; skip it rather than abort the whole listing.
            continue

        full_path = parts[7]
        filename = full_path.rsplit('/', 1)[-1]

        if line.startswith('d'):
            print(pad + "├── [DIR] " + filename)
            # Recursively list subdirectories (limit depth)
            if indent < 3:
                print_tree_with_sizes(full_path, indent + 1, file_info)
        else:
            print(pad + f"├── ({_format_size(size)})" + " " + filename)

# Print the sized tree for /bigdata only.
print_tree_with_sizes(path="/bigdata")


=== HDFS Directory Structure ===

├── [DIR] Datapack
    ├── [DIR] Delivery
        ├── (4.52 MB) delivery_jl.csv
        ├── (30.07 MB) delivery_yt.csv
        ├── (138.35 MB) delivery_cq.csv
        ├── (217.50 MB) delivery_sh.csv
        ├── (273.60 MB) delivery_hz.csv
    ├── [DIR] PickUp
        ├── (41.77 MB) pickup_jl.csv
        ├── (181.38 MB) pickup_yt.csv
        ├── (181.70 MB) pickup_cq.csv
        ├── (217.76 MB) pickup_sh.csv
        ├── (320.26 MB) pickup_hz.csv
    ├── [DIR] Roadmap
        ├── (221.01 MB) roads.csv
    ├── [DIR] Inventory
        ├── (4.26 KB) product_info.csv
        ├── (84.77 MB) product_target_for_shop.csv
        ├── (111.55 MB) shop_info_with_geo.csv
├── [DIR] output
    ├── (1.35 MB) combined_all_data.csv
