In [None]:
import pandas as pd
from hdfs import InsecureClient
import os

# --- Configuration based on your docker-compose.yml ---
# The defaults use the service name 'namenode' for inter-container
# communication; the WebHDFS REST API runs on port 9870 in this setup.
# That hostname only resolves from inside the compose network, so both values
# can be overridden through environment variables (for example
# WEBHDFS_URL=http://localhost:9870 when running on the host with the port
# published) without editing this file.
HDFS_HOST = os.environ.get('WEBHDFS_URL', 'http://namenode:9870')
HDFS_USER = os.environ.get('WEBHDFS_USER', 'root')  # Common default for bde2020/hadoop images

# Local file names and their corresponding HDFS target paths.
# The HDFS paths are relative (no leading '/').
files_to_upload = {
    'weather_cleaned.parquet': 'clean_data/weather_cleaned.parquet',
    'traffic_cleaned.parquet': 'clean_data/traffic_cleaned.parquet',
}

# --- HDFS Operations ---
# Build the WebHDFS client and make sure the HDFS target directory exists.
#
# Constructing InsecureClient does not contact the namenode: in the recorded
# run the constructor succeeded and the name-resolution error only surfaced on
# the first filesystem call. Success is therefore reported only after
# makedirs() has returned, so an unreachable namenode is never announced as
# "Connected".
target_dir = 'clean_data'
try:
    client = InsecureClient(HDFS_HOST, user=HDFS_USER)

    # Create the HDFS target directory if it doesn't exist.
    # client.makedirs will also create parent directories if necessary.
    # target_dir is a relative path; the recorded error shows the client
    # issuing a GETHOMEDIRECTORY request to resolve it.
    client.makedirs(target_dir, permission=755)
except Exception as e:
    print(f"Error connecting to HDFS or creating directory: {e}")
    # Raise SystemExit rather than calling exit(): exit() is a helper injected
    # by the site module, and in the recorded notebook run it did not stop the
    # cell (the upload and verification steps still executed). SystemExit
    # aborts both a plain script (exit status 1) and a notebook cell.
    raise SystemExit(1) from e
else:
    print(f"Connected to HDFS WebHDFS API at {HDFS_HOST} as user {HDFS_USER}")
    print(f"Ensured HDFS directory {target_dir} exists.")

# Upload the files
# Walk the local-name -> HDFS-path mapping and push every file that is
# present on the local filesystem; missing files are reported and skipped.
for local_file in files_to_upload:
    hdfs_path = files_to_upload[local_file]

    if os.path.exists(local_file):
        try:
            # overwrite=True replaces an existing HDFS file so the cell can be
            # re-run during testing; the transfer uses a single thread.
            print(f"Attempting to upload {local_file} to HDFS path: {hdfs_path}")
            client.upload(hdfs_path, local_file, overwrite=True, n_threads=1)
            print(f"Successfully uploaded: {local_file}")
        except Exception as upload_err:
            # Best effort: report the failure and carry on with the next file.
            print(f"Error uploading {local_file}: {upload_err}")
    else:
        # IMPORTANT: the files must exist in the same directory as the script
        # or be mounted into the container.
        print(f"Local file not found: **{local_file}**. Skipping.")

# Optional: Verify files in HDFS
# Print whatever the client reports for the target directory; a failure here
# is reported but does not abort the script.
print()
print("Verifying files in HDFS:")
try:
    hdfs_entries = client.list('clean_data')
    print(hdfs_entries)
except Exception as list_err:
    print(f"Error listing HDFS directory: {list_err}")

Connected to HDFS WebHDFS API at http://namenode:9870 as user root
Error connecting to HDFS or creating directory: HTTPConnectionPool(host='namenode', port=9870): Max retries exceeded with url: /webhdfs/v1/?user.name=root&op=GETHOMEDIRECTORY (Caused by NameResolutionError("HTTPConnection(host='namenode', port=9870): Failed to resolve 'namenode' ([Errno -3] Temporary failure in name resolution)"))
Local file not found: **weather_cleaned.parquet**. Skipping.
Local file not found: **traffic_cleaned.parquet**. Skipping.

Verifying files in HDFS:
Error listing HDFS directory: HTTPConnectionPool(host='namenode', port=9870): Max retries exceeded with url: /webhdfs/v1/?user.name=root&op=GETHOMEDIRECTORY (Caused by NameResolutionError("HTTPConnection(host='namenode', port=9870): Failed to resolve 'namenode' ([Errno -3] Temporary failure in name resolution)"))
