In [1]:
! pip install azure-storage-blob requests pandas

Collecting azure-storage-blob
  Downloading azure_storage_blob-12.25.0-py3-none-any.whl.metadata (26 kB)
Collecting azure-core>=1.30.0 (from azure-storage-blob)
  Downloading azure_core-1.32.0-py3-none-any.whl.metadata (39 kB)
Collecting cryptography>=2.1.4 (from azure-storage-blob)
  Downloading cryptography-44.0.2-cp39-abi3-macosx_10_9_universal2.whl.metadata (5.7 kB)
Collecting isodate>=0.6.1 (from azure-storage-blob)
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Downloading azure_storage_blob-12.25.0-py3-none-any.whl (406 kB)
Downloading azure_core-1.32.0-py3-none-any.whl (198 kB)
Downloading cryptography-44.0.2-cp39-abi3-macosx_10_9_universal2.whl (6.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading isodate-0.7.2-py3-none-any.whl (22 kB)
Installing collected packages: isodate, cryptography, azure-core, azure-storage-blob
Successfully installed azure-core-1.32.0 azure

In [177]:
import os
import json
import hashlib
from datetime import datetime, timedelta
from tqdm import tqdm
import requests as req
from azure.storage.blob import BlobServiceClient
from hdfs import InsecureClient as HdfsClient
from io import BytesIO
from PIL import Image
import requests as req

In [191]:
connection_string = os.getenv('AZURE_CONNECTION_STRING')
blob_service_client = BlobServiceClient.from_connection_string(connection_string)

# Transportation

Find out more in: https://openrouteservice.org/dev/#/api-docs/optimization/post
Alternatives: https://github.com/graphhopper/graphhopper/blob/master/README.md#Map-Matching
https://github.com/VROOM-Project/vroom/blob/master/docs/API.md

In [205]:
from hdfs import InsecureClient
import os

def test_hdfs_connection():
    try:
        hdfs_url = os.environ['HDFS_URL']
        client = InsecureClient(hdfs_url, user='hdfs')  # use appropriate user

        # List root directory
        print("Listing root directory:")
        print(client.list('/'))

        # Write a test file
        test_path = '/tmp/test_hdfs_connection.txt'
        test_content = 'HDFS connection successful!'
        client.write(test_path, data=test_content, overwrite=True)
        print(f"File written to {test_path}")

        # Read back the file
        with client.read(test_path, encoding='utf-8') as reader:
            content = reader.read()
            print("File content:")
            print(content)

        return True
    except Exception as e:
        print(f"❌ HDFS connection failed: {e}")
        return False

# Run test
if __name__ == '__main__':
    test_hdfs_connection()


Listing root directory:
[]
File written to /tmp/test_hdfs_connection.txt
File content:
HDFS connection successful!


In [None]:
destination_ids = {
    "Barcelona": "-372490",
    "Rome": "-126693",
    "Madrid": "-390625",
    "Paris": "-1456928"
}

destination_coords = {
    'Barcelona': {'latitude': 41.3874, 'longitude': 2.1686},
    'Paris': {'latitude': 48.8575, 'longitude': 2.3514},
    'Madrid': {'latitude': 40.4167, 'longitude': 3.7033},
    'Rome': {'latitude': 41.8967, 'longitude': 12.4822}
}

accommodation_endpoint = "https://booking-com15.p.rapidapi.com/api/v1/hotels/searchHotels"
weather_endpoint = 'https://archive-api.open-meteo.com/v1/archive'

headers = {
    "x-rapidapi-key": os.environ["RAPID_API_KEY"],
    "x-rapidapi-host": os.environ["RAPID_API_HOST"]
}

accommodation_query = {
    "dest_id": '',
    "search_type": "CITY",
    "arrival_date": '',
    "departure_date": '',
    "adults": "2",
    "children_age": "0",
    "room_qty": "1",
    "page_number": "1",
    "units": "metric",
    "temperature_unit": "c",
    "languagecode": "en-us",
    "currency_code": "EUR"
}

weather_metrics = 'temperature_2m,rain,snowfall,precipitation,cloud_cover,wind_speed_10m,sunshine_duration'

weather_query = {
    'latitude': '',
    'longitude': '',
    'hourly': weather_metrics,
    'start_date': '',
    'end_date': ''
}

In [213]:
def string_to_sha256(text: str) -> str:
    return hashlib.sha256(text.encode('utf-8')).hexdigest()

def upload_blob(container_client, blob_name: str, data: bytes) -> None:
    """
    Sube datos binarios a Azure Blob bajo blob_name.
    """
    container_client.upload_blob(name=blob_name, data=data, overwrite=True)

# HDFS helper
def save_into_hdfs(hdfs_client: HdfsClient, data: dict, hdfs_path: str, file_type='JSON') -> None:
    """
    Serializa dict a JSON y escribe directamente en HDFS en la ruta dada.
    """
    parent = os.path.dirname(hdfs_path)

    if parent and not hdfs_client.status(parent, strict=False):
        hdfs_client.makedirs(parent)
    
    if file_type == 'JSON':
        with hdfs_client.write(hdfs_path, encoding='utf-8', overwrite=True) as writer:
            writer.write(json.dumps(data))
        
    if file_type == 'PNG':
            with hdfs_client.write(hdfs_path, overwrite=True) as bin_writer:
                bin_writer.write(data)


# Trusted-processing utilities
def flatten_dict(d: dict, parent_key: str = '', sep: str = '_') -> dict:
    items = {}
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.update(flatten_dict(v, new_key, sep=sep))
        else:
            items[new_key] = v
    return items

def cast_value(value, expected: str):
    try:
        if expected == 'int': return int(value)
        if expected == 'float': return float(value)
        if expected == 'bool': return value in ['true','1','True'] if isinstance(value, str) else bool(value)
        return str(value)
    except:
        return None

def enforce_schema(data: dict, schema: dict) -> dict:
    def enforce(d, s):
        out = {}
        for k, exp in s.items():
            val = d.get(k)
            if val is None:
                out[k] = None
            else:
                if isinstance(exp, dict):
                    out[k] = enforce(val if isinstance(val, dict) else {}, exp)
                elif isinstance(exp, list):
                    if exp and isinstance(exp[0], dict):
                        out[k] = [enforce(item, exp[0]) for item in val if isinstance(item, dict)]
                    else:
                        out[k] = [cast_value(item, exp[0]) for item in val] if isinstance(val, list) else []
                else:
                    out[k] = cast_value(val, exp)
        return out
    return enforce(data, schema)

def standardized_hours(timestamp: str) -> str:
    dt = datetime.fromisoformat(timestamp)
    return dt.strftime('%H:%M')

# Schema loaders
def load_json_schema(path: str) -> dict:
    with open(path) as f:
        return json.load(f)

def compress_image(image_bytes: bytes, max_width: int = 1024, quality: int = 75) -> bytes:
    """
    Compress an image by resizing to a max width while maintaining aspect ratio
    and re-encoding as JPEG with the given quality.
    """
    # Load into PIL
    with Image.open(BytesIO(image_bytes)) as img:
        # Convert to RGB if needed
        if img.mode in ("RGBA", "P"):
            img = img.convert("RGB")

        # Resize if wider than max_width
        if img.width > max_width:
            ratio = max_width / float(img.width)
            new_height = int(img.height * ratio)
            img = img.resize((max_width, new_height), Image.LANCZOS)

        # Re-encode
        buffer = BytesIO()
        img.save(buffer, format="JPEG", quality=quality, optimize=True)
        return buffer.getvalue()


def process_accommodation_images(photo_urls: list, blob_container_client, city: str, hdfs_client: HdfsClient) -> None:
    """
    Descarga, comprime, y sube imágenes de alojamiento a Azure Blob.
    """
    for url in photo_urls:
        sha = string_to_sha256(url)
        blob_name = f"accommodation_images/{city}/{sha}.jpg"
        # request image
        res = req.get(url, stream=True)
        if res.status_code == 200:
            # compress before upload
            image = res.content
            upload_blob(blob_container_client, blob_name, image)
            compressed = compress_image(image, max_width=800, quality=70)
            hdfs_path = f'/trusted_{blob_name}'
            if not hdfs_client.status(hdfs_path, strict=False):
                save_into_hdfs(hdfs_client, compressed, hdfs_path, file_type='PNG')

# Core: process one accommodation JSON into HDFS

def process_accommodation_record(record: dict, schema: dict) -> dict:
    flat = flatten_dict(record)
    flat['property_photoHash'] = [string_to_sha256(u) for u in flat.get('property_photoUrls', [])]
    flat.pop('property_photoUrls', None)
    return enforce_schema(flat, schema)

# Core: process one weather JSON into HDFS

def process_weather_record(raw: dict, schema: dict) -> dict:
    hourly = raw.get('hourly', {})
    hourly['time'] = [standardized_hours(t) for t in hourly.get('time', [])]
    return enforce_schema(hourly, schema)


# Missing detection: list landing blobs and corresponding HDFS paths
def find_missing_blobs(blob_client, landing_prefix: str, hdfs_client: HdfsClient, hdfs_prefix: str):
    missing = []
    cities = ['Barcelona', 'Paris', 'Rome', 'Madrid']
    for blob in blob_client.list_blobs(name_starts_with=landing_prefix):
        for city in cities:
            if blob.name.endswith('.json') or blob.name.endswith('.jpg'):
                fname = os.path.basename(blob.name)
                hdfs_path = f"/{hdfs_prefix}{city}/{fname}" 
                if not hdfs_client.status(hdfs_path, strict=False):
                    print(f"File not found: {hdfs_path}")
                    missing.append(blob)
    return missing

# Landing + immediate sync

def get_and_sync_accommodation(
    blob_service_client: BlobServiceClient,
    hdfs_client: HdfsClient,
    start: datetime,
    end: datetime,
    cities: dict,
    query_template: dict,
    headers: dict,
    schema_file: str,
    landing_container: str = 'bdmcontainerp1',
    images_container: str = 'bdmcontainerp1',
    hdfs_base: str = '/trusted_accommodation'
):
    schema = load_json_schema(schema_file)
    landing_client = blob_service_client.get_container_client(landing_container)
    images_client = blob_service_client.get_container_client(images_container)
    delta = timedelta(days=1)

    for single_date in tqdm([start + i * delta for i in range((end - start).days + 1)]):
        arrival = single_date.strftime('%Y-%m-%d')
        departure = (single_date + delta).strftime('%Y-%m-%d')
        for city, dest_id in cities.items():
            params = dict(query_template, dest_id=dest_id, arrival_date=arrival, departure_date=departure)
            res = req.get(accommodation_endpoint, headers=headers, params=params)
            res.raise_for_status()
            data = res.json()
            c = 0
            
            while 'data' not in data.keys() and c < 3:
                res = req.get(accommodation_endpoint, headers=headers, params=params)
                res.raise_for_status()
                data = res.json()
                c += 1

            # upload landing JSON
            landing_blob = f"accommodation/{city}/{arrival}_{departure}.json"
            upload_blob(landing_client, landing_blob, json.dumps(data).encode('utf-8'))

            # upload images
            photo_urls = [u for h in data['data']['hotels'] for u in h['property']['photoUrls']]
            process_accommodation_images(photo_urls, images_client, city)

            # sync to trusted HDFS
            docs = [process_accommodation_record(r, schema) for r in data['data']['hotels']]
            hdfs_path = f"{hdfs_base}/{city}/{arrival}_{departure}.json"
            if not hdfs_client.status(hdfs_path, strict=False):
                save_into_hdfs(hdfs_client, docs, hdfs_path)


# Similarly for weather

def get_and_sync_weather(
    blob_service_client: BlobServiceClient,
    hdfs_client: HdfsClient,
    start: datetime,
    end: datetime,
    coords: dict,
    query_template: dict,
    schema_file: str,
    landing_container: str = 'bdmcontainerp1',
    hdfs_base: str = '/trusted_weather'
):
    schema = load_json_schema(schema_file)
    container = blob_service_client.get_container_client(landing_container)
    delta = timedelta(days=1)

    for single_date in tqdm([start + i * delta for i in range((end - start).days + 1)]):
        start_prev = (single_date - timedelta(days=365)).strftime('%Y-%m-%d')
        end_prev = (single_date - timedelta(days=365) + delta).strftime('%Y-%m-%d')
        for city, coord in coords.items():
            # fetch landing data
            params = dict(query_template, latitude=coord['latitude'], longitude=coord['longitude'], start_date=start_prev, end_date=end_prev)
            res = req.get(weather_endpoint, params=params)
            res.raise_for_status()
            data = res.json()
            # upload landing
            landing_blob = f"weather/{city}/{start_prev}.json"
            container.upload_blob(name=landing_blob, data=json.dumps(data), overwrite=True)
            # sync new to trusted
            doc = process_weather_record(data, schema)
            hdfs_path = f"{hdfs_base}/{city}/{start_prev}.json"
            if not hdfs_client.status(hdfs_path, strict=False):
                save_into_hdfs(hdfs_client, doc, hdfs_path)

# Function to backfill missing files
def backfill_missing(
    blob_service_client: BlobServiceClient,
    hdfs_client: HdfsClient,
    landing_prefix: str,
    hdfs_prefix: str,
    processor: callable,
    schema_file: str = ''
):
    if schema_file:
        schema = load_json_schema(schema_file)
    container = blob_service_client.get_container_client('bdmcontainerp1')
    missing = find_missing_blobs(container, landing_prefix, hdfs_client, hdfs_prefix)
    
    if missing:
        for blob in tqdm(missing):
            data_raw = container.get_blob_client(blob).download_blob().readall()
            if landing_prefix.startswith('accommodation/'):
                data = json.loads(data_raw)
                docs = [process_accommodation_record(r, schema) for r in data['data']['hotels']]
                save_into_hdfs(hdfs_client, docs, f"trusted_{blob.name}")
            elif landing_prefix.startswith('weather/'):
                data = json.loads(data_raw)
                doc = process_weather_record(data, schema)
                save_into_hdfs(hdfs_client, doc, f"trusted_{blob.name}")
            elif landing_prefix.startswith('accommodation_images/'):
                image = compress_image(data_raw)
                save_into_hdfs(hdfs_client, image, f"trusted_{blob.name}", file_type='PNG')


In [None]:
blob_service = BlobServiceClient.from_connection_string(os.getenv('AZURE_CONNECTION_STRING'))
hdfs_service = HdfsClient(os.environ['HDFS_URL'])

# parameters
start = datetime.strptime('2025-05-04', '%Y-%m-%d')
end = datetime.strptime('2025-05-05', '%Y-%m-%d')
cities = list(destination_ids.keys())

# sync on the fly
get_and_sync_accommodation(blob_service, hdfs_service, start, end, destination_ids, accommodation_query, headers, 'accomodation_schema.json')
get_and_sync_weather(blob_service, hdfs_service, start, end, destination_coords, weather_query, 'weather_schema.json')

# backfill any missing
backfill_missing(blob_service, hdfs_service, 'accommodation/', 'trusted_accommodation/', process_accommodation_record, 'accomodation_schema.json')
backfill_missing(blob_service, hdfs_service, 'accommodation_images/', 'trusted_accommodation_images/', process_accommodation_record)
backfill_missing(blob_service, hdfs_service, 'weather/', 'trusted_weather/', process_weather_record, 'weather_schema.json')

100%|██████████| 2/2 [02:54<00:00, 87.41s/it]
100%|██████████| 2/2 [00:02<00:00,  1.11s/it]


In [216]:
backfill_missing(blob_service, hdfs_service, 'accommodation/', 'trusted_accommodation/', process_accommodation_record, 'accomodation_schema.json')
backfill_missing(blob_service, hdfs_service, 'accommodation_images/', 'trusted_accommodation_images/', process_accommodation_record)
backfill_missing(blob_service, hdfs_service, 'weather/', 'trusted_weather/', process_weather_record, 'weather_schema.json')

File not found: /trusted_accommodation/Barcelona/2025-03-24_2025-03-25.json
File not found: /trusted_accommodation/Paris/2025-03-24_2025-03-25.json
File not found: /trusted_accommodation/Rome/2025-03-24_2025-03-25.json
File not found: /trusted_accommodation/Madrid/2025-03-24_2025-03-25.json
File not found: /trusted_accommodation/Barcelona/2025-03-25_2025-03-26.json
File not found: /trusted_accommodation/Paris/2025-03-25_2025-03-26.json
File not found: /trusted_accommodation/Rome/2025-03-25_2025-03-26.json
File not found: /trusted_accommodation/Madrid/2025-03-25_2025-03-26.json
File not found: /trusted_accommodation/Barcelona/2025-03-26_2025-03-27.json
File not found: /trusted_accommodation/Paris/2025-03-26_2025-03-27.json
File not found: /trusted_accommodation/Rome/2025-03-26_2025-03-27.json
File not found: /trusted_accommodation/Madrid/2025-03-26_2025-03-27.json
File not found: /trusted_accommodation/Barcelona/2025-03-27_2025-03-28.json
File not found: /trusted_accommodation/Paris/202

100%|██████████| 1536/1536 [04:28<00:00,  5.71it/s]


File not found: /trusted_accommodation_images/Barcelona/009c8c0ebf91545ff6880a6127ea7c39e3c00c238bcebab11117882e720194b6.jpg
File not found: /trusted_accommodation_images/Paris/009c8c0ebf91545ff6880a6127ea7c39e3c00c238bcebab11117882e720194b6.jpg
File not found: /trusted_accommodation_images/Rome/009c8c0ebf91545ff6880a6127ea7c39e3c00c238bcebab11117882e720194b6.jpg
File not found: /trusted_accommodation_images/Madrid/009c8c0ebf91545ff6880a6127ea7c39e3c00c238bcebab11117882e720194b6.jpg
File not found: /trusted_accommodation_images/Barcelona/010bce142909466fefd067409690a6c3c6c3b0510ea750e58649db55e0abbaf5.jpg
File not found: /trusted_accommodation_images/Paris/010bce142909466fefd067409690a6c3c6c3b0510ea750e58649db55e0abbaf5.jpg
File not found: /trusted_accommodation_images/Rome/010bce142909466fefd067409690a6c3c6c3b0510ea750e58649db55e0abbaf5.jpg
File not found: /trusted_accommodation_images/Madrid/010bce142909466fefd067409690a6c3c6c3b0510ea750e58649db55e0abbaf5.jpg
File not found: /trusted

100%|██████████| 7500/7500 [28:18<00:00,  4.42it/s]  


File not found: /trusted_weather/Barcelona/2024-03-24.json
File not found: /trusted_weather/Paris/2024-03-24.json
File not found: /trusted_weather/Rome/2024-03-24.json
File not found: /trusted_weather/Madrid/2024-03-24.json
File not found: /trusted_weather/Barcelona/2024-03-25.json
File not found: /trusted_weather/Paris/2024-03-25.json
File not found: /trusted_weather/Rome/2024-03-25.json
File not found: /trusted_weather/Madrid/2024-03-25.json
File not found: /trusted_weather/Barcelona/2024-03-26.json
File not found: /trusted_weather/Paris/2024-03-26.json
File not found: /trusted_weather/Rome/2024-03-26.json
File not found: /trusted_weather/Madrid/2024-03-26.json
File not found: /trusted_weather/Barcelona/2024-03-27.json
File not found: /trusted_weather/Paris/2024-03-27.json
File not found: /trusted_weather/Rome/2024-03-27.json
File not found: /trusted_weather/Madrid/2024-03-27.json
File not found: /trusted_weather/Barcelona/2024-03-28.json
File not found: /trusted_weather/Paris/2024-0

100%|██████████| 1552/1552 [04:24<00:00,  5.87it/s]
