In [None]:
import boto3

def fetch_json_from_s3(bucket_name, key):
    """
    Download one object from S3 and return its body as UTF-8 text.

    Args:
        bucket_name (str): Name of the S3 bucket
        key (str): Full path/key of the JSON file in the bucket

    Returns:
        str | None: The decoded object contents, or None when the download
        or decoding raised (the error is printed, not re-raised).
    """
    # The client is built outside the try block, so a failure to create it
    # propagates to the caller instead of being reported as a fetch error.
    client = boto3.client('s3')

    try:
        s3_object = client.get_object(Bucket=bucket_name, Key=key)
        raw_bytes = s3_object['Body'].read()
        return raw_bytes.decode('utf-8')
    except Exception as err:
        # Best effort: report the problem and let the caller check for None
        print(f"❌ Error fetching {key} from {bucket_name}: {err}")
        return None

# Example usage
# json_data = fetch_json_from_s3('your-bucket-name', 'path/to/your/file.json')

In [None]:
import json
from pygments import highlight
from pygments.lexers import JsonLexer
from pygments.formatters import TerminalFormatter

def format_json(file_contents, max_lines=50, indent=2):
    """
    Parse a JSON document and print a syntax-highlighted, truncated preview.

    Args:
        file_contents (str): JSON file contents as a string. None (for
            example the result of a failed fetch) is reported as a parse
            error instead of raising.
        max_lines (int, optional): Maximum number of lines to display.
            Defaults to 50. Pass None to print every line.
        indent (int, optional): Number of spaces for JSON indentation. Defaults to 2.

    Returns:
        The parsed JSON data (a dict for a JSON object, otherwise whatever
        json.loads produces), or None if the input could not be parsed.
    """
    try:
        json_data = json.loads(file_contents)
    except (ValueError, TypeError) as e:
        # ValueError covers json.JSONDecodeError (and undecodable bytes);
        # TypeError is what json.loads raises for None or other non-text input.
        print(f"❌ Error parsing JSON: {e}")
        return None

    # Format the JSON with syntax highlighting
    formatted_json = json.dumps(json_data, indent=indent)
    highlighted_json = highlight(formatted_json, JsonLexer(), TerminalFormatter())

    # Drop the trailing newline before splitting; otherwise the empty final
    # element inflates the line count and triggers a bogus truncation notice.
    lines = highlighted_json.rstrip('\n').split('\n')
    print('\n'.join(lines[:max_lines]))

    # If there are more lines, indicate truncation
    if max_lines is not None and len(lines) > max_lines:
        print(f"\n... and {len(lines) - max_lines} more lines (use max_lines parameter to see more)")

    return json_data

# Example usage
# json_data = format_json(file_contents)

In [None]:
# 📦 Carga de librerías
import os
import pandas as pd

# ⚙️ Función para cargar archivos .json.gz
def _iter_json_gz_paths(input_dir):
    """Yield every ``.json.gz`` path under ``input_dir`` in a deterministic order."""
    for root, dirs, files in os.walk(input_dir):
        # Sorting in place steers os.walk's top-down descent, so directory
        # order no longer depends on the filesystem.
        dirs.sort()
        for file_name in sorted(files):
            if file_name.endswith('.json.gz'):
                yield os.path.join(root, file_name)


def load_json_gz_to_dataframe(input_dir, max_files=1):
    """
    Load up to ``max_files`` ``.json.gz`` files found under ``input_dir``
    (searched recursively) into a single DataFrame.

    Each file is read as gzip-compressed, line-delimited JSON. Files that
    fail to load are reported and skipped; they do not count towards
    ``max_files``.

    Args:
        input_dir (str): Directory to search.
        max_files (int | None, optional): Maximum number of files to load.
            Defaults to 1. Zero or a negative value loads nothing; None
            loads every file found.

    Returns:
        pandas.DataFrame | None: The concatenated rows of all loaded files
        with a fresh index, or None when no file could be loaded.
    """
    dataframes = []

    for file_path in _iter_json_gz_paths(input_dir):
        # Check the limit before reading so that max_files=0 loads nothing
        if max_files is not None and len(dataframes) >= max_files:
            break
        try:
            frame = pd.read_json(file_path, lines=True, compression='gzip')
        except Exception as e:
            # Best effort: a corrupt file must not abort the whole load
            print(f"❌ Error leyendo {file_path}: {e}")
            continue
        dataframes.append(frame)
        print(f"✅ Cargado: {file_path} ({len(frame)} filas)")

    if not dataframes:
        print("⚠️ No se encontraron archivos válidos.")
        return None

    combined_df = pd.concat(dataframes, ignore_index=True)
    print(f"\n🔢 DataFrame combinado: {combined_df.shape[0]} filas, {combined_df.shape[1]} columnas")
    return combined_df
    
# 📁 Input directory (adjust if needed)
input_dir = "gharchive_data"  # folder where the .json.gz files were saved

# 📌 Load at most one file (the function's default limit)
df = load_json_gz_to_dataframe(input_dir=input_dir, max_files=1)


In [None]:
# 🔍 Quick look at the available columns
if df is not None:
    df.info()

# The notebook only renders a bare expression that is the cell's last
# top-level statement, so the preview must sit outside the `if` block.
# When nothing was loaded the expression is None, which renders nothing.
df.head() if df is not None else None