In [None]:
import os
import requests
import csv
import time
import json
from google.cloud import storage

# Authenticate against Google Cloud with a service-account JSON key file.
# A GOOGLE_APPLICATION_CREDENTIALS value already exported in the environment
# takes precedence; otherwise fall back to the key file this script expects
# in the working directory.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "proyectofinalgogleyelp-41e96ec7a40a.json",
)

# API credentials are read from the environment so secrets never live in the
# source file. Export GOOGLE_API_KEY and YELP_API_KEY before running.
# SECURITY: the keys that used to be hard-coded here were committed in plain
# text and should be treated as leaked: revoke and rotate them.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
YELP_API_KEY = os.environ.get("YELP_API_KEY", "")

for _env_name, _env_value in (
    ("GOOGLE_API_KEY", GOOGLE_API_KEY),
    ("YELP_API_KEY", YELP_API_KEY),
):
    if not _env_value:
        # Warn early instead of failing later with an opaque API error.
        print(f"Advertencia: la variable de entorno {_env_name} no está definida.")

# Destination bucket and object prefix ("folder") in Google Cloud Storage
BUCKET_NAME = "dataset-pf-gyelp"
FOLDER_PATH = "Yelp/airFlow/raw/"

# Shared Google Cloud Storage client used by the bucket helper functions
storage_client = storage.Client()

def clean_bucket(bucket_name, folder_path):
    """Delete every object under a prefix so fresh files replace the old ones.

    Args:
        bucket_name: Name of the Cloud Storage bucket to clean.
        folder_path: Object-name prefix; every blob listed under it is deleted.
    """
    target_bucket = storage_client.bucket(bucket_name)

    # Materialize the full listing before deleting anything, so the listing
    # is complete before the first delete request is issued.
    stale_blobs = [*target_bucket.list_blobs(prefix=folder_path)]
    for stale in stale_blobs:
        stale.delete()

    print(f"Carpeta {folder_path} en el bucket {bucket_name} limpiada.")

def fetch_google_restaurants(location="California", pages=3):
    """Fetch restaurant listings from the Google Places Text Search API.

    Args:
        location: Free-text place name appended to the search query.
        pages: Maximum number of result pages to request.

    Returns:
        A list of dicts with the keys Business_ID, Name, Address, Rating,
        Latitude and Longitude. If a page fails, the error is printed and
        the rows collected so far are returned.
    """
    base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
    restaurants = []
    next_page_token = None

    for _ in range(pages):
        params = {
            "query": "restaurants in " + location,
            "key": GOOGLE_API_KEY,
            "type": "restaurant",
            "language": "es",
        }
        # Only send pagetoken on follow-up pages; the first request must not
        # carry an empty "pagetoken=" parameter.
        if next_page_token:
            params["pagetoken"] = next_page_token

        # A timeout keeps a stalled connection from hanging the whole run.
        response = requests.get(base_url, params=params, timeout=30)
        if response.status_code != 200:
            print(f"Error al obtener restaurantes de Google: {response.status_code}")
            break
        data = response.json()

        # The API can answer HTTP 200 and still report a failure through the
        # "status" field of the body (e.g. a rejected key), so check it too.
        status = data.get("status")
        if status is not None and status not in ("OK", "ZERO_RESULTS"):
            print(
                f"Error de Google Places: {status} "
                f"{data.get('error_message', '')}"
            )
            break

        for result in data.get("results", []):
            coords = result.get("geometry", {}).get("location", {})
            restaurants.append({
                "Business_ID": result.get("place_id"),
                "Name": result.get("name"),
                "Address": result.get("formatted_address"),
                "Rating": result.get("rating"),
                "Latitude": coords.get("lat"),
                "Longitude": coords.get("lng"),
            })

        next_page_token = data.get("next_page_token")
        if not next_page_token:
            break
        # Pause before using the token for the next page request.
        time.sleep(2)

    return restaurants

def fetch_yelp_restaurants(location="California"):
    """Fetch restaurant listings from the Yelp business search endpoint.

    Args:
        location: Location string passed to the search as "location".

    Returns:
        A list of dicts with the keys Business_ID, Name, Address, Rating,
        Latitude and Longitude. On a non-200 response the status code is
        printed and an empty list is returned, matching the other Yelp
        helpers in this file.
    """
    url = "https://api.yelp.com/v3/businesses/search"
    headers = {"Authorization": f"Bearer {YELP_API_KEY}"}
    params = {"location": location, "term": "restaurants", "limit": 50}

    # A timeout keeps a stalled connection from hanging the whole run.
    response = requests.get(url, headers=headers, params=params, timeout=30)
    if response.status_code != 200:
        print(f"Error al obtener restaurantes de Yelp: {response.status_code}")
        return []
    data = response.json()

    restaurants = []
    for business in data.get("businesses", []):
        # "or {}" also covers keys that are present but null in the payload.
        place = business.get("location") or {}
        coords = business.get("coordinates") or {}
        restaurants.append({
            "Business_ID": business.get("id"),
            "Name": business.get("name"),
            "Address": ", ".join(place.get("display_address") or []),
            "Rating": business.get("rating"),
            "Latitude": coords.get("latitude"),
            "Longitude": coords.get("longitude"),
        })
    return restaurants

def fetch_yelp_reviews(business_id):
    """Fetch the reviews Yelp returns for a single business.

    Args:
        business_id: Yelp business identifier inserted into the request URL.

    Returns:
        A list of dicts with the keys Review_ID, Business_ID, User_ID,
        Rating, Text and Date. On a non-200 response the status code is
        printed and an empty list is returned.
    """
    endpoint = f"https://api.yelp.com/v3/businesses/{business_id}/reviews"
    auth = {"Authorization": f"Bearer {YELP_API_KEY}"}
    response = requests.get(endpoint, headers=auth)

    # Guard clause: report the failure and give up on this business.
    if response.status_code != 200:
        print(f"Error al obtener reseñas para {business_id}: {response.status_code}")
        return []

    rows = []
    for review in response.json().get("reviews", []):
        rows.append({
            "Review_ID": review.get("id"),
            "Business_ID": business_id,
            "User_ID": review.get("user", {}).get("id"),
            "Rating": review.get("rating"),
            "Text": review.get("text"),
            "Date": review.get("time_created"),
        })
    return rows

def fetch_yelp_users(business_id):
    """Collect the authors of the reviews Yelp returns for one business.

    Args:
        business_id: Yelp business identifier inserted into the request URL.

    Returns:
        A list of dicts with the keys User_ID, Name and Review_Count, one per
        review in the response. Review_Count is always None because it is
        not read from the reviews payload. On a non-200 response the status
        code is printed and an empty list is returned.
    """
    endpoint = f"https://api.yelp.com/v3/businesses/{business_id}/reviews"
    auth = {"Authorization": f"Bearer {YELP_API_KEY}"}
    response = requests.get(endpoint, headers=auth)

    # Guard clause: report the failure and give up on this business.
    if response.status_code != 200:
        print(f"Error al obtener usuarios para {business_id}: {response.status_code}")
        return []

    authors = (
        review.get("user", {})
        for review in response.json().get("reviews", [])
    )
    return [
        {
            "User_ID": author.get("id"),
            "Name": author.get("name"),
            "Review_Count": None,  # not available from the reviews payload
        }
        for author in authors
    ]

def save_to_csv(data, filename):
    """Write a list of row dicts to a UTF-8 CSV file with a header row.

    The columns are the union of the keys of all rows, in the order each key
    is first seen, so rows whose keys differ from the first row no longer
    make the writer fail. Keys missing from a row are written as empty cells.

    Args:
        data: List of dicts, one per CSV row. When empty, nothing is written
            and no file is created.
        filename: Path of the CSV file to create or overwrite.
    """
    if not data:
        return

    # dict.fromkeys de-duplicates while preserving first-seen order.
    fieldnames = list(dict.fromkeys(key for row in data for key in row))
    with open(filename, "w", newline="", encoding="utf-8") as output_file:
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)
    print(f"Archivo guardado: {filename}")

def upload_to_bucket(source_file, bucket_name, folder_path):
    """Upload a local file to Cloud Storage under the given prefix.

    The object name is the prefix followed by the file's base name, so any
    local directory components of ``source_file`` are dropped.

    Args:
        source_file: Path of the local file to upload.
        bucket_name: Name of the destination bucket.
        folder_path: Object-name prefix prepended to the file's base name.
    """
    target_bucket = storage_client.bucket(bucket_name)
    object_name = "{}{}".format(folder_path, os.path.basename(source_file))

    target_bucket.blob(object_name).upload_from_filename(source_file)
    print(f"Archivo {source_file} subido a gs://{bucket_name}/{object_name}")

# Script entry point
if __name__ == "__main__":
    try:
        print("Obteniendo datos...")
        google_data = fetch_google_restaurants()
        yelp_data = fetch_yelp_restaurants()

        # Reviews and users are fetched per business: both helpers require a
        # business_id, so they are called once for each Yelp restaurant.
        reviews_data = []
        users_by_id = {}
        for business in yelp_data:
            business_id = business.get("Business_ID")
            if not business_id:
                continue
            reviews_data.extend(fetch_yelp_reviews(business_id))
            for user in fetch_yelp_users(business_id):
                # Keep the first record per user so the users file has no
                # duplicate rows when someone reviewed several businesses.
                users_by_id.setdefault(user.get("User_ID"), user)
        users_data = list(users_by_id.values())

        print("Guardando datos en CSV...")
        # NOTE(review): the reviews are written under two file names, as in
        # the original script. Confirm whether both are really needed.
        outputs = [
            (google_data + yelp_data, "business_cleaned.csv"),
            (reviews_data, "reviews_cleaned.csv"),
            (users_data, "users_cleaned.csv"),
            (reviews_data, "review_cleaned.csv"),
        ]
        saved_files = []
        for rows, filename in outputs:
            # save_to_csv writes nothing for empty data, so skip such files
            # rather than uploading a missing or stale local file.
            if not rows:
                print(f"Sin datos para {filename}; se omite.")
                continue
            save_to_csv(rows, filename)
            saved_files.append(filename)

        if saved_files:
            # Clean the bucket only once fresh files exist locally, so a
            # failed fetch no longer leaves the bucket folder empty.
            print("Limpiando bucket...")
            clean_bucket(BUCKET_NAME, FOLDER_PATH)

            print("Subiendo archivos a Cloud Storage...")
            for filename in saved_files:
                upload_to_bucket(filename, BUCKET_NAME, FOLDER_PATH)

            print("Proceso completado con éxito.")
        else:
            print("No se obtuvieron datos; el bucket no se modificó.")

    except Exception as e:
        # Top-level boundary: report the failure instead of a raw traceback.
        print(f"Error: {str(e)}")

Limpiando bucket...
Carpeta Yelp/airFlow/raw/ en el bucket dataset-pf-gyelp limpiada.
Obteniendo datos...
Error: fetch_yelp_reviews() missing 1 required positional argument: 'business_id'
