# In [0]:
import os
import pandas as pd
from pyspark.sql import SparkSession
from koboextractor import KoboExtractor
from pyspark.sql.functions import col
from pyspark.sql.types import ArrayType, StringType, DoubleType

# Initialize Spark session
spark = SparkSession.builder.getOrCreate()

# Kobo credentials and API setup.
# Each setting is read from an environment variable first so the secret does
# not have to live in source control; the literals are kept only as
# backward-compatible fallbacks and should be removed once the environment
# (or a secret store) is configured.
api_token = (
    os.environ.get('KOBO_API_TOKEN')
    or '64deabcd2830d55dafc9a765cf2453c41fde07fc'
)
form_id = os.environ.get('KOBO_FORM_ID') or 'aKhrYSrKMa6Qtfgzvfjnhn'
kobo_base_url = (
    os.environ.get('KOBO_BASE_URL')
    or 'https://kobo.humanitarianresponse.info/api/v2'
)

# Initialize KoboExtractor
kobo = KoboExtractor(api_token, kobo_base_url)

def fetch_kobo_to_spark(api_token, form_id, base_url):
    """Fetch a KoboToolbox form's submissions and return a Spark DataFrame.

    Args:
        api_token: KoboToolbox API token used to authenticate the request.
        form_id: Identifier of the form whose data is requested.
        base_url: Base URL of the KoboToolbox API endpoint.

    Returns:
        A Spark DataFrame built from the flattened ``results`` list of the
        API response.

    Raises:
        RuntimeError: If fetching, flattening or converting the data fails
            for any reason. The original exception is chained as the cause.
    """
    try:
        # Build the client from the arguments so the function honours the
        # credentials and endpoint it was given instead of silently relying
        # on a module-level client.
        client = KoboExtractor(api_token, base_url)
        data = client.get_data(form_id)

        if "results" not in data:
            raise KeyError("'results' key not found in KoboToolbox API response.")

        results = data["results"]
        if not results:
            # Spark cannot infer a schema from an empty dataset; fail early
            # with a message that names the actual cause.
            raise ValueError(f"No submissions returned for form '{form_id}'.")

        # Flatten the JSON response into a tabular pandas DataFrame
        df = pd.json_normalize(results)

        # Optional: convert the timestamp field to datetime if present.
        # NOTE(review): Kobo metadata fields are usually underscore-prefixed
        # (e.g. `_submission_time`); confirm this column name is intended.
        if 'submission_time' in df.columns:
            df['submission_time'] = pd.to_datetime(df['submission_time'])

        # Convert pandas DataFrame to Spark DataFrame
        return spark.createDataFrame(df)
    except Exception as e:
        raise RuntimeError(f"Failed to fetch or process data: {e}") from e

def ingest_kobo_data(api_token, form_id, base_url):
    """Run one ingestion: fetch the form data and append it to a table.

    The data is fetched with ``fetch_kobo_to_spark``, selected metadata
    columns are cast to concrete array types, the result is appended to the
    ``kobo_submission`` table, and a 10-row preview is displayed.

    Any exception is caught and reported with ``print``; nothing is raised
    to the caller and nothing is returned.

    Args:
        api_token: KoboToolbox API token, passed through to the fetch step.
        form_id: Identifier of the form to ingest.
        base_url: Base URL of the KoboToolbox API endpoint.
    """
    # Columns that can come through as NullType arrays; casting them to known
    # element types keeps the schema compatible with Delta.
    array_casts = {
        "_attachments": ArrayType(StringType()),
        "_geolocation": ArrayType(DoubleType()),
        "_tags": ArrayType(StringType()),
        "_notes": ArrayType(StringType()),
    }

    try:
        # Fetch data and convert to Spark DataFrame
        spark_df = fetch_kobo_to_spark(api_token, form_id, base_url)

        # Only cast columns that are actually present: referencing a missing
        # column would raise and abort the whole ingestion.
        present = set(spark_df.columns)
        for name, target_type in array_casts.items():
            if name in present:
                spark_df = spark_df.withColumn(name, col(name).cast(target_type))

        # Write to Delta table.
        # NOTE(review): append mode adds every fetched row on each run, so
        # re-running looks like it would duplicate submissions - confirm
        # whether de-duplication is needed.
        spark_df.write.mode("append").saveAsTable("kobo_submission")

        # Optional: Display a preview of the latest data
        display(spark_df.limit(10))

    except Exception as e:
        print(f"Error during ingestion: {e}")

# Kick off a single ingestion run using the configuration defined above
ingest_kobo_data(
    api_token=api_token,
    form_id=form_id,
    base_url=kobo_base_url,
)
