In [None]:
#%pip install pyspark
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Configuration
# Credentials are read from environment variables so that secrets are never
# saved inside the notebook. Export these before starting the kernel:
#   AZURE_CLIENT_ID, AZURE_CLIENT_SECRET, AZURE_TENANT_ID, ADLS_ACCOUNT_NAME
CLIENT_ID = os.environ.get("AZURE_CLIENT_ID", "")
CLIENT_SECRET = os.environ.get("AZURE_CLIENT_SECRET", "")
TENANT_ID = os.environ.get("AZURE_TENANT_ID", "")
STORAGE_ACCOUNT_NAME = os.environ.get("ADLS_ACCOUNT_NAME", "")

# Fail fast with a readable message: an empty value would otherwise only show
# up later as a malformed token URL / adl:// URI or an authentication failure.
_missing_settings = [
    env_name
    for env_name, value in (
        ("AZURE_CLIENT_ID", CLIENT_ID),
        ("AZURE_CLIENT_SECRET", CLIENT_SECRET),
        ("AZURE_TENANT_ID", TENANT_ID),
        ("ADLS_ACCOUNT_NAME", STORAGE_ACCOUNT_NAME),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

# Create a Spark session configured for Azure Data Lake Storage Gen1: the
# `fs.adl.*` keys and the `adl://` scheme used below belong to the Gen1
# connector and use OAuth2 client-credential (service principal) auth.
# NOTE(review): the spark-xml package is not used anywhere in this notebook and
# is built for Scala 2.11; the jar that provides AdlFileSystem is not requested
# here, so it is presumably supplied by the cluster -- confirm.
spark = (
    SparkSession.builder
    .appName("AzureDataLakeQuery")
    .config("spark.jars.packages", "com.databricks:spark-xml_2.11:0.6.0")
    .config("spark.hadoop.fs.adl.oauth2.access.token.provider.type", "ClientCredential")
    .config("spark.hadoop.fs.adl.oauth2.client.id", CLIENT_ID)
    .config("spark.hadoop.fs.adl.oauth2.credential", CLIENT_SECRET)
    .config(
        "spark.hadoop.fs.adl.oauth2.refresh.url",
        f"https://login.microsoftonline.com/{TENANT_ID}/oauth2/token",
    )
    .config("spark.hadoop.fs.adl.impl", "org.apache.hadoop.fs.adl.AdlFileSystem")
    .getOrCreate()
)

# Read the CSV files from Azure Data Lake Store; the first row is treated as
# the header and column types are inferred from the data.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("recursiveFileLookup", "true")
    .csv(f"adl://{STORAGE_ACCOUNT_NAME}.azuredatalakestore.net/path/*.csv")
)

# Register the DataFrame as a temporary view so it can be queried with SQL.
df.createOrReplaceTempView("my_data")
result_df = spark.sql("SELECT * FROM my_data WHERE column1 = 'value1'")

# Show the query result
result_df.show()

In [None]:

# Apply the update to the data.
# Spark SQL cannot run UPDATE against a temporary view such as `my_data`, so
# the update is expressed as a column transformation on `df` (the DataFrame
# behind that view): rows where column2 == 'value2' get column1 = 'new_value',
# every other row keeps its current column1 value.
result_df = df.withColumn(
    "column1",
    F.when(F.col("column2") == "value2", F.lit("new_value")).otherwise(F.col("column1")),
)

# Write the updated data back to the Azure Data Lake Store as CSV.
# No write mode is set, so the write fails if the target already exists;
# .mode("overwrite") is usually not recommended as it can be difficult to
# ensure data consistency and recoverability.
# The target uses STORAGE_ACCOUNT_NAME from the configuration cell, the same
# store the data was read from.
(
    result_df.write
    .option("header", "true")
    .csv(f"adl://{STORAGE_ACCOUNT_NAME}.azuredatalakestore.net/path/to/output.csv")
)

In [None]:
# Stop the Spark session created in the first cell now that the query and
# write-back steps above are finished.
spark.stop()