In [1]:
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

# Service-account key used to authenticate this notebook against Firebase.
cred = credentials.Certificate("/home/mlops/project/DeltaLake/firebaseServiceAccountKey.json")

# initialize_app() raises ValueError when the default app already exists, so
# re-running this cell in a live kernel would fail. Reuse the existing default
# app if there is one and only initialise on the first run.
try:
    firebase_admin.get_app()
except ValueError:
    firebase_admin.initialize_app(cred)

# Firestore client bound to the default app; used by later cells to read data.
db = firestore.client()

In [15]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import pyspark
from delta import *

# Configure a Spark session with Delta Lake support: register Delta's SQL
# extension and use the Delta catalog as the session catalog.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# configure_spark_with_delta_pip wraps the builder before the session is
# created (or an already-running one is reused via getOrCreate).
spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [87]:
# stream() yields document snapshots lazily and can only be iterated once.
# Materialise it into a list so that downstream cells can be re-run (or the
# documents inspected) without silently iterating over an exhausted stream.
docs = list(db.collection('spotify-user-data').stream())


In [49]:
def flatten(df):
    """Promote selected nested fields of ``df`` to top-level columns.

    Adds ``url`` (from ``external_urls.spotify``) and ``image_url`` (the
    ``url`` field of the first element of ``images``), replaces the
    ``followers`` column with its ``total`` field, and finally drops the
    ``external_urls`` and ``images`` columns.

    Args:
        df: DataFrame exposing ``external_urls``, ``followers`` and
            ``images`` columns.

    Returns:
        A new DataFrame with the flattened columns; ``df`` itself is not
        modified.
    """
    with_url = df.withColumn("url", df["external_urls"]["spotify"])
    # Overwrites the nested ``followers`` column with just its total.
    with_followers = with_url.withColumn(
        "followers", with_url["followers"]["total"]
    )
    with_image = with_followers.withColumn(
        "image_url", with_followers["images"][0]["url"]
    )
    return with_image.drop('external_urls', 'images')


In [88]:
import pyspark.pandas as ps

# Combine the 'items' of every Firestore document into a single Spark
# DataFrame, tagging each row with the document id and snapshot timestamps.
# spark_df stays None when no document contributes any items.
spark_df = None
for doc in docs:
    user_data = doc.to_dict() or {}
    items = user_data.get('items')
    if not items:
        # Nothing to load for documents with a missing or empty 'items' field.
        continue

    # Despite the name this is a pandas-on-Spark frame (pyspark.pandas),
    # with one row per entry of the document's 'items'.
    pandas_df = ps.DataFrame(items)
    pandas_df['uid'] = doc.id
    pandas_df['create_time'] = doc.create_time
    pandas_df['update_time'] = doc.update_time
    pandas_df['read_time'] = doc.read_time

    doc_df = flatten(pandas_df.to_spark())
    # unionByName resolves columns by name rather than by position.
    # No count() here: it would launch a Spark job over the whole accumulated
    # union on every iteration only to discard the result.
    spark_df = doc_df if spark_df is None else spark_df.unionByName(doc_df)

if spark_df is not None:
    spark_df.show()



+--------------------+--------------------+----------+---------+--------------------+---------------+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                href|                 uri|popularity|followers|                  id|           name|              genres|  type|                 uid|         create_time|         update_time|           read_time|                 url|           image_url|
+--------------------+--------------------+----------+---------+--------------------+---------------+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|https://api.spoti...|spotify:artist:3t...|        82|   148825|3t2iKODSDyzoDJw7A...|Bibi Blocksberg|         [hoerspiel]|artist|Q3t2J73kpIcMqGBmB...|2021-12-06 18:23:...|2022-01-08 11:26:...|2022-01-09 16:38:...|https://open.spot..

+--------------------+--------------------+----------+---------+--------------------+--------------------+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                href|                 uri|popularity|followers|                  id|                name|              genres|  type|                 uid|         create_time|         update_time|           read_time|                 url|           image_url|
+--------------------+--------------------+----------+---------+--------------------+--------------------+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|https://api.spoti...|spotify:artist:3t...|        82|   148825|3t2iKODSDyzoDJw7A...|     Bibi Blocksberg|         [hoerspiel]|artist|Q3t2J73kpIcMqGBmB...|2021-12-06 18:23:...|2022-01-08 11:26:...|2022-01-09 16:38:...

In [93]:
# Persist the combined frame as a Delta table, replacing any previous contents.
# The aggregation cell leaves spark_df as None when no document had items, so
# fail with an explicit message instead of an opaque AttributeError.
if spark_df is None:
    raise RuntimeError("spark_df is None: no Firestore documents with items were loaded, nothing to write")
spark_df.write.format("delta").mode("overwrite").save("/home/mlops/project/DeltaLake/bronze_data/spotify_user_data_table")

In [94]:

# Read the bronze Delta table back from disk and preview its first rows.
spotify = spark.read.load(
    '/home/mlops/project/DeltaLake/bronze_data/spotify_user_data_table',
    format='delta',
)
spotify.show()

+--------------------+--------------------+----------+---------+--------------------+--------------------+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                href|                 uri|popularity|followers|                  id|                name|              genres|  type|                 uid|         create_time|         update_time|           read_time|                 url|           image_url|
+--------------------+--------------------+----------+---------+--------------------+--------------------+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|https://api.spoti...|spotify:artist:71...|        83| 21527801|711MCceyCBcFnzjGY...|               AC/DC|[australian rock,...|artist|uagVX8DeaePP88Qz5...|2021-12-11 12:21:...|2021-12-11 12:21:...|2022-01-09 16:38:...