In [2]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import pyspark
from delta import *

# Session builder with the Delta SQL extension registered and DeltaCatalog set
# as the implementation of the default `spark_catalog`.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# configure_spark_with_delta_pip is presumably provided by the `delta` star
# import; its result is used to create (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [3]:
# Load the cleansed gold-layer Delta table (it carries both spotify_* and
# ticketmaster_* columns) and print its leading rows.
ticketmaster = (
    spark.read
    .format('delta')
    .load('/home/mlops/project/DeltaLake/gold_data/ticketmaster_cleansed_table')
)
ticketmaster.show()

+-----------------+--------------------+--------------------+--------------------+----------------+------------------+------------+--------------------+--------------------+---------------+-------------------+---------------------+------------------+----------------------+----------------------+--------------------+--------------------+-----------------+-----------------------+---------------------+----------------------+------------------------------+---------------------------+-----------------------+--------------------------+--------------------------+--------------------+--------------------------+---------------------------+----------------------------+
|spotify_followers|      spotify_genres|        spotify_href|          spotify_id|    spotify_name|spotify_popularity|spotify_type|         spotify_uri|   ticketmaster_name|ticketmaster_id|ticketmaster_locale|ticketmaster_distance|ticketmaster_units|ticketmaster_price_min|ticketmaster_price_max|         spotify_url|   spotify_imag

In [4]:
# Load the bronze-layer genre map: one row per Spotify genre with its
# 18-category and 9-category main genre, then print its leading rows.
genre_map = (
    spark.read
    .format('delta')
    .load('/home/mlops/project/DeltaLake/bronze_data/genre_map_table')
)
genre_map.show()


+--------------+--------------------+--------------------+
|        genres|       main_genre_18|        main_genre_9|
+--------------+--------------------+--------------------+
|           pop|                 Pop|                 Pop|
|     dance pop|                 Pop|                 Pop|
|           rap|     Rap and Hip-Hop|     Rap and Hip-Hop|
|          rock|                Rock|Rock and Heavy Metal|
|         latin|Latin American music|Latin American music|
|       pop rap|                 Pop|                 Pop|
|       hip hop|     Rap and Hip-Hop|     Rap and Hip-Hop|
|   trap latino|Latin American music|Latin American music|
|          trap|     Rap and Hip-Hop|     Rap and Hip-Hop|
|   modern rock|                Rock|Rock and Heavy Metal|
|           edm|Dance Music, Tech...|  Electronical music|
| post-teen pop|                 Pop|                 Pop|
|     reggaeton|Latin American music|Latin American music|
|     pop dance|Dance Music, Tech...|  Electronical musi

In [5]:
from pyspark.sql.types import Row


# Column-oriented copy of the genre map: each column name ('genres',
# 'main_genre_18', 'main_genre_9') maps to the list of that column's values,
# so the same index in every list refers to the same genre-map row.
genre_list = genre_map.to_pandas_on_spark().to_dict(orient='list')

# Alias for the input frame; the name is rebound to the enriched result below.
ticketmaster_genre_agg = ticketmaster
def f(row):
    """Return a copy of ``row`` extended with its mapped main genres.

    Each entry of ``row.spotify_genres`` is looked up in the module-level
    ``genre_list`` mapping (the first matching entry in ``genre_list['genres']``
    wins); entries without a match are ignored. The distinct matches are stored
    under ``spotify_genres_9`` and ``spotify_genres_18``.

    Args:
        row: a ``Row`` exposing a ``spotify_genres`` field holding an iterable
            of genre names, or ``None``.

    Returns:
        A new ``Row`` built from ``row.asDict()`` with the two list fields
        added (or overwritten if they already exist). A row whose
        ``spotify_genres`` is ``None`` or empty gets two empty lists.
    """
    known_genres = genre_list['genres']
    genre_set_9 = set()
    genre_set_18 = set()

    # A null genres value would make the loop raise TypeError; treat it as
    # "no genres" so one incomplete record cannot fail the whole job.
    for genre in row.spotify_genres or ():
        # Single scan per genre: .index() both tests membership and locates it.
        try:
            index = known_genres.index(genre)
        except ValueError:
            continue
        genre_set_9.add(genre_list['main_genre_9'][index])
        genre_set_18.add(genre_list['main_genre_18'][index])

    fields = row.asDict()
    fields['spotify_genres_9'] = list(genre_set_9)
    fields['spotify_genres_18'] = list(genre_set_18)
    return Row(**fields)

# Local import: the explicit schema below needs these Spark SQL types.
from pyspark.sql.types import ArrayType, StringType, StructField, StructType

# The two list-of-string fields that f adds to every row.
genre_columns = ('spotify_genres_9', 'spotify_genres_18')

# Build the output schema explicitly instead of letting toDF() infer it from
# the leading rows: inference cannot type a column whose sampled lists are all
# empty (or whose sampled values are all null), and it may change the source
# column types. Fields already named like the new columns are dropped first so
# the cell still works when the input frame already carries them (assuming
# they are then the trailing fields, as in the table this notebook writes).
# NOTE(review): if the target Delta table was written earlier with inferred
# types that differ from the source types, the overwrite may need
# `overwriteSchema` - confirm on first run.
genre_agg_schema = StructType(
    [field for field in ticketmaster_genre_agg.schema.fields if field.name not in genre_columns]
    + [StructField(name, ArrayType(StringType()), True) for name in genre_columns]
)

# Keep the intermediate RDD under its own name so `ticketmaster_genre_agg`
# is always a DataFrame.
genre_rows = ticketmaster_genre_agg.rdd.map(f)
ticketmaster_genre_agg = spark.createDataFrame(genre_rows, schema=genre_agg_schema)
ticketmaster_genre_agg.to_pandas_on_spark().head()

Unnamed: 0,spotify_followers,spotify_genres,spotify_href,spotify_id,spotify_name,spotify_popularity,spotify_type,spotify_uri,ticketmaster_name,ticketmaster_id,ticketmaster_locale,ticketmaster_distance,ticketmaster_units,ticketmaster_price_min,ticketmaster_price_max,spotify_url,spotify_image_url,ticketmaster_date,ticketmaster_venue_name,ticketmaster_venue_id,ticketmaster_venue_url,ticketmaster_venue_postal_code,ticketmaster_venue_timezone,ticketmaster_venue_city,ticketmaster_venue_country,ticketmaster_venue_address,ticketmaster_url,ticketmaster_artist_locale,ticketmaster_venue_latitude,ticketmaster_venue_longitude,spotify_genres_9,spotify_genres_18
0,66848,"[album rock, mellow gold, new wave pop, philly...",https://api.spotify.com/v1/artists/7uhvDINTTiD...,7uhvDINTTiD0XBrP9fquN1,The Hooters,56.0,artist,spotify:artist:7uhvDINTTiD0XBrP9fquN1,The Hooters,Z698xZC2Z17uPao,en-us,67.49,MILES,35.0,35.0,https://open.spotify.com/artist/7uhvDINTTiD0XB...,https://i.scdn.co/image/ab6761610000e5eb15896b...,2022-06-28,Musiktheater REX,Z698xZC2Za7r9,https://www.ticketmaster.de/venue/musiktheater...,64625,Europe/Berlin,Bensheim,Germany,Fabrikstraße 10,http://www.ticketmaster.de/artist/918988,en-us,49.68074,8.61964,"[Rock and Heavy Metal, Pop]","[Rock, Pop]"
1,760031,"[francoton, pop urbaine]",https://api.spotify.com/v1/artists/7gU9VyFRN3J...,7gU9VyFRN3JWPJ5oHOil60,Tayc,76.0,artist,spotify:artist:7gU9VyFRN3JWPJ5oHOil60,TAYC,rZ7SnyZ1Ad700d,fr-fr,67.72,MILES,26.0,26.0,https://open.spotify.com/artist/7gU9VyFRN3JWPJ...,https://i.scdn.co/image/ab6761610000e5ebce9a6f...,2021-12-16,LA LAITERIE,rZFSnyZeeFa,https://www.ticketmaster.fr/fr/salle/la-laiter...,67000,Europe/Paris,Strasbourg,France,"13,rue du Hohwald",http://www.ticketmaster.fr/artist/1044372,en-us,48.57609,7.729777,[Pop],[Pop]
2,876137,"[electro swing, nu jazz]",https://api.spotify.com/v1/artists/37J1PlAkhRK...,37J1PlAkhRK7yrZUtqaUpQ,Caravan Palace,66.0,artist,spotify:artist:37J1PlAkhRK7yrZUtqaUpQ,CARAVAN PALACE,rZ7SnyZ1Adk7qv,fr-fr,67.72,MILES,34.2,34.2,https://open.spotify.com/artist/37J1PlAkhRK7yr...,https://i.scdn.co/image/ab6761610000e5ebdedbe7...,2021-12-18,LA LAITERIE,rZFSnyZeeFa,https://www.ticketmaster.fr/fr/salle/la-laiter...,67000,Europe/Paris,Strasbourg,France,"13,rue du Hohwald",http://www.ticketmaster.fr/artist/141183,en-us,48.57609,7.729777,"[Electronical music, Rhythm music]","[Jazz, Dance Music, Techno and House]"
3,139871,[cali rap],https://api.spotify.com/v1/artists/1grN0519h2z...,1grN0519h2zYqpRtYbDZAl,Larry June,67.0,artist,spotify:artist:1grN0519h2zYqpRtYbDZAl,LARRY,rZ7SnyZ1AduOGE,fr-fr,67.72,MILES,26.0,26.0,https://open.spotify.com/artist/1grN0519h2zYqp...,https://i.scdn.co/image/ab6761610000e5eb1c0b98...,2022-01-15,LA LAITERIE,rZFSnyZeeFa,https://www.ticketmaster.fr/fr/salle/la-laiter...,67000,Europe/Paris,Strasbourg,France,"13,rue du Hohwald",,,48.57609,7.729777,[Rap and Hip-Hop],[Rap and Hip-Hop]
4,40482,"[electro swing, nu jazz]",https://api.spotify.com/v1/artists/6jX8VHUJomY...,6jX8VHUJomYSfi5Hobdmmn,Lyre Le Temps,47.0,artist,spotify:artist:6jX8VHUJomYSfi5Hobdmmn,LYRE LE TEMPS,rZ7SnyZ1AdGyqo,fr-fr,67.72,MILES,17.0,17.0,https://open.spotify.com/artist/6jX8VHUJomYSfi...,https://i.scdn.co/image/ab6761610000e5eb78c466...,2022-01-21,LA LAITERIE,rZFSnyZeeFa,https://www.ticketmaster.fr/fr/salle/la-laiter...,67000,Europe/Paris,Strasbourg,France,"13,rue du Hohwald",,,48.57609,7.729777,"[Electronical music, Rhythm music]","[Jazz, Dance Music, Techno and House]"


In [None]:

# Save the enriched frame in Delta format at the platinum path, using
# overwrite mode.
(
    ticketmaster_genre_agg.write
    .format("delta")
    .mode("overwrite")
    .save("/home/mlops/project/DeltaLake/platinum_data/ticketmaster_genre_agg_table")
)

In [None]:

# Load the table back from the platinum path and display it.
# NOTE: this rebinds `ticketmaster`, which earlier held the gold-layer frame.
ticketmaster = (
    spark.read
    .format('delta')
    .load('/home/mlops/project/DeltaLake/platinum_data/ticketmaster_genre_agg_table')
)
ticketmaster.to_pandas_on_spark()