In [1]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import pyspark
from delta import *

# Assemble the session builder: app name "MyApp", with the Delta SQL extension
# and the Delta catalog set through Spark config keys.
builder = (
    pyspark.sql.SparkSession.builder.appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# configure_spark_with_delta_pip is presumably supplied by the `delta` wildcard
# import in this cell; it wraps the builder before the session is created/reused.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# Load the genre mapping table from the bronze Delta location and print a preview.
genre_map = (
    spark.read
    .format('delta')
    .load('/home/mlops/project/DeltaLake/bronze_data/genre_map_table')
)
genre_map.show()

+--------------+--------------------+--------------------+
|        genres|       main_genre_18|        main_genre_9|
+--------------+--------------------+--------------------+
|           pop|                 Pop|                 Pop|
|     dance pop|                 Pop|                 Pop|
|           rap|     Rap and Hip-Hop|     Rap and Hip-Hop|
|          rock|                Rock|Rock and Heavy Metal|
|         latin|Latin American music|Latin American music|
|       pop rap|                 Pop|                 Pop|
|       hip hop|     Rap and Hip-Hop|     Rap and Hip-Hop|
|   trap latino|Latin American music|Latin American music|
|          trap|     Rap and Hip-Hop|     Rap and Hip-Hop|
|   modern rock|                Rock|Rock and Heavy Metal|
|           edm|Dance Music, Tech...|  Electronical music|
| post-teen pop|                 Pop|                 Pop|
|     reggaeton|Latin American music|Latin American music|
|     pop dance|Dance Music, Tech...|  Electronical musi

In [3]:
from pyspark.ml.feature import StringIndexer

# Start from the loaded genre map and append one "<column>_index" column per
# source column, fitting a separate StringIndexer for each in this order.
renamed_map = genre_map
for source_col in ("main_genre_9", "main_genre_18", "genres"):
    indexer = StringIndexer(inputCol=source_col, outputCol=source_col + "_index")
    renamed_map = indexer.fit(renamed_map).transform(renamed_map)

# Cast every generated index column to int (same column names, replaced in place).
for index_col in ("main_genre_9_index", "main_genre_18_index", "genres_index"):
    renamed_map = renamed_map.withColumn(index_col, renamed_map[index_col].cast('int'))

# Preview the result through the pandas-on-Spark view.
renamed_map.to_pandas_on_spark().head()

Unnamed: 0,genres,main_genre_18,main_genre_9,main_genre_9_index,main_genre_18_index,genres_index
0,pop,Pop,Pop,1,0,4115
1,dance pop,Pop,Pop,1,0,1276
2,rap,Rap and Hip-Hop,Rap and Hip-Hop,5,5,4327
3,rock,Rock,Rock and Heavy Metal,0,2,4469
4,latin,Latin American music,Latin American music,8,10,3049


In [4]:
# Persist the indexed mapping in Delta format; "overwrite" mode replaces
# whatever is already stored at the target path.
(
    renamed_map.write
    .format("delta")
    .mode("overwrite")
    .save("/home/mlops/project/DeltaLake/platinum_data/genre_map_indexed")
)


In [2]:
# Read the indexed mapping back from the platinum Delta location and preview
# its first rows through the pandas-on-Spark view.
# NOTE: this rebinds `genre_map`, which an earlier cell used for the bronze table.
genre_map = (
    spark.read
    .format('delta')
    .load('/home/mlops/project/DeltaLake/platinum_data/genre_map_indexed')
)
genre_map.to_pandas_on_spark().head()

Unnamed: 0,genres,main_genre_18,main_genre_9,main_genre_9_index,main_genre_18_index,genres_index
0,pop,Pop,Pop,1,0,4115
1,dance pop,Pop,Pop,1,0,1276
2,rap,Rap and Hip-Hop,Rap and Hip-Hop,5,5,4327
3,rock,Rock,Rock and Heavy Metal,0,2,4469
4,latin,Latin American music,Latin American music,8,10,3049
5,pop rap,Pop,Pop,1,0,4137
6,hip hop,Rap and Hip-Hop,Rap and Hip-Hop,5,5,2328
7,trap latino,Latin American music,Latin American music,8,10,5301
8,trap,Rap and Hip-Hop,Rap and Hip-Hop,5,5,5284
9,modern rock,Rock,Rock and Heavy Metal,0,2,3432
