In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.types import *
import pandas as pd
from datetime import datetime

In [2]:
from contextlib import contextmanager

@contextmanager
def SparkIO(conf: "SparkConf | None" = None):
    """Context manager that yields a SparkSession and stops it on exit.

    Args:
        conf: Spark configuration used to build (or fetch) the session.
            When omitted, a fresh ``SparkConf()`` is created for this call.

    Yields:
        The ``SparkSession`` returned by ``SparkSession.builder.getOrCreate()``.

    The session is stopped in a ``finally`` clause, so it is released even if
    the body of the ``with`` statement raises.
    """
    # Build the default lazily. A `SparkConf()` default in the signature is
    # evaluated once, at definition time, and the same mutable object would
    # then be shared by every call that omits `conf`.
    if conf is None:
        conf = SparkConf()

    app_name = conf.get("spark.app.name")
    master = conf.get("spark.master")
    print(f'Create SparkSession app {app_name} with {master} mode')
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    try:
        yield spark
    finally:
        # Always release the session, including on error inside the `with` body.
        print(f'Stop SparkSession app {app_name}')
        spark.stop()


In [6]:
# Spark configuration for this run; the app name embeds the current timestamp.
# NOTE(review): with a local[*] master, `spark.executor.memory` may have no
# effect -- confirm whether `spark.driver.memory` was intended.
conf = (
    SparkConf()
    .setAppName("ELT-app-{}".format(datetime.today()))
    .set("spark.executor.memory", "2g")
    .setMaster("local[*]")
)

with SparkIO(conf) as spark:
    table_name = "songs_data"
    hdfs_uri = f"hdfs://namenode:8020/bronze_layer/{table_name}.parquet"
    # Parquet files carry their own schema, so the CSV-only `header` and
    # `inferSchema` options are not passed here.
    songs_sdf = spark.read.parquet(hdfs_uri)
    # Materialise to pandas inside the `with` block, before the session stops.
    # `df` is the name used by the cells that follow.
    df = songs_sdf.toPandas()

Create SparkSession app ELT-app-2023-11-22 17:42:21.916176 with local[*] mode
Stop SparkSession app ELT-app-2023-11-22 17:42:21.916176


In [7]:
# Show the pandas DataFrame loaded above; the notebook repr truncates long
# frames to the leading and trailing rows (and elides middle columns).
df

Unnamed: 0,song_id,song_name,song_popularity,song_disc_number,song_explicit,song_is_playable,song_track_number,song_release_date,artist_id,album_id,...,song_loudness,song_mode,song_speechiness,song_acousticness,song_instrumentalness,song_liveness,song_valence,song_tempo,song_duration_ms,song_time_signature
0,29V3YjGY3JLBEgk00Z2iGS,Doin' the Sixty-Eight,0,1,False,,10,2019-02-05,7De2eIqeHTw091YeAkkYXV,11f0auoRuQj59A6hgNXCJc,...,-13.285,0.0,0.0594,0.5650,0.00007,0.3430,0.656,103.685997,264200.0,3.0
1,65Ix4a3ViPL0NGA21LdyeF,Reeds and Deeds,0,1,False,,11,2019-02-05,7De2eIqeHTw091YeAkkYXV,11f0auoRuQj59A6hgNXCJc,...,-13.804,1.0,0.0432,0.8110,0.04090,0.1070,0.355,124.863998,320533.0,4.0
2,1BvL9NYVbZvyT9K42LzdYE,Our Love Is Here to Stay,0,1,False,,12,2019-02-05,7De2eIqeHTw091YeAkkYXV,11f0auoRuQj59A6hgNXCJc,...,-14.063,0.0,0.0400,0.4520,0.01410,0.2250,0.470,120.836998,292267.0,4.0
3,38RreIWBv8FbnoCwZRbDiB,Limbo Boat,0,1,False,,13,2019-02-05,7De2eIqeHTw091YeAkkYXV,11f0auoRuQj59A6hgNXCJc,...,-10.894,0.0,0.0449,0.7390,0.24200,0.1290,0.805,200.016998,182480.0,4.0
4,1blaUhguqt3ASJNmUOLa8z,Moon Song,0,1,False,,14,2019-02-05,7De2eIqeHTw091YeAkkYXV,11f0auoRuQj59A6hgNXCJc,...,-13.560,1.0,0.0399,0.7950,0.33100,0.0706,0.462,125.746002,262333.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5988,6MDJINtgA4ooBJKSphnKE8,Hold On,23,1,False,,8,1986-11-24,6PAt558ZEZl0DmdXlnjMgD,02v9Z7vUiuWUlOlNzNtmPA,...,-5.330,1.0,0.0452,0.0696,0.01580,0.3120,0.586,109.644997,296533.0,4.0
5989,6QN2BvQnsPEKaFtRIQAo46,Miss You,29,1,False,,9,1986-11-24,6PAt558ZEZl0DmdXlnjMgD,02v9Z7vUiuWUlOlNzNtmPA,...,-5.478,1.0,0.0328,0.0246,0.00348,0.2290,0.901,179.483994,306413.0,4.0
5990,6jkTVhK4pduXUiuTs5saUI,Holy Mother,33,1,False,,10,1986-11-24,6PAt558ZEZl0DmdXlnjMgD,02v9Z7vUiuWUlOlNzNtmPA,...,-7.822,1.0,0.0236,0.3640,0.00000,0.1080,0.319,77.797997,295533.0,4.0
5991,3HGGfWcrrFBaeIykKl96Bg,Behind the Mask,39,1,False,,11,1986-11-24,6PAt558ZEZl0DmdXlnjMgD,02v9Z7vUiuWUlOlNzNtmPA,...,-5.381,0.0,0.0270,0.2400,0.00136,0.3650,0.809,118.685997,287800.0,4.0


In [9]:
# Boolean Series with one entry per row: True where the whole row repeats an
# earlier row, False otherwise.
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
5988    False
5989    False
5990    False
5991    False
5992    False
Length: 5993, dtype: bool