In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.types import *
import pandas as pd
from datetime import datetime

In [2]:
from contextlib import contextmanager

@contextmanager
def SparkIO(conf: "SparkConf | None" = None):
    """Provide a SparkSession for the duration of a ``with`` block.

    The session is obtained with ``SparkSession.builder.getOrCreate()`` and is
    stopped when the block exits, whether it finishes normally or raises.

    Args:
        conf: Spark configuration handed to the session builder. When omitted,
            a fresh ``SparkConf()`` is created on every call. (A default of
            ``SparkConf()`` in the signature would be evaluated only once, at
            definition time, and the same mutable object would then be shared
            by every call.)

    Yields:
        The SparkSession built from ``conf``.
    """
    if conf is None:
        conf = SparkConf()
    app_name = conf.get("spark.app.name")
    master = conf.get("spark.master")
    print(f'Create SparkSession app {app_name} with {master} mode')
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    try:
        yield spark
    finally:
        # Always release the session, even if the body of the block failed.
        print(f'Stop SparkSession app {app_name}')
        spark.stop()


In [5]:
# Bronze-layer tables to load; each name is used as "<name>.parquet" on HDFS.
table_names = [
    "albums_data",
    "artists_data",
    "tracks_data",
    "tracks_features_data",
]

In [6]:
# Session configuration; the app name carries the current timestamp.
# NOTE(review): with a local[*] master the work runs inside the driver JVM, so
# "spark.executor.memory" probably has no effect here - confirm whether
# "spark.driver.memory" was the intended setting.
conf = (
    SparkConf()
    .setAppName("ELT-app-{}".format(datetime.today()))
    .set("spark.executor.memory", "2g")
    .setMaster("local[*]")
)

# One Parquet URI per bronze-layer table, in the order of `table_names`.
bronze_root = "hdfs://namenode:8020/bronze_layer"
hdfs_uri1, hdfs_uri2, hdfs_uri3, hdfs_uri4 = (
    f"{bronze_root}/{name}.parquet" for name in table_names[:4]
)

with SparkIO(conf) as spark:
    # Parquet files carry their own schema, so the CSV-only reader options
    # `header` / `inferSchema` are not passed. `toPandas()` collects each table
    # onto the driver, which keeps the frames usable after the session stops.
    albums, artist, track, track_feat = (
        spark.read.parquet(uri).toPandas()
        for uri in (hdfs_uri1, hdfs_uri2, hdfs_uri3, hdfs_uri4)
    )

Create SparkSession app ELT-app-2023-11-27 08:27:58.462619 with local[*] mode
Stop SparkSession app ELT-app-2023-11-27 08:27:58.462619


In [22]:
# Show the pandas dtype of every column in the `albums` frame.
albums.dtypes

album_group               object
album_type                object
artists                   object
available_markets         object
external_urls             object
href                      object
id                        object
images                    object
name                      object
release_date              object
release_date_precision    object
total_tracks               int64
type                      object
uri                       object
dtype: object

Requirements for the silver layer:
1. Tách cột để đưa tất cả các bảng về dạng chuẩn
2. Drop các columns không cần thiết
3. Format type (Spark)

In [28]:
# Preview only the first row of `albums`.
albums.head(n=1)

Unnamed: 0,album_group,album_type,artists,available_markets,external_urls,href,id,images,name,release_date,release_date_precision,total_tracks,type,uri
0,album,album,"[{'name': None, 'id': None, 'href': None, 'typ...","[AR, AU, AT, BE, BO, BR, BG, CL, CO, CR, CY, C...",{'spotify': 'https://open.spotify.com/album/6I...,https://api.spotify.com/v1/albums/6IfrO26rrFYA...,6IfrO26rrFYAEbKLPjYZF0,"[{'width': 640, 'url': None, 'height': 640}, {...","Live in Tokyo, Japan '99",2020-11-20,day,21,album,spotify:album:6IfrO26rrFYAEbKLPjYZF0
