# PySpark

In [1]:
import os
from pymongo.mongo_client import MongoClient
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.types import *
import pandas as pd
from datetime import datetime
from pyspark.sql.types import StructType, StructField, DoubleType, BooleanType, StringType, IntegerType, LongType, ArrayType, DateType

# IO manger

Spark Session

In [2]:
from contextlib import contextmanager

@contextmanager
def SparkIO(conf: SparkConf = SparkConf()):
    app_name = conf.get("spark.app.name")
    master = conf.get("spark.master")
    print(f'Create SparkSession app {app_name} with {master} mode')
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    try:
        yield spark
    finally:
        print(f'Stop SparkSession app {app_name}')
        spark.stop()


In [3]:
from pymongo.errors import ConnectionFailure
from contextlib import contextmanager
import os

@contextmanager
def MongodbIO():
    user = os.getenv("MONGODB_USER")
    password = os.getenv("MONGODB_PASSWORD")
    uri = f"mongodb+srv://{user}:{password}@python.zynpktu.mongodb.net/?retryWrites=true&w=majority"
    try:
        client = MongoClient(uri)
        print(f"MongoDB Connected")
        yield client
    except ConnectionFailure:
        print(f"Failed to connect with MongoDB")
        raise ConnectionFailure
    finally:
        print("Close connection to MongoDB")
        client.close()

## Bronze 

In [4]:
def getSchema(table_name):
    """This function create Pyspark Schema"""
    artist_schema = StructType([
    StructField(
        "_id", StringType(), True
    ),
    StructField(
        "external_urls",
        StructType([
            StructField("spotify", StringType(), True)
        ])
    ),
    StructField(
        "followers", 
        StructType([
            StructField("href", StringType(), True),
            StructField("total", IntegerType(), True)
        ])
    ),
    StructField(
        "genres",
        ArrayType(StringType(), True)      
    ),
    StructField("href", StringType(), True),
    StructField("id", StringType(), True),
    StructField(
        "images",
        ArrayType(
            StructType([
                StructField("height", IntegerType(), True),
                StructField("url", StringType(), True),
                StructField("width", IntegerType(), True)
            ])
        )
    ),
    StructField("name", StringType(), True),
    StructField("popularity", IntegerType(), True),
    StructField("type", StringType(), True),
    StructField("uri", StringType(), True)
])

    album_schema = StructType([
   StructField(
        "_id", StringType(), True
    ),
    StructField("album_group", StringType(), True),
    StructField("album_type", StringType(), True),
    StructField(
        "artists",
        ArrayType(
            StructType([
                StructField(
                    "external_urls",
                    StructType([
                        StructField("spotify", StringType(), True)
                    ])
                ),
                StructField("href", StringType(), True),
                StructField("id", StringType(), True),
                StructField("name", StringType(), True),
                StructField("type", StringType(), True),
                StructField("uri", StringType(), True)
            ])
        )
    ),
    StructField(
        "available_markets",
        ArrayType(StringType(), True)
    ),
    StructField(
        "external_urls",
        StructType([
            StructField("spotify", StringType(), True)
        ])
    ),
    StructField("href", StringType(), True),
    StructField("id", StringType(), True),
    StructField(
        "images",
        ArrayType(
            StructType([
                StructField("height", IntegerType(), True),
                StructField("url", StringType(), True),
                StructField("width", IntegerType(), True)
            ])
        )
    ),
    StructField("name", StringType(), True),
    StructField("release_date", StringType(), True),
    StructField("release_date_precision", StringType(), True),
    StructField("total_tracks", IntegerType(), True),
    StructField("type", StringType(), True),
    StructField("uri", StringType(), True)
])

    track_schema = StructType([
    StructField(
        "_id", StringType(), True
    ),
    StructField(
        "artists",
        ArrayType(
            StructType([
                StructField(
                    "external_urls",
                    StructType([
                        StructField("spotify", StringType(), True)
                    ])
                ),
                StructField("href", StringType(), True),
                StructField("id", StringType(), True),
                StructField("name", StringType(), True),
                StructField("type", StringType(), True),
                StructField("uri", StringType(), True)
            ])
        )
    ),
    StructField(
        "available_markets", 
        ArrayType(StringType(), True)
    ),
    StructField("disc_number", IntegerType(), True),
    StructField("duration_ms", LongType(), True),
    StructField("explicit", BooleanType(), True),
    StructField(
        "external_urls",
        StructType([
            StructField("spotify", StringType(), True)
        ])
    ),
    StructField("href", StringType(), True),
    StructField("id", StringType(), True),
    StructField("is_local", BooleanType(), True),
    StructField("name", StringType(), True),
    StructField("preview_url", StringType(), True),
    StructField("track_number", IntegerType(), True),
    StructField("type", StringType(), True),
    StructField("uri", StringType(), True)
])

    track_features_schema = StructType([
    StructField(
        "_id", StringType(), True
    ),
    StructField("danceability", DoubleType(), True),
    StructField("energy", DoubleType(), True),
    StructField("key", IntegerType(), True),
    StructField("loudness", DoubleType(), True),
    StructField("mode", IntegerType(), True),
    StructField("speechiness", DoubleType(), True),
    StructField("acousticness", DoubleType(), True),
    StructField("instrumentalness", DoubleType(), True),
    StructField("liveness", DoubleType(), True),
    StructField("valence", DoubleType(), True),
    StructField("tempo", DoubleType(), True),
    StructField("type", StringType(), True),
    StructField("id", StringType(), True),
    StructField("uri", StringType(), True),
    StructField("track_href", StringType(), True),
    StructField("analysis_url", StringType(), True),
    StructField("duration_ms", LongType(), True),
    StructField("time_signature", IntegerType(), True)
])
    if 'artist' in table_name:
        return artist_schema
    elif 'album' in table_name:
        return album_schema
    elif 'feature' in table_name:
        return track_features_schema
    else:
        return track_schema

In [5]:
def bronze_layer_task(spark: SparkSession, db: str, table_name: str) -> None:
    """Extract data from MongoDB to HDFS at bronze layer"""
    user = os.getenv("MONGODB_USER")
    password = os.getenv("MONGODB_PASSWORD")
    hdfs_uri = f"hdfs://namenode:8020/bronze_layer/{table_name}.parquet"
    uri = f"mongodb+srv://{user}:{password}@python.zynpktu.mongodb.net/?retryWrites=true&w=majority"
    
    spark_data = (spark.read.format("mongodb")
              .schema(getSchema(table_name))
              .option("uri", mongo_uri)
              .option('database', database_name)
              .option('collection', table_name)
              .load()
              .select([col for col in getSchema(table_name).fieldNames() if col != '_id']) # exclude _id field
              )

    print(f"Writing {table_name}")
    try:
        spark_data.write.parquet(hdfs_uri, mode="overwrite")
        print(f"Bronze: Successfully push {table_name}.parquet")
    except Exception as e:
        print(spark_data.printSchema())
        print(e)

def IngestHadoop(spark: SparkSession):
    """Extract data From MongoDb and Load to HDFS"""

    # Connect to MongoDB

    database_name = "remake_spotify_crawling_data"
    # database_name = os.getenv("MONGODB_DATABASE")
    
    
    with MongodbIO() as client:
        mongo_db = client[database_name] 
        collections = mongo_db.list_collection_names() #get all collectons
    
        #Running task concurrently
        for collection in collections:
            print(f"{collection} start being Ingested...")
            bronze_layer_task(spark, database_name, collection) #collection is also the name of table

There is 4 tables:
- artists_data.parquet
- songs_data.parquet
- genres_data.parquet
- albums_data.parquet

location: hdfs://namenode:8020/bronze_layer/{table_name}.parquet

## Silver

### Schema

![Schema](./spotify.png)

Target: 
- using pyspark Cleaning, droping duplicated, drop unusable column (Read [EDA](https://colab.research.google.com/drive/15uM8Uvj1I89zjtJrVn-Z7mvkfyCWo50T?usp=sharing)), format type, there are many duplicated observation.
- join dim artist and dim albums -> join_artist_albums table (clean table before merge) (task 1)
- clean genre, then write back to silver(task 2) -> clean_genre table
- clean songs (task 3) -> clean_songs table (return None)
- The location of silver: hdfs_uri = f"hdfs://namenode:8020/silver_layer/{table_name}.parquet" with table_name is name of result table


Requirements:
- Input of silver main task (spark session), Output: None
- silver main task may have many child tasks, concurrently or sequencially
- Child task input (spark session), any extended params or return base on you, ensure write back result in hdfs with related uri
- Writing (print out) logs every action, handle error and exception (raise it if neccesary)

Dont forget to add your main task to main function !

In [6]:
# Run some code here
def silver_layer_task(spark: SparkSession):
    '''Do some Cleaning tasks for silver layer'''
    # task 1
    # task 2
    # task 3 ...

# Main

In [7]:
def pipeline_B():
    """ELT pipeline with pyspark"""

    user = os.getenv("MONGODB_USER")
    password = os.getenv("MONGODB_PASSWORD")
    uri = f"mongodb+srv://{user}:{password}@python.zynpktu.mongodb.net/?retryWrites=true&w=majority"
    conf = (SparkConf().setAppName("ETL-app-{}".format(datetime.today()))
        .set("spark.executor.memory", "2g")
        .set("spark.mongodb.read.connection.uri",uri)
        .set("spark.mongodb.write.connection.uri", uri)
        .set("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.1")
        .setMaster("local[*]")
        )

    with SparkIO(conf) as spark:
        IngestHadoop(spark) # <----- bronze task
        # add silver tasks here <-------
        

In [22]:
%%time
pipeline_B()

Create SparkSession app ETL-app-2023-12-02 09:40:29.208405 with local[*] mode
MongoDB Connected
tracks_data start being Ingested...
Close connection to MongoDB
Stop SparkSession app ETL-app-2023-12-02 09:40:29.208405


NameError: name 'mongo_uri' is not defined

In [9]:
user = os.getenv("MONGODB_USER")
password = os.getenv("MONGODB_PASSWORD")
uri = f"mongodb+srv://{user}:{password}@python.zynpktu.mongodb.net/?retryWrites=true&w=majority"

conf = (SparkConf().setAppName("ETL-app-{}".format(datetime.today()))
        .set("spark.executor.memory", "2g")
        .set("spark.mongodb.read.connection.uri",uri)
        .set("spark.mongodb.write.connection.uri", uri)
        .set("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.1")
        .setMaster("local[*]")
        )

table_name = 'tracks_data'
hdfs_uri = f"hdfs://namenode:8020/bronze_layer/{table_name}.parquet"

spark = SparkSession.builder.config(conf=conf).getOrCreate()
df = spark.read.parquet(hdfs_uri, inferSchema=True)
df.count()



:: loading settings :: url = jar:file:/opt/conda/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a4292891-dfb1-4f4a-8d9f-8c11cc64de50;1.0
	confs: [default]
	found org.mongodb.spark#mongo-spark-connector_2.12;10.2.1 in central
	found org.mongodb#mongodb-driver-sync;4.8.2 in central
	[4.8.2] org.mongodb#mongodb-driver-sync;[4.8.1,4.8.99)
	found org.mongodb#bson;4.8.2 in central
	found org.mongodb#mongodb-driver-core;4.8.2 in central
	found org.mongodb#bson-record-codec;4.8.2 in central
downloading https://repo1.maven.org/maven2/org/mongodb/spark/mongo-spark-connector_2.12/10.2.1/mongo-spark-connector_2.12-10.2.1.jar ...
	[SUCCESSFUL ] org.mongodb.spark#mongo-spark-connector_2.12;10.2.1!mongo-spark-connector_2.12.jar (1255ms)
downloading https://repo1.maven.org/maven2/org/mongodb/mongodb-driver-sync/4.8.2/mongodb-driver-sync-4.8.2.jar ...
	[SU

144543

In [12]:
df.count()

9806

In [11]:
df.printSchema()

root
 |-- danceability: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- key: integer (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: integer (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- type: string (nullable = true)
 |-- id: string (nullable = true)
 |-- uri: string (nullable = true)
 |-- track_href: string (nullable = true)
 |-- analysis_url: string (nullable = true)
 |-- duration_ms: long (nullable = true)
 |-- time_signature: integer (nullable = true)

