# PySpark

In [2]:
# if doesn't have pymongo yet
!pip install pymongo

Collecting pymongo
  Obtaining dependency information for pymongo from https://files.pythonhosted.org/packages/24/cb/c1824d7c5946c7750a4ce3e2b118b03b88975915f1d060f1f3ec5d9f49d7/pymongo-4.6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading pymongo-4.6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Obtaining dependency information for dnspython<3.0.0,>=1.16.0 from https://files.pythonhosted.org/packages/f6/b4/0a9bee52c50f226a3cbfb54263d02bb421c7f2adc136520729c2c689c1e5/dnspython-2.4.2-py3-none-any.whl.metadata
  Downloading dnspython-2.4.2-py3-none-any.whl.metadata (4.9 kB)
Downloading pymongo-4.6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (680 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m680.8/680.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading dnspython-2.4.2-py3-none-any.whl (300 kB)
[2K   [9

In [7]:
import os
from pymongo.mongo_client import MongoClient
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.types import *
import pandas as pd
from datetime import datetime

# IO manager

Spark Session

In [8]:
from contextlib import contextmanager

@contextmanager
def SparkIO(conf: SparkConf = None):
    """Context manager that yields a SparkSession and stops it on exit.

    Args:
        conf: Spark configuration used to build the session. When omitted, a
            fresh ``SparkConf()`` is created for this call. (A ``SparkConf()``
            default in the signature would be evaluated once at definition
            time and the same mutable object shared by every default call.)

    Yields:
        The SparkSession returned by ``SparkSession.builder.getOrCreate()``.
    """
    if conf is None:
        conf = SparkConf()

    app_name = conf.get("spark.app.name")
    master = conf.get("spark.master")
    print(f'Create SparkSession app {app_name} with {master} mode')
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    try:
        yield spark
    finally:
        # Always stop the session, even when the with-body raised.
        print(f'Stop SparkSession app {app_name}')
        spark.stop()


In [9]:
from pymongo.errors import ConnectionFailure
from contextlib import contextmanager
import os

@contextmanager
def MongodbIO():
    """Context manager that yields a connected MongoClient and closes it on exit.

    Credentials are read from the MONGODB_USER and MONGODB_PASSWORD
    environment variables.

    Yields:
        A MongoClient whose connection has been checked with a ``ping``.

    Raises:
        ValueError: if either credential environment variable is missing/empty.
        ConnectionFailure: re-raised unchanged when the server cannot be reached.
    """
    # Function-scope import so this cell does not depend on the import cell.
    from urllib.parse import quote_plus

    user = os.getenv("MONGODB_USER")
    password = os.getenv("MONGODB_PASSWORD")
    if not user or not password:
        # Fail fast instead of silently building a URI containing "None".
        raise ValueError(
            "MONGODB_USER and MONGODB_PASSWORD environment variables must be set"
        )

    # Reserved characters (e.g. '@', ':', '/') in credentials must be
    # percent-escaped before being embedded in a MongoDB URI.
    uri = (
        f"mongodb+srv://{quote_plus(user)}:{quote_plus(password)}"
        "@python.zynpktu.mongodb.net/?retryWrites=true&w=majority"
    )

    # Bind the name before the try so the finally clause can never hit an
    # unbound variable when MongoClient() itself raises.
    client = None
    try:
        client = MongoClient(uri)
        # MongoClient connects lazily; ping so that connection problems surface
        # here rather than on the first real query.
        client.admin.command("ping")
        print("MongoDB Connected")
        yield client
    except ConnectionFailure:
        print("Failed to connect with MongoDB")
        # Bare raise keeps the original exception instance, message and traceback.
        raise
    finally:
        if client is not None:
            print("Close connection to MongoDB")
            client.close()

## Bronze 

In [10]:
def createSchema(df: pd.DataFrame):
    """Build a PySpark StructType from the column dtypes of a pandas DataFrame.

    The mapping is driven by the string form of each column's dtype:
      - 'object'                      -> StringType
      - int8/int16/int32/uint8/uint16 -> IntegerType (fits in 32 bits)
      - any other dtype containing 'int' (e.g. int64) -> LongType
      - dtype containing 'bool'       -> BooleanType
      - float16/float32               -> FloatType
      - any other dtype containing 'float' (e.g. float64), or 'double'
                                      -> DoubleType
      - anything else                 -> StringType

    Args:
        df: pandas DataFrame whose columns define the schema.

    Returns:
        StructType with one nullable StructField per column, in column order.
    """
    # Integer dtypes whose whole value range fits in Spark's 32-bit IntegerType.
    narrow_ints = ('int8', 'int16', 'int32', 'uint8', 'uint16')
    # Float dtypes that fit in Spark's single-precision FloatType.
    narrow_floats = ('float16', 'float32')

    field = []

    for col in df.columns:
        dtype = str(df[col].dtype)

        if dtype == 'object':
            field_type = StringType()
        elif 'int' in dtype:
            # pandas defaults to int64; mapping it to IntegerType would put
            # 64-bit values into a 32-bit column.
            field_type = IntegerType() if dtype in narrow_ints else LongType()
        elif 'bool' in dtype:
            field_type = BooleanType()
        elif 'float' in dtype or dtype == 'double':
            # pandas defaults to float64; FloatType would lose precision.
            field_type = FloatType() if dtype in narrow_floats else DoubleType()
        else:
            field_type = StringType()

        field.append(StructField(col, field_type, True))

    return StructType(field)


def bronze_layer_task(collection, spark: SparkSession, table_name: str) -> None:
    """Extract one MongoDB collection and write it to the HDFS bronze layer.

    All documents of the collection are loaded into a pandas DataFrame,
    converted to a Spark DataFrame and written as Parquet (overwrite mode) to
    ``hdfs://namenode:8020/bronze_layer/{table_name}.parquet``.

    Args:
        collection: MongoDB collection object to read from (its ``find`` is called).
        spark: active SparkSession used to build and write the DataFrame.
        table_name: name used for the output Parquet path and in log messages.

    Returns:
        None. An empty collection is skipped with a log message and nothing
        is written.
    """

    hdfs_uri = f"hdfs://namenode:8020/bronze_layer/{table_name}.parquet"
    # The projection {"_id": 0} drops MongoDB's _id field from every document.
    mongo_data = pd.DataFrame(list(collection.find({}, {"_id": 0})))

    if mongo_data.empty:
        # No rows (or no columns) means there is no schema to infer or build;
        # skip explicitly instead of pushing an empty schema through Spark.
        print(f"Bronze: Skip {table_name}, collection has no data")
        return

    try:
        # First attempt: let Spark infer the column types from the data.
        spark_data = spark.createDataFrame(mongo_data, schema=mongo_data.columns.tolist())
    except Exception as e:
        # Inference can fail (e.g. a type cannot be determined); fall back to
        # an explicit schema derived from the pandas dtypes.
        print(f"Error to Create Spark DataFrame {table_name} {e}")
        print(f"Start Create Schema for {table_name}")

        schema = createSchema(mongo_data)
        spark_data = spark.createDataFrame(mongo_data, schema=schema)

    print(f"Writing {table_name}")
    spark_data.write.parquet(hdfs_uri, mode="overwrite")
    print(f"Bronze: Successfully push {table_name}.parquet")

def IngestHadoop(spark: SparkSession):
    """Extract every collection of the MongoDB database and load it to HDFS.

    The database name is read from the MONGODB_DATABASE environment variable.
    Each collection is passed to ``bronze_layer_task``, using the collection
    name as the table name.

    Args:
        spark: active SparkSession handed to each bronze task.

    Raises:
        ValueError: if MONGODB_DATABASE is missing or empty.
    """

    database_name = os.getenv("MONGODB_DATABASE")
    if not database_name:
        # Fail before opening a connection rather than indexing the client
        # with None.
        raise ValueError("MONGODB_DATABASE environment variable must be set")

    # Connect to MongoDB
    with MongodbIO() as client:
        mongo_db = client[database_name]
        collections = mongo_db.list_collection_names()  # get all collections

        # Collections are ingested one after another (sequentially).
        for collection in collections:
            print(f"{collection} start being Ingested...")
            # The collection name doubles as the table name.
            bronze_layer_task(mongo_db[collection], spark, collection)

There are 4 tables:
- artists_data.parquet
- songs_data.parquet
- genres_data.parquet
- albums_data.parquet

location: hdfs://namenode:8020/bronze_layer/{table_name}.parquet

## Silver

### Schema

![Schema](./spotify.png)

Target: 
- Use PySpark to clean the data: drop duplicates (there are many duplicated observations), drop unusable columns (read the [EDA](https://colab.research.google.com/drive/15uM8Uvj1I89zjtJrVn-Z7mvkfyCWo50T?usp=sharing)), and format column types.
- join dim artist and dim albums -> join_artist_albums table (clean table before merge) (task 1)
- clean genre, then write back to silver(task 2) -> clean_genre table
- clean songs (task 3) -> clean_songs table (return None)
- The location of the silver layer: hdfs_uri = f"hdfs://namenode:8020/silver_layer/{table_name}.parquet", where table_name is the name of the result table


Requirements:
- Input of the silver main task: the Spark session. Output: None
- The silver main task may have many child tasks, run concurrently or sequentially
- Child task input: the Spark session. Any extra parameters or return values are up to you; make sure each result is written back to HDFS at its related URI
- Write (print out) a log for every action, and handle errors and exceptions (raise them if necessary)

Don't forget to add your main task to the main function!

In [11]:
# Run some code here
def silver_layer_task(spark: SparkSession) -> None:
    '''Do some Cleaning tasks for silver layer

    Placeholder: no cleaning logic is implemented yet, so calling this
    function does nothing and returns None.

    Args:
        spark: SparkSession intended for the silver-layer child tasks
            (currently unused).
    '''
    # TODO: the child tasks below are not implemented yet; the descriptions
    # follow the "Target" list in this notebook's Silver section.
    # task 1: join artists and albums -> join_artist_albums table
    # task 2: clean genres -> clean_genre table
    # task 3 ...: clean songs -> clean_songs table

# Main

In [12]:
def pipeline_B():
    """Run the PySpark ELT pipeline inside a managed SparkSession.

    The Spark application is named with the current timestamp and runs in
    local[*] mode. Only the bronze ingestion step is wired in at the moment.
    """
    app_name = f"ELT-app-{datetime.today()}"

    conf = SparkConf()
    conf.setAppName(app_name)
    conf.setMaster("local[*]")

    with SparkIO(conf) as spark:
        # bronze task: MongoDB -> HDFS bronze layer
        IngestHadoop(spark)
        # add silver tasks here <-------
        

In [13]:
# Run the pipeline end to end; progress is reported via printed log lines.
pipeline_B()

Create SparkSession app ELT-app-2023-11-20 10:46:11.575465 with local[*] mode
MongoDB Connected
artists_data start being Ingested...
Writing artists_data
Bronze: Successfully push artists_data.parquet
songs_data start being Ingested...
Error to Create Spark DataFrame songs_data [CANNOT_DETERMINE_TYPE] Some of types cannot be determined after inferring.
Start Create Schema for songs_data
Writing songs_data
Bronze: Successfully push songs_data.parquet
albums_data start being Ingested...
Writing albums_data
Bronze: Successfully push albums_data.parquet
genres_data start being Ingested...
Writing genres_data
Bronze: Successfully push genres_data.parquet
Close connection to MongoDB
Stop SparkSession app ELT-app-2023-11-20 10:46:11.575465
