In [1]:
import os

from pyspark.sql import SparkSession
import pyspark.sql.functions as func
from pyspark.sql.types import *

In [2]:
spark = SparkSession.builder\
    .master("local")\
    .appName("test")\
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/29 13:39:09 WARN util.Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/04/29 13:39:09 WARN util.Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/04/29 13:39:09 WARN util.Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [None]:
source_column_list = "store_type_id, type"
target_column_list = "'store_type_id', 'store_type_name'"
rename_str = ".withColumnRenamed('type', 'store_type_name')"
csv_schema_str = 'StructType([StructField("store_type_id", IntegerType(), False), StructField("type", StringType(), True)])'
csv_schema = eval(csv_schema_str)

In [None]:
bronze_store_type_df = spark.read \
        .option("header", True) \
        .schema(csv_schema) \
        .csv(os.path.join('/', 'datalake', 'bronze', 'dshop_bu', 'store_types', '2022-04-27', 'store_types.csv'))

In [None]:
display(bronze_store_type_df)

In [None]:
bronze_store_type_df.show()

In [None]:
silver_store_type_str = "bronze_store_type_df.where(func.col('type').isNotNull())" \
                                f"{rename_str}.dropDuplicates()" \
                                f".select({target_column_list})"

In [None]:
silver_store_type_df = eval(silver_store_type_str)

In [None]:
display(silver_store_type_df)

In [None]:
silver_store_type_df.show()

In [None]:
source_column_list = "id, fullname, location_area_id"
target_column_list = "'client_id', 'client_name', 'location_area_id'"
rename_str = ".withColumnRenamed('id', 'client_id').withColumnRenamed('fullname', 'client_name')"
csv_schema_str = 'StructType([StructField("id", IntegerType(), False), StructField("fullname", StringType(), True), StructField("location_area_id", IntegerType(), True)])'
csv_schema = eval(csv_schema_str)

In [None]:
bronze_client_df = spark.read \
            .option("header", True) \
            .schema(csv_schema) \
            .csv(os.path.join('/', 'datalake', 'bronze', 'dshop_bu', 'clients', '2022-04-27', 'clients.csv'))

In [None]:
silver_client_str = "bronze_client_df.where(func.col('fullname').isNotNull())" \
                            f"{rename_str}.dropDuplicates()" \
                            f".select({target_column_list})"

In [None]:
silver_client_df = eval(silver_client_str)

In [None]:
display(silver_client_df)

In [None]:
silver_client_df.show()

In [None]:
bronze_aisles_df = spark.read \
        .load(os.path.join('/', 'datalake', 'bronze', 'dshop_bu', 'aisles', '2022-03-04', 'aisles.csv'),
              header="true",
              inferSchema="true",
              format="csv"
              )

In [None]:
silver_aisles_df = bronze_aisles_df \
        .where(func.col('aisle').isNotNull()) \
        .withColumnRenamed('aisle', 'aisle_name') \
        .dropDuplicates()

In [None]:
silver_aisles_df.show()
display(silver_aisles_df)

In [None]:
bronze_departments_df = spark.read \
        .load(os.path.join('/', 'datalake', 'bronze', 'dshop_bu', 'departments', '2022-03-04', 'departments.csv'),
              header="true",
              inferSchema="true",
              format="csv"
              )

In [None]:
silver_departments_df = bronze_departments_df \
        .where(func.col('department').isNotNull()) \
        .withColumnRenamed('department', 'department_name') \
        .dropDuplicates()

In [None]:
silver_departments_df.show()
display(silver_departments_df)

In [50]:
source_column_list = "order_id, product_id, client_id, store_id, quantity, order_date"
target_column_list = "'order_id', 'product_id', 'client_id', 'store_id', 'quantity', 'order_date'"
rename_str = ""
csv_schema_str = 'StructType([StructField("order_id", IntegerType(), True), StructField("product_id", IntegerType(), True), StructField("client_id", IntegerType(), True), StructField("store_id", IntegerType(), True), StructField("quantity", IntegerType(), True), StructField("order_date", DateType(), True)])'
csv_schema = eval(csv_schema_str)
order_date = '2022-04-28'

In [51]:
bronze_order_df = spark.read \
        .option("header", True) \
        .schema(csv_schema) \
        .csv(os.path.join('/', 'datalake', 'bronze', 'dshop_bu', 'orders', '2022-04-29', 'orders.csv'))

In [52]:
display(bronze_order_df)

DataFrame[order_id: int, product_id: int, client_id: int, store_id: int, quantity: int, order_date: date]

In [53]:
bronze_order_df.show()

+--------+----------+---------+--------+--------+----------+
|order_id|product_id|client_id|store_id|quantity|order_date|
+--------+----------+---------+--------+--------+----------+
+--------+----------+---------+--------+--------+----------+



In [54]:
bronze_order_df.count()

0

In [55]:
silver_order_str = "bronze_order_df.where(bronze_order_df.order_id.isNotNull())" \
                           ".where(bronze_order_df.order_date.isNotNull())" \
                           ".where(bronze_order_df.quantity.isNotNull())" \
                           f".where(bronze_order_df.order_date == '{order_date}')" \
                           f"{rename_str}.withColumn('order_month', func.month('order_date'))" \
                           ".dropDuplicates()" \
                           f".select({target_column_list}, 'order_month')"

In [56]:
silver_order_df = eval(silver_order_str)

In [57]:
display(silver_order_df)

DataFrame[order_id: int, product_id: int, client_id: int, store_id: int, quantity: int, order_date: date, order_month: int]

In [58]:
silver_order_df.show()

+--------+----------+---------+--------+--------+----------+-----------+
|order_id|product_id|client_id|store_id|quantity|order_date|order_month|
+--------+----------+---------+--------+--------+----------+-----------+
+--------+----------+---------+--------+--------+----------+-----------+



In [59]:
silver_order_df.write \
            .partitionBy('order_month') \
            .parquet(os.path.join('/', 'datalake', 'silver', 'dshop_bu', 'order')
                     , mode='append')

                                                                                

In [60]:
silver_date_df = silver_order_df \
            .select(silver_order_df.order_date.alias('date')) \
            .distinct()

In [61]:
silver_date_str = "silver_date_df" \
                          f".where(silver_date_df.date == '{order_date}')" \
                          ".select(silver_date_df.date, " \
                          "func.dayofmonth('date').alias('day'), " \
                          "func.month('date').alias('month'), " \
                          "func.quarter('date').alias('quarter'), " \
                          "func.year('date').alias('year'), " \
                          "func.dayofweek('date').alias('week_day'), " \
                          "func.weekofyear('date').alias('week') )"

In [62]:
silver_date_df = eval(silver_date_str)

In [63]:
silver_date_df.show()

                                                                                

+----+---+-----+-------+----+--------+----+
|date|day|month|quarter|year|week_day|week|
+----+---+-----+-------+----+--------+----+
+----+---+-----+-------+----+--------+----+



In [64]:
silver_date_df.write \
            .parquet(os.path.join('/', 'datalake', 'silver', 'dshop_bu', 'date')
                     , mode='append')

                                                                                

In [65]:
date_df = spark.read.parquet(os.path.join('/', 'datalake', 'silver', 'dshop_bu', 'date'))

In [66]:
date_df.count()

120

In [67]:
order_df = spark.read.parquet(os.path.join('/', 'datalake', 'silver', 'dshop_bu', 'order'))

In [68]:
order_df.count()

                                                                                

8112551

In [69]:
order_df.show()

+--------+----------+---------+--------+--------+----------+-----------+
|order_id|product_id|client_id|store_id|quantity|order_date|order_month|
+--------+----------+---------+--------+--------+----------+-----------+
|  197879|     27427|      907|      28|       1|2021-03-01|          3|
|  197918|      8294|     1064|       9|       1|2021-03-01|          3|
|  197946|      6137|      266|      25|       1|2021-03-01|          3|
|  197953|     33561|       44|      19|       1|2021-03-01|          3|
|  197953|     45870|       44|      19|       1|2021-03-01|          3|
|  198007|     28701|      116|      24|       1|2021-03-01|          3|
|  198019|      5466|      204|      22|       1|2021-03-01|          3|
|  198175|     22209|      189|      32|       1|2021-03-01|          3|
|  198179|     15190|     1304|       2|       1|2021-03-01|          3|
|  198220|     22511|      778|       7|       1|2021-03-01|          3|
|  198262|     44792|      458|      22|       1|20

In [29]:
order_gold_df = order_df \
            .where(order_df.order_date == order_date) \
            .select('order_id', 'order_month', 'order_date', 'store_id', 'client_id', 'product_id', 'quantity')

In [30]:
order_gold_df.count()

                                                                                

8112551

In [31]:
order_gold_df.first()

Row(order_id=197879, order_month=3, order_date=datetime.date(2021, 3, 1), store_id=28, client_id=907, product_id=27427, quantity=1)

In [32]:
gp_url = "jdbc:postgresql://192.168.1.56:5433/data_mart"
gp_properties = {"user": "gpuser", "password": "secret"}

In [33]:
order_gold_df.write \
            .jdbc(gp_url
                  , table="public.fact_order"
                  , properties=gp_properties
                  , mode="overwrite")

Py4JJavaError: An error occurred while calling o170.jdbc.
: java.sql.SQLException: No suitable driver
	at java.sql.DriverManager.getDriver(DriverManager.java:315)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.$anonfun$driverClass$2(JDBCOptions.scala:108)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:108)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcOptionsInWrite.<init>(JDBCOptions.scala:215)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcOptionsInWrite.<init>(JDBCOptions.scala:219)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:45)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:46)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:90)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:132)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:131)
	at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:989)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:772)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:989)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:438)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:415)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:301)
	at org.apache.spark.sql.DataFrameWriter.jdbc(DataFrameWriter.scala:817)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [None]:
bronze_oos_df = spark.read \
        .load(os.path.join('/', 'datalake', 'bronze', 'rd_payload', '2021-07-06', 'api_values.json'),
              header="true",
              inferSchema="true",
              format="json"
              )

In [None]:
display(bronze_oos_df)

In [None]:
bronze_oos_df.count()

In [None]:
silver_oos_df = bronze_oos_df \
            .withColumn('date', func.col('date').cast('date')) \
            .withColumnRenamed('date', 'process_date') \
            .dropDuplicates()

In [None]:
display(silver_oos_df)

In [None]:
silver_oos_df.first()

In [None]:
silver_oos_df.count()

In [70]:
oos_df = spark.read.parquet(os.path.join('/', 'datalake', 'silver', 'rd_payload'))

In [71]:
oos_df.where(oos_df.process_date == '2021-07-05').show()

+----------+------------+
|product_id|process_date|
+----------+------------+
|      1161|  2021-07-05|
|     22924|  2021-07-05|
|      4964|  2021-07-05|
|      5255|  2021-07-05|
|     31303|  2021-07-05|
|     45621|  2021-07-05|
|      3738|  2021-07-05|
|     21412|  2021-07-05|
|     37544|  2021-07-05|
|      5644|  2021-07-05|
|       616|  2021-07-05|
|     28932|  2021-07-05|
|     27794|  2021-07-05|
|     46493|  2021-07-05|
|     25390|  2021-07-05|
|      1161|  2021-07-05|
|     22924|  2021-07-05|
|      4964|  2021-07-05|
|      5255|  2021-07-05|
|     31303|  2021-07-05|
+----------+------------+
only showing top 20 rows



In [73]:
oos_df.where(oos_df.process_date == '2021-07-05').distinct().count()

                                                                                

1532

In [74]:
oos_df.where(oos_df.process_date == '2021-07-06').show()

+----------+------------+
|product_id|process_date|
+----------+------------+
|     29268|  2021-07-06|
|     22077|  2021-07-06|
|     16719|  2021-07-06|
|     16061|  2021-07-06|
|     23007|  2021-07-06|
|     27394|  2021-07-06|
|     26444|  2021-07-06|
|     28396|  2021-07-06|
|     33296|  2021-07-06|
|     21584|  2021-07-06|
|     43107|  2021-07-06|
|     12785|  2021-07-06|
|      5292|  2021-07-06|
|       277|  2021-07-06|
|     20342|  2021-07-06|
|      5543|  2021-07-06|
|     45254|  2021-07-06|
|     44475|  2021-07-06|
|      9712|  2021-07-06|
|     29268|  2021-07-06|
+----------+------------+
only showing top 20 rows



In [76]:
oos_df.where(oos_df.process_date == '2021-07-06').distinct().count()

                                                                                

1762

In [77]:
oos_df.count()

                                                                                

9391

In [34]:
bronze_currency_df = spark.read \
        .load(os.path.join('/', 'datalake', 'bronze', 'currencies', '2022-04-29', 'RUB_EUR.json'),
              header="true",
              inferSchema="true",
              format="json",
              multiline="true"
              )

In [40]:
currency = 'RUB'
process_date = '2022-04-29'

In [36]:
silver_currency_df = bronze_currency_df \
                .withColumn('rates.RUB', func.col('rates.RUB').cast('double')) \
                .withColumnRenamed('rates.RUB', 'rate')

In [37]:
silver_currency_df = silver_currency_df \
            .withColumn('currency', func.lit(currency)) \
            .withColumn('rate_month', func.month('date')) \
            .withColumn('date', func.col('date').cast('date')) \
            .withColumnRenamed('base', 'currency_base') \
            .withColumnRenamed('date', 'rate_date') \
            .drop('success') \
            .drop('timestamp') \
            .drop('rates')

In [41]:
silver_currency_df = silver_currency_df \
            .where(silver_currency_df.rate_date == process_date) \
            .dropDuplicates() \
            .select('currency_base', 'currency', 'rate_date', 'rate_month', 'rate')

In [42]:
display(silver_currency_df)

DataFrame[currency_base: string, currency: string, rate_date: date, rate_month: int, rate: double]

In [43]:
silver_currency_df.count()



1

In [44]:
silver_currency_df.show()

+-------------+--------+----------+----------+---------+
|currency_base|currency| rate_date|rate_month|     rate|
+-------------+--------+----------+----------+---------+
|          EUR|     RUB|2022-04-29|         4|75.295908|
+-------------+--------+----------+----------+---------+



In [None]:
silver_currency_df.write \
        .partitionBy('rate_month') \
        .parquet(os.path.join('/', 'datalake', 'silver', 'currencies', 'RUB_EUR')
                 , mode='append')

In [45]:
currency_silver_df = spark.read.parquet(os.path.join('/', 'datalake', 'silver', 'currencies', 'RUB_EUR'))

In [46]:
process_date = '2022-04-29'

In [48]:
currency_silver_df = currency_silver_df \
                .where(currency_silver_df.rate_date == process_date) \
                .dropDuplicates() \
                .select('currency_base', 'currency', 'rate_date', 'rate_month', 'rate')

In [49]:
currency_silver_df.show()

+-------------+--------+----------+----------+---------+
|currency_base|currency| rate_date|rate_month|     rate|
+-------------+--------+----------+----------+---------+
|          EUR|     RUB|2022-04-29|         4|75.295908|
+-------------+--------+----------+----------+---------+



In [None]:
display(currency_silver_df)

In [None]:
currency_silver_df.write \
            .partitionBy('rate_month') \
            .parquet(os.path.join('/', 'datalake', 'gold', 'currencies', 'currency')
                     , mode='append')

In [None]:
display(currency_RUB_df)

In [None]:
currency_RUB_df.show(100)

In [None]:
currency_df = currency_RUB_df.where(currency_RUB_df.rate_date == '2022-04-27')

In [None]:
currency_df = currency_df.union(currency_USD_df.where(currency_USD_df.rate_date == '2022-04-27'))

In [None]:
currency_df = currency_df.union(currency_GBP_df.where(currency_GBP_df.rate_date == '2022-04-27'))

In [None]:
currency_df = currency_df.union(currency_PLN_df.where(currency_PLN_df.rate_date == '2022-04-27'))

In [None]:
currency_df = currency_df.union(currency_UAH_df.where(currency_UAH_df.rate_date == '2022-04-27'))

In [None]:
display(currency_df)

In [None]:
currency_df.show(100)

In [None]:
gp_url = "jdbc:postgresql://192.168.0.16:5433/data_mart"
gp_properties = {"user": "gpuser", "password": "secret"}

In [None]:
currency_df.write \
               .jdbc(gp_url, table="public.fact_currency_rate", properties=gp_properties, mode="overwrite")

In [None]:
currency_df.write.jdbc(gp_url
                   , table = 'public.fact_currency_rate'
                   , properties = gp_properties
                   , mode = 'overwrite')