In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum as spark_sum, col
from datetime import datetime, timedelta
import os
import subprocess

In [2]:
# Create (or reuse) the Hive-enabled Spark session used by the cells below
spark = (
    SparkSession.builder
    .appName("Daily Dump Generation")
    .enableHiveSupport()
    .getOrCreate()
)

In [3]:
# Aggregate the total units sold per (sales agent, product) from the
# BigData_DWH tables. The fact table is joined to the sales-agent and product
# dimensions on their id columns so the output carries names instead of ids.
# The GROUP BY also lists the two id columns, which are not selected, so two
# different ids that resolve to the same names yield separate output rows.
# NOTE(review): the query has no date predicate, so it sums over the whole
# fact table even though the result is later written under a per-day path —
# confirm this is intended.
sql_query = """
    SELECT 
        a.sales_person_name AS sales_agent_name, 
        p.product_name, 
        SUM(t.units) AS total_sold_units
    FROM 
        BigData_DWH.Transactions_Fact_table t
    JOIN 
        BigData_DWH.sales_agents_dimension a ON t.sales_person_id = a.sales_person_id
    JOIN 
        BigData_DWH.products_dimension p ON t.product_id = p.product_id
    GROUP BY 
        a.sales_person_name, 
        p.product_name, 
        t.sales_person_id, 
        t.product_id
"""

# Run the query through the Spark session; the result is a Spark DataFrame
result_df = spark.sql(sql_query)

# Remove output rows that are identical in all three selected columns.
# NOTE(review): because the ids are grouped on but not selected, rows coming
# from different ids that share the same names and the same total would be
# collapsed into one here — verify that this is the desired outcome.
distinct_df = result_df.distinct()

# Print a preview of the result (show() without arguments prints the first 20 rows)
distinct_df.show()


+------------------+---------------+----------------+
|  sales_agent_name|   product_name|total_sold_units|
+------------------+---------------+----------------+
|      Olivia Davis|        Monitor|            3744|
|Christopher Miller|         Hoodie|            1848|
|         john wick|     Headphones|             800|
|      David Wilson|     Smartphone|            5152|
|   Michael Johnson|          Skirt|            1700|
|   Michael Johnson|Washing Machine|            4940|
|      David Wilson|         Hoodie|            3432|
|   Daniel Martinez|          Heels|            3800|
|      Sophia Moore|         Blouse|            4536|
|      Olivia Davis|         Laptop|            2538|
|   Daniel Martinez|          Jeans|            3248|
|       Emma Taylor| Vacuum Cleaner|            4116|
|   Michael Johnson|          Jeans|            2688|
|         john wick|         Blouse|            3456|
|       Emily Brown|   Coffee Maker|            8640|
|        Jane Smith|Electric

In [4]:
# Yesterday's date according to the local clock, rendered as YYYY-MM-DD
previous_day = format(datetime.now() - timedelta(days=1), "%Y-%m-%d")

In [5]:
# HDFS target directory for this run: one sub-directory per dump date
output_hdfs_path = "/user/itversity/daily_dump/{}".format(previous_day)

In [6]:
# Persist the report to HDFS as CSV with a header row.
# coalesce(1) reduces the data to a single partition before writing, and
# "overwrite" mode replaces whatever already exists at the target path.
(
    distinct_df
    .coalesce(1)
    .write
    .mode("overwrite")
    .csv(output_hdfs_path, header=True)
)

In [7]:
local_output_path = "daily_dump"

# Make sure the local parent directory exists before copying. `hadoop fs -get`
# resolves its destination like `cp`: when the destination is an existing
# directory the source directory is placed inside it, otherwise the source is
# copied *as* the destination. Creating the directory up front means every run
# lands in <local_output_path>/<date>/ instead of the very first run producing
# a different layout from all later ones.
os.makedirs(local_output_path, exist_ok=True)

# Copy the dated dump directory from HDFS to the local file system.
# -f overwrites files that already exist at the local destination.
# NOTE(review): -f does not remove files already present in the local target;
# if part-file names differ between runs, re-running for the same date may
# leave stale files next to the new ones — confirm, and clear the target
# first if a clean snapshot is required.
copy_command = ['hadoop', 'fs', '-get', '-f', output_hdfs_path, local_output_path]

try:
    # check=True raises CalledProcessError on a non-zero exit status; both
    # streams are captured so they can be echoed into the notebook output.
    copy_result = subprocess.run(copy_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
    print(copy_result.stdout.decode())
    print(f"Directory '{output_hdfs_path}' copied successfully to '{local_output_path}'.")
except subprocess.CalledProcessError as e:
    # Report the failure using the command's stderr. The error is not
    # re-raised, so the cells after this one still run.
    print(f"Error copying directory: {e.stderr.decode()}")


Directory '/user/itversity/daily_dump/2024-07-05' copied successfully to 'daily_dump'.


In [8]:
# Shut down the Spark session as the final step, after the dump has been
# written to HDFS and copied to the local file system
spark.stop()