In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = (
    SparkSession.builder.appName("Transaction Processor")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

In [3]:
df = spark.read.parquet("hdfs://hadoop:9000/data/transactions/partitioned")

In [4]:
# Lọc theo ngày cụ thể
df_filtered = df.filter(
    (df["year"] == 2022) &
    (df["month"] == 9) &
    (df["day"] == 1)
)

df_filtered.show()

+----------------+---------+------------+-------+----------+---------------+------------------+--------------+----------------+--------------+-------------+-------------------+------+------+---------+----+-----+---+
|       Timestamp|From Bank|From Account|To Bank|To Account|Amount Received|Receiving Currency|   Amount Paid|Payment Currency|Payment Format|Is Laundering|                 ts|txn_id|  Bank|  Account|year|month|day|
+----------------+---------+------------+-------+----------+---------------+------------------+--------------+----------------+--------------+-------------+-------------------+------+------+---------+----+-----+---+
|2022/09/01 00:00|        1|   8001364A0|      1| 8001364A0|           10.3|         US Dollar|          10.3|       US Dollar|  Reinvestment|            0|2022-09-01 00:00:00|     1|     1|8001364A0|2022|    9|  1|
|2022/09/01 00:00|        1|   800139380|      1| 800139380|          18.51|         US Dollar|         18.51|       US Dollar|  Reinves

In [5]:
from pyspark.sql.functions import to_date, date_format

df = df.withColumn("date_str", date_format("ts", "yyyy-MM-dd"))
dates = [row["date_str"] for row in df.select("date_str").distinct().collect()]

for date in dates:
    df_day = df.filter(df["date_str"] == date)
    
    output_path = f"hdfs://hadoop:9000/data/transactions/daily_csv/{date}.csv"
    
    df_day.coalesce(1).write.mode("overwrite") \
        .option("header", True) \
        .csv(output_path)
    
    print(f"✅ Exported CSV for {date} → {output_path}")

✅ Exported CSV for 2022-09-06 → hdfs://hadoop:9000/data/transactions/daily_csv/2022-09-06.csv
✅ Exported CSV for 2022-09-09 → hdfs://hadoop:9000/data/transactions/daily_csv/2022-09-09.csv
✅ Exported CSV for 2022-09-05 → hdfs://hadoop:9000/data/transactions/daily_csv/2022-09-05.csv
✅ Exported CSV for 2022-09-01 → hdfs://hadoop:9000/data/transactions/daily_csv/2022-09-01.csv
✅ Exported CSV for 2022-09-02 → hdfs://hadoop:9000/data/transactions/daily_csv/2022-09-02.csv
✅ Exported CSV for 2022-09-03 → hdfs://hadoop:9000/data/transactions/daily_csv/2022-09-03.csv
✅ Exported CSV for 2022-09-07 → hdfs://hadoop:9000/data/transactions/daily_csv/2022-09-07.csv
✅ Exported CSV for 2022-09-11 → hdfs://hadoop:9000/data/transactions/daily_csv/2022-09-11.csv
✅ Exported CSV for 2022-09-08 → hdfs://hadoop:9000/data/transactions/daily_csv/2022-09-08.csv
✅ Exported CSV for 2022-09-10 → hdfs://hadoop:9000/data/transactions/daily_csv/2022-09-10.csv
✅ Exported CSV for 2022-09-04 → hdfs://hadoop:9000/data/tran

In [6]:
spark.stop()