In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = (
    SparkSession.builder.appName("Transaction Processor")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

In [3]:
df = spark.read.parquet("hdfs://hadoop:9000/data/transactions/partitioned")

In [5]:
# Lọc theo ngày cụ thể
df_filtered = df.filter(
    (df["year"] == 2022) &
    (df["month"] == 9) &
    (df["day"] == 1)
)

df_filtered.show()

+----------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+-------------+-------------------+------+----+-----+---+
|       Timestamp|From Bank|From Account|To Bank|To Account|Amount Received|Receiving Currency|Amount Paid|Payment Currency|Payment Format|Is Laundering|                 ts|txn_id|year|month|day|
+----------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+-------------+-------------------+------+----+-----+---+
|2022/09/01 00:00|        1|   800057A10|     12| 800473570|        1433.68|         US Dollar|    1433.68|       US Dollar|        Cheque|            0|2022-09-01 00:00:00|     1|2022|    9|  1|
|2022/09/01 10:41|     3144|   801DD7F00|   3144| 801DD7F00|        1633.09|         US Dollar|    1633.09|       US Dollar|  Reinvestment|            0|2022-09-01 10:41:00|668210|2022|    9|  1|
|2022/09/01 00:00|  

In [5]:
from pyspark.sql.functions import to_date, date_format

df = df.withColumn("date_str", date_format("ts", "yyyy-MM-dd"))
dates = [row["date_str"] for row in df.select("date_str").distinct().collect()]

for date in dates:
    df_day = df.filter(df["date_str"] == date)
    
    output_path = f"hdfs://hadoop:9000/data/transactions/daily_csv/{date}.csv"
    
    df_day.coalesce(1).write.mode("overwrite") \
        .option("header", True) \
        .csv(output_path)
    
    print(f"✅ Exported CSV for {date} → {output_path}")

✅ Exported CSV for 2022-09-03 → hdfs://hadoop:9000/data/transactions/daily_csv/2022-09-03.csv
✅ Exported CSV for 2022-09-17 → hdfs://hadoop:9000/data/transactions/daily_csv/2022-09-17.csv
✅ Exported CSV for 2022-09-15 → hdfs://hadoop:9000/data/transactions/daily_csv/2022-09-15.csv
✅ Exported CSV for 2022-09-11 → hdfs://hadoop:9000/data/transactions/daily_csv/2022-09-11.csv
✅ Exported CSV for 2022-09-13 → hdfs://hadoop:9000/data/transactions/daily_csv/2022-09-13.csv
✅ Exported CSV for 2022-09-16 → hdfs://hadoop:9000/data/transactions/daily_csv/2022-09-16.csv
✅ Exported CSV for 2022-09-14 → hdfs://hadoop:9000/data/transactions/daily_csv/2022-09-14.csv
✅ Exported CSV for 2022-09-04 → hdfs://hadoop:9000/data/transactions/daily_csv/2022-09-04.csv
✅ Exported CSV for 2022-09-12 → hdfs://hadoop:9000/data/transactions/daily_csv/2022-09-12.csv
✅ Exported CSV for 2022-09-18 → hdfs://hadoop:9000/data/transactions/daily_csv/2022-09-18.csv
✅ Exported CSV for 2022-09-07 → hdfs://hadoop:9000/data/tran