In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext

import os
import shutil

In [2]:
# Create (or reuse) the SparkSession for this notebook under the app name
# "RDD_Analysis", and keep a handle on its SparkContext for the RDD cells.
spark = (
    SparkSession.builder
    .appName("RDD_Analysis")
    .getOrCreate()
)
sc = spark.sparkContext

In [3]:
# Bare expression: shows the SparkContext as this cell's output.
sc

In [4]:
# Input Parquet file, relative to the notebook's working directory.
FILE_NAME = "work/fhvhv_tripdata_2024_1278/fhvhv_tripdata_2024-01.parquet"

# Sampling settings, named so they can be tuned in one place instead of being
# buried as literals in the sample() call.
SAMPLE_FRACTION = 0.001  # expected share of rows kept by the sample
SAMPLE_SEED = 42         # fixed seed for a repeatable sample

# Read the Parquet file as a DataFrame, then draw a seeded sample (without
# replacement) from its underlying RDD of Row objects.
df = spark.read.parquet(FILE_NAME)
rdd = df.rdd.sample(withReplacement=False, fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED)

In [5]:
# Project every sampled Row down to a plain dict holding just four fields:
# request time, trip miles, trip time and base passenger fare.
reduced_rdd = rdd.map(
    lambda trip: {
        "request_datetime": trip.request_datetime,
        "trip_miles": trip.trip_miles,
        "trip_time": trip.trip_time,
        "base_passenger_fare": trip.base_passenger_fare,
    }
)

In [6]:
# Keep only trips requested in 2024 with a positive base fare. Records whose
# request time or fare is missing are dropped up front: calling .year on None
# or comparing None with 0 would otherwise raise and abort the whole job.
# The result is cached because the following cells run further actions on it.
filtered_rdd = reduced_rdd.filter(
    lambda row: row['request_datetime'] is not None
    and row['base_passenger_fare'] is not None
    and row['request_datetime'].year == 2024
    and row['base_passenger_fare'] > 0
).cache()

# Single pass producing (sum of base fares, number of trips). fold() starts
# from the (0, 0) zero value, so an empty RDD yields (0, 0) instead of the
# ValueError that reduce() raises on empty input.
total_revenue_count = filtered_rdd.map(
    lambda row: (row['base_passenger_fare'], 1)
).fold((0, 0), lambda x, y: (x[0] + y[0], x[1] + y[1]))


In [7]:
# Split the (total fare, trip count) pair computed from filtered_rdd.
total_revenue, total_count = total_revenue_count

# Mean base fare per trip, rounded to two decimals. With no trips there is
# nothing to average, so report 0.0 rather than raising ZeroDivisionError.
average_revenue = round(total_revenue / total_count, 2) if total_count else 0.0

# Sum of trip_miles over the same filtered trips. fold() with a 0 start value
# returns 0 for an empty RDD, where reduce() would raise ValueError.
total_distance = filtered_rdd.map(lambda row: row['trip_miles']).fold(0, lambda x, y: x + y)

# Mean trip_miles per trip, rounded to two decimals (0.0 when there are no trips).
average_distance = round(total_distance / total_count, 2) if total_count else 0.0

In [8]:
# Per-day metrics: key each trip by its request date with a
# (trip count, fare) pair, then add the pairs element-wise per date.
daily_metrics_rdd = (
    filtered_rdd
    .map(
        lambda trip: (
            trip['request_datetime'].date(),     # key: calendar date
            (1, trip['base_passenger_fare']),    # value: (count, revenue)
        )
    )
    .reduceByKey(
        lambda left, right: (left[0] + right[0], left[1] + right[1])
    )
)

# Order the per-day records by date.
daily_metrics_sorted = daily_metrics_rdd.sortByKey()

# Two-line CSV summary: a header row followed by one row of overall totals.
summary_rdd = sc.parallelize([
    "total_revenue,total_count,average_distance",
    "{},{},{}".format(
        round(total_revenue_count[0], 2),
        total_revenue_count[1],
        round(average_distance, 2),
    ),
])

In [9]:
# Save the results.
# saveAsTextFile() fails when the target directory already exists, so remove
# output left behind by an earlier run; this keeps the cell re-runnable.
# NOTE(review): assumes these relative paths resolve to the local filesystem
# (local-mode Spark) — confirm before pointing the job at HDFS/S3.
for stale_dir in ("output/summary", "output/daily_metrics"):
    if os.path.isdir(stale_dir):
        shutil.rmtree(stale_dir)

summary_rdd.coalesce(1).saveAsTextFile("output/summary")

# Render each (date, (trip count, revenue)) record as a "date,count,revenue"
# CSV line, with revenue rounded to two decimals.
daily_csv_rdd = daily_metrics_sorted.map(
    lambda x: f"{x[0]},{x[1][0]},{round(x[1][1], 2)}"
)

# Write as a single file so the sorted order is preserved.
daily_csv_rdd.coalesce(1).saveAsTextFile("output/daily_metrics")

In [10]:
# Bare expression: shows the SparkContext again as the final cell's output.
sc