In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, sum, to_date, dayofweek, when, count, make_date, regexp_replace, round, floor
from pyspark.sql.functions import year, month, dayofmonth

In [2]:
# Create the SparkSession used by every later cell, or reuse one that already
# exists in this kernel (getOrCreate).
spark = (
    SparkSession.builder
    .appName("MySparkApp")
    .getOrCreate()
)
# Bare last expression: the notebook renders the session summary.
spark

In [3]:
# Path prefix (without file extension) of the trip-data file; the cell that
# reads the data appends ".parquet" to it.
FILE_NAME_HEADER = "work/fhvhv_tripdata_2024_1278/fhvhv_tripdata_2024-01"

In [4]:
# Full path of the Parquet file: the configured prefix plus the extension.
path = FILE_NAME_HEADER + ".parquet"

# Load the raw trip records into a Spark DataFrame.
df = spark.read.parquet(path)


In [5]:
# Columns kept for the analysis; everything else in the raw frame is dropped.
columns = ['request_datetime', 'trip_miles', 'trip_time', 'base_passenger_fare']

# DataFrame.select accepts the list directly, equivalent to unpacking it.
reduced_df = df.select(columns)
# Uncomment to preview the projection:
# reduced_df.show(5)

In [6]:
# reduced_df.printSchema()

In [7]:
# Build the cleaned trip frame in one chain:
#   1. keep rows with a strictly positive base fare,
#   2. derive a `date` column from `request_datetime` and drop the original,
#   3. keep only rows whose date falls in January 2024.
in_january_2024 = (year(col('date')) == 2024) & (month(col('date')) == 1)

filtered_df = (
    reduced_df
    .where(col('base_passenger_fare') > 0)
    .withColumn('date', to_date(col('request_datetime'), 'yyyy-MM-dd'))
    .drop('request_datetime')
    .where(in_january_2024)
)

In [8]:
# Load the weather data.
# The first CSV line is used as the header. No schema is supplied, so the
# columns arrive as strings and are converted in later cells.
weather_path_header = "work/2024_weather/"
weather_df = spark.read.csv(f"{weather_path_header}01.csv", header=True)

In [9]:
# Collapse the separate year / month / day columns into a single `date` column.
weather_df = (
    weather_df
    .withColumn("date", make_date("year", "month", "day"))
    .drop("year", "month", "day")
)

precipitation_cols = ['precipitation1', 'precipitation2', 'precipitation3']

# Clean and convert each precipitation column in one pass: every 'T' in the
# text becomes '0', then the value is cast to float.
# NOTE(review): 'T' presumably marks trace precipitation; confirm against the
# data source.
for col_name in precipitation_cols:
    cleaned = regexp_replace(col(col_name), 'T', '0')
    weather_df = weather_df.withColumn(col_name, cleaned.cast('float'))

# Total precipitation: sum of the three columns, rounded to 2 decimals.
# A null in any component makes the total null.
precipitation_total = (
    col("precipitation1") + col("precipitation2") + col("precipitation3")
)
weather_df = (
    weather_df
    .withColumn("precipitation", round(precipitation_total, 2))
    .drop("precipitation1", "precipitation2", "precipitation3")
)

In [10]:
# Apply the Fahrenheit -> Celsius formula, (x - 32) * 5 / 9, to the daily
# `max` and `min` columns, rounded to 1 decimal.
#
# - The columns are cast to double explicitly, so the arithmetic does not
#   depend on Spark's implicit string-to-number coercion.
# - Parentheses replace backslash continuations: the original ended with a
#   dangling `\` that only parsed because a blank line followed it.
# - withColumn already names the result, so no extra .alias() is needed.
#
# NOTE: this cell overwrites `weather_df` from itself; running it twice on the
# same kernel applies the conversion twice. Re-run the weather cells from the
# load step if this cell has to be repeated.
weather_df = (
    weather_df
    .withColumn("max", round((col("max").cast("double") - 32) * 5 / 9, 1))
    .withColumn("min", round((col("min").cast("double") - 32) * 5 / 9, 1))
)

# Attach the weather of the day to each trip. The left join keeps every trip,
# leaving the weather columns null when no weather row matches the date.
df_with_weather = filtered_df.join(weather_df, on="date", how="left")

In [11]:
# Label each trip as "weekend" or "weekday".
# Spark's dayofweek numbers days 1 (Sunday) through 7 (Saturday), so 1 and 7
# are the weekend days.
is_weekend = col("day_of_week").isin(1, 7)

df_with_day = (
    df_with_weather
    .withColumn("date", to_date(col("date")))
    .withColumn("day_of_week", dayofweek(col("date")))
    .withColumn("day_type", when(is_weekend, "weekend").otherwise("weekday"))
)

In [12]:
# df_with_day = df_with_day.cache()

In [13]:
# 평일/주말 기준 집계
result1 = df_with_day.groupBy("day_type").agg(
    count("*").alias("count"),
    avg("trip_miles").alias("avg_trip_miles"),
    avg("base_passenger_fare").alias("avg_base_passenger_fare")
)

In [14]:
# Trigger the computation and print at most 5 rows of the day-type summary.
result1.show(n=5)

+--------+--------+-----------------+-----------------------+
|day_type|   count|   avg_trip_miles|avg_base_passenger_fare|
+--------+--------+-----------------+-----------------------+
| weekday|14022215|4.852969220483359|       24.4602157754583|
| weekend| 5633520|4.802974764268159|      22.74219542665929|
+--------+--------+-----------------+-----------------------+



In [15]:
# Bucket trips by the day's `max` temperature into 5-unit-wide groups, each
# labelled by its lower bound (floor(max / 5) * 5).
temp_group_expr = floor(col("max") / 5) * 5
df_with_day = df_with_day.withColumn("temp_group", temp_group_expr)

# Per temperature group: trip count, mean distance and mean base fare (both
# rounded to 2 decimals), ordered by group.
temp_group_aggregates = [
    count("*").alias("count"),
    round(avg("trip_miles"), 2).alias("avg_trip_miles"),
    round(avg("base_passenger_fare"), 2).alias("avg_base_passenger_fare"),
]
result2 = (
    df_with_day
    .groupBy("temp_group")
    .agg(*temp_group_aggregates)
    .orderBy("temp_group")
)

In [16]:
# Trigger the computation and print at most 5 rows of the temperature-group
# summary.
result2.show(n=5)

+----------+-------+--------------+-----------------------+
|temp_group|  count|avg_trip_miles|avg_base_passenger_fare|
+----------+-------+--------------+-----------------------+
|        -5|2739250|          4.78|                  23.07|
|         0|6849524|          4.77|                  23.73|
|         5|6783633|          4.98|                  24.49|
|        10|1877170|          4.69|                  23.95|
|        15|1406158|          4.82|                  24.37|
+----------+-------+--------------+-----------------------+

