In [1]:
import findspark
import os

os.environ['SPARK_HOME'] = '/opt/homebrew/Cellar/apache-spark/4.0.0/libexec'
os.environ['JAVA_HOME'] = '/opt/homebrew/Cellar/openjdk@17/17.0.16/'

findspark.init()

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder \
    .appName("m2spark") \
    .master("local[*]") \
    .config('spark.driver.host', '127.0.0.1') \
    .config('spark.executor.memory', '16g') \
    .config('spark.executor.cores', '8') \
    .getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/30 17:29:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
spark

# 1. 데이터 로딩 및 샘플링

In [5]:
from pyspark.sql.functions import col, unix_timestamp, expr, rand, mean, round

FILE_NAME_HEADER = "shared/data/fhv_tripdata_2024_1278/fhvhv_tripdata_2024-"
months = ['01', '01', '07', '08']
fraction = 0.1
sampled_df_all = None

In [6]:
for month in months:
    file_path = f"{FILE_NAME_HEADER}{month}.parquet"
    df = spark.read.parquet(file_path)
    
    sampled_df = df.sample(False, fraction, seed=42)
    
    if sampled_df_all is None:
        sampled_df_all = sampled_df
    else:
        sampled_df_all = sampled_df_all.union(sampled_df)

In [7]:
# sampled_df_all.show()

In [8]:
# 2. 불필요한 열 제거

columns_to_drop = [
    'dispatching_base_num', 'originating_base_num', 'shared_request_flag',
    'shared_match_flag', 'access_a_ride_flag', 'wav_request_flag', 'wav_match_flag'
]
df_clean = sampled_df_all.drop(*columns_to_drop)

# 3. 시간 조건 필터링
df_clean = df_clean \
    .withColumn("pickup_ts", unix_timestamp("pickup_datetime")) \
    .withColumn("dropoff_ts", unix_timestamp("dropoff_datetime")) \
    .withColumn("scene_ts", unix_timestamp("on_scene_datetime")) \
    .withColumn("request_ts", unix_timestamp("request_datetime"))

df_clean = df_clean.filter((col("dropoff_ts") > col("pickup_ts")) &
                           (col("scene_ts") > col("request_ts")) &
                           (col("base_passenger_fare") > 0) &
                           (col("driver_pay") >= 0))

# scene_time 계산
df_clean = df_clean.withColumn("scene_time", expr("pickup_ts - scene_ts"))

# ts 열 제거
df_clean = df_clean.drop("pickup_ts", "dropoff_ts", "scene_ts", "request_ts")

# 분단위 시간 변환
# 반올림
df_clean = df_clean.withColumn("scene_time", round(expr("scene_time / 60")))
df_clean = df_clean.withColumn("trip_time", round(expr("trip_time / 60")))


In [9]:
# df_clean.show(5)

In [10]:
# 날짜 정보 추출
from pyspark.sql.functions import year, month, dayofmonth, hour
df_with_date = df_clean \
    .withColumn("year", year("request_datetime")) \
    .withColumn("month", month("request_datetime")) \
    .withColumn("day", dayofmonth("request_datetime")) \
    .withColumn("hour", hour("request_datetime"))

# 계절 정보 추가
df_with_date = df_with_date.withColumn(
    "season",
    expr("""
        CASE
            WHEN month = 1 OR month = 2 THEN 'Winter'
            WHEN month = 7 OR month = 8 THEN 'Summer'
        END
    """)
)

# 택시 라이센스 타입 추가
df_with_date = df_with_date.withColumn(
    "service_type",
    expr("""
        CASE
            WHEN hvfhs_license_num='HV0003' THEN 'Uber'
            WHEN hvfhs_license_num='HV0004' THEN 'Lyft'
        END
    """)
)

In [11]:
# df_with_date.show(5)

# 4. 지역 정보 로드 및 조인 (CSV -> Spark DataFrame)

In [12]:
taxi_zone = spark.read.option("header", True).csv(
    "/Users/admin/softeer_de_wiki/mission/W4/m2/shared/data/taxi_zone_lookup.csv"
)

from pyspark.sql.functions import col

# 1. PULocationID에 대한 join
# df_with_date와 taxi_zone에 각각 별칭(alias)을 부여합니다.
pu_zones = taxi_zone.alias("pu_zones")

df_with_pu = df_with_date.join(
    pu_zones,
    df_with_date.PULocationID == col("pu_zones.LocationID"),
    how="left"
).select(
    df_with_date["*"],
    col("pu_zones.Borough").alias("PULocation") # 별칭을 사용해 명확히 지정
)

# 2. DOLocationID에 대한 join
# taxi_zone에 다시 새로운 별칭을 부여합니다.
do_zones = taxi_zone.alias("do_zones")

df_with_do = df_with_pu.join(
    do_zones,
    df_with_pu.DOLocationID == col("do_zones.LocationID"),
    how="left"
).select(
    df_with_pu["*"],
    col("do_zones.Borough").alias("DOLocation") # 별칭을 사용해 명확히 지정
)

# drop locationid
df_zone = df_with_do.drop("PULocationID", "DOLocationID", "LocationID")

In [13]:
# df_zone.show(5)

# 5. 분석 또는 저장용 처리

In [14]:
# df_zone.cache()  # 후속 분석을 위해 캐싱

# 날씨 데이터 처리

In [15]:
# 날씨 데이터 로드
weather_path_header = "/Users/admin/softeer_de_wiki/mission/W4/m2/shared/data/2024_weather/"
weather_df = spark.read.option("header", True).csv(weather_path_header + "*.csv")

25/07/30 17:29:12 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: /Users/admin/softeer_de_wiki/mission/W4/m2/shared/data/2024_weather/*.csv.
java.io.FileNotFoundException: File /Users/admin/softeer_de_wiki/mission/W4/m2/shared/data/2024_weather/*.csv does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:917)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1238)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:907)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
	at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:56)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:381)
	at org.apache.spark.sql.catalyst.analysis.ResolveDataSource.org$apache$spark$sql$catalyst$analysis$ResolveDataSou

In [16]:
from pyspark.sql.functions import col, regexp_replace, sum as spark_sum, round, expr
from pyspark.sql.functions import year, month, dayofmonth

for col_name in ['precipitation1', 'precipitation2', 'precipitation3']:
    weather_df = weather_df.withColumn(col_name, regexp_replace(col(col_name), 'T', '0').cast("float"))

weather_df = weather_df.withColumn(
    "precipitation",
    col("precipitation1") + col("precipitation2") + col("precipitation3")
)

weather_df = weather_df \
    .withColumn("max", round((col("max") - 32) * 5 / 9, 1)) \
    .withColumn("min", round((col("min") - 32) * 5 / 9, 1))

weather_df = weather_df.drop("precipitation1", "precipitation2", "precipitation3")


df_final = df_zone.join(
    weather_df,
    on=["year", "month", "day"],
    how="left"
)

In [17]:
# df_zone.show(5)

In [18]:
df_final.select("year", "month", "hour", "season", "precipitation", "max_C", "min_C").show(5)

25/07/30 17:29:12 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
{"ts": "2025-07-30 17:29:12.392", "level": "ERROR", "logger": "DataFrameQueryContextLogger", "msg": "[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `max_C` cannot be resolved. Did you mean one of the following? [`max`, `bcf`, `day`, `min`, `hour`]. SQLSTATE: 42703", "context": {"file": "jdk.internal.reflect.GeneratedMethodAccessor25.invoke(Unknown Source)", "line": "", "fragment": "col", "errorClass": "UNRESOLVED_COLUMN.WITH_SUGGESTION"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o238.select.\n: org.apache.spark.sql.AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `max_C` cannot be resolved. Did you mean one of the following? [`max`, `bcf`, `day`, `min`, `hour`]. SQ

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `max_C` cannot be resolved. Did you mean one of the following? [`max`, `bcf`, `day`, `min`, `hour`]. SQLSTATE: 42703;
'Project [year#103, month#104, hour#106, season#107, precipitation#167, 'max_C, 'min_C]
+- Project [year#103, month#104, day#105, hvfhs_license_num#0, request_datetime#3, on_scene_datetime#4, pickup_datetime#5, dropoff_datetime#6, trip_miles#9, trip_time#102, base_passenger_fare#11, tolls#12, bcf#13, sales_tax#14, congestion_surcharge#15, airport_fee#16, tips#17, driver_pay#18, scene_time#101, hour#106, season#107, service_type#108, PULocation#131, DOLocation#137, max#168, ... 2 more fields]
   +- Join LeftOuter, (((cast(year#103 as bigint) = cast(year#155 as bigint)) AND (cast(month#104 as bigint) = cast(month#156 as bigint))) AND (cast(day#105 as bigint) = cast(day#157 as bigint)))
      :- Project [hvfhs_license_num#0, request_datetime#3, on_scene_datetime#4, pickup_datetime#5, dropoff_datetime#6, trip_miles#9, trip_time#102, base_passenger_fare#11, tolls#12, bcf#13, sales_tax#14, congestion_surcharge#15, airport_fee#16, tips#17, driver_pay#18, scene_time#101, year#103, month#104, day#105, hour#106, season#107, service_type#108, PULocation#131, DOLocation#137]
      :  +- Project [hvfhs_license_num#0, request_datetime#3, on_scene_datetime#4, pickup_datetime#5, dropoff_datetime#6, PULocationID#7, DOLocationID#8, trip_miles#9, trip_time#102, base_passenger_fare#11, tolls#12, bcf#13, sales_tax#14, congestion_surcharge#15, airport_fee#16, tips#17, driver_pay#18, scene_time#101, year#103, month#104, day#105, hour#106, season#107, service_type#108, PULocation#131, ... 1 more fields]
      :     +- Join LeftOuter, (cast(DOLocationID#8 as bigint) = cast(LocationID#132 as bigint))
      :        :- Project [hvfhs_license_num#0, request_datetime#3, on_scene_datetime#4, pickup_datetime#5, dropoff_datetime#6, PULocationID#7, DOLocationID#8, trip_miles#9, trip_time#102, base_passenger_fare#11, tolls#12, bcf#13, sales_tax#14, congestion_surcharge#15, airport_fee#16, tips#17, driver_pay#18, scene_time#101, year#103, month#104, day#105, hour#106, season#107, service_type#108, Borough#127 AS PULocation#131]
      :        :  +- Join LeftOuter, (cast(PULocationID#7 as bigint) = cast(LocationID#126 as bigint))
      :        :     :- Project [hvfhs_license_num#0, request_datetime#3, on_scene_datetime#4, pickup_datetime#5, dropoff_datetime#6, PULocationID#7, DOLocationID#8, trip_miles#9, trip_time#102, base_passenger_fare#11, tolls#12, bcf#13, sales_tax#14, congestion_surcharge#15, airport_fee#16, tips#17, driver_pay#18, scene_time#101, year#103, month#104, day#105, hour#106, season#107, CASE WHEN (hvfhs_license_num#0 = HV0003) THEN Uber WHEN (hvfhs_license_num#0 = HV0004) THEN Lyft END AS service_type#108]
      :        :     :  +- Project [hvfhs_license_num#0, request_datetime#3, on_scene_datetime#4, pickup_datetime#5, dropoff_datetime#6, PULocationID#7, DOLocationID#8, trip_miles#9, trip_time#102, base_passenger_fare#11, tolls#12, bcf#13, sales_tax#14, congestion_surcharge#15, airport_fee#16, tips#17, driver_pay#18, scene_time#101, year#103, month#104, day#105, hour#106, CASE WHEN ((month#104 = 1) OR (month#104 = 2)) THEN Winter WHEN ((month#104 = 7) OR (month#104 = 8)) THEN Summer END AS season#107]
      :        :     :     +- Project [hvfhs_license_num#0, request_datetime#3, on_scene_datetime#4, pickup_datetime#5, dropoff_datetime#6, PULocationID#7, DOLocationID#8, trip_miles#9, trip_time#102, base_passenger_fare#11, tolls#12, bcf#13, sales_tax#14, congestion_surcharge#15, airport_fee#16, tips#17, driver_pay#18, scene_time#101, year#103, month#104, day#105, hour(request_datetime#3, Some(Asia/Seoul)) AS hour#106]
      :        :     :        +- Project [hvfhs_license_num#0, request_datetime#3, on_scene_datetime#4, pickup_datetime#5, dropoff_datetime#6, PULocationID#7, DOLocationID#8, trip_miles#9, trip_time#102, base_passenger_fare#11, tolls#12, bcf#13, sales_tax#14, congestion_surcharge#15, airport_fee#16, tips#17, driver_pay#18, scene_time#101, year#103, month#104, dayofmonth(cast(request_datetime#3 as date)) AS day#105]
      :        :     :           +- Project [hvfhs_license_num#0, request_datetime#3, on_scene_datetime#4, pickup_datetime#5, dropoff_datetime#6, PULocationID#7, DOLocationID#8, trip_miles#9, trip_time#102, base_passenger_fare#11, tolls#12, bcf#13, sales_tax#14, congestion_surcharge#15, airport_fee#16, tips#17, driver_pay#18, scene_time#101, year#103, month(cast(request_datetime#3 as date)) AS month#104]
      :        :     :              +- Project [hvfhs_license_num#0, request_datetime#3, on_scene_datetime#4, pickup_datetime#5, dropoff_datetime#6, PULocationID#7, DOLocationID#8, trip_miles#9, trip_time#102, base_passenger_fare#11, tolls#12, bcf#13, sales_tax#14, congestion_surcharge#15, airport_fee#16, tips#17, driver_pay#18, scene_time#101, year(cast(request_datetime#3 as date)) AS year#103]
      :        :     :                 +- Project [hvfhs_license_num#0, request_datetime#3, on_scene_datetime#4, pickup_datetime#5, dropoff_datetime#6, PULocationID#7, DOLocationID#8, trip_miles#9, round((cast(trip_time#10L as double) / cast(60 as double)), 0) AS trip_time#102, base_passenger_fare#11, tolls#12, bcf#13, sales_tax#14, congestion_surcharge#15, airport_fee#16, tips#17, driver_pay#18, scene_time#101]
      :        :     :                    +- Project [hvfhs_license_num#0, request_datetime#3, on_scene_datetime#4, pickup_datetime#5, dropoff_datetime#6, PULocationID#7, DOLocationID#8, trip_miles#9, trip_time#10L, base_passenger_fare#11, tolls#12, bcf#13, sales_tax#14, congestion_surcharge#15, airport_fee#16, tips#17, driver_pay#18, round((cast(scene_time#100L as double) / cast(60 as double)), 0) AS scene_time#101]
      :        :     :                       +- Project [hvfhs_license_num#0, request_datetime#3, on_scene_datetime#4, pickup_datetime#5, dropoff_datetime#6, PULocationID#7, DOLocationID#8, trip_miles#9, trip_time#10L, base_passenger_fare#11, tolls#12, bcf#13, sales_tax#14, congestion_surcharge#15, airport_fee#16, tips#17, driver_pay#18, scene_time#100L]
      :        :     :                          +- Project [hvfhs_license_num#0, request_datetime#3, on_scene_datetime#4, pickup_datetime#5, dropoff_datetime#6, PULocationID#7, DOLocationID#8, trip_miles#9, trip_time#10L, base_passenger_fare#11, tolls#12, bcf#13, sales_tax#14, congestion_surcharge#15, airport_fee#16, tips#17, driver_pay#18, pickup_ts#96L, dropoff_ts#97L, scene_ts#98L, request_ts#99L, (pickup_ts#96L - scene_ts#98L) AS scene_time#100L]
      :        :     :                             +- Filter ((((dropoff_ts#97L > pickup_ts#96L) AND (scene_ts#98L > request_ts#99L)) AND (base_passenger_fare#11 > cast(0 as double))) AND (driver_pay#18 >= cast(0 as double)))
      :        :     :                                +- Project [hvfhs_license_num#0, request_datetime#3, on_scene_datetime#4, pickup_datetime#5, dropoff_datetime#6, PULocationID#7, DOLocationID#8, trip_miles#9, trip_time#10L, base_passenger_fare#11, tolls#12, bcf#13, sales_tax#14, congestion_surcharge#15, airport_fee#16, tips#17, driver_pay#18, pickup_ts#96L, dropoff_ts#97L, scene_ts#98L, unix_timestamp(request_datetime#3, yyyy-MM-dd HH:mm:ss, Some(Asia/Seoul), true) AS request_ts#99L]
      :        :     :                                   +- Project [hvfhs_license_num#0, request_datetime#3, on_scene_datetime#4, pickup_datetime#5, dropoff_datetime#6, PULocationID#7, DOLocationID#8, trip_miles#9, trip_time#10L, base_passenger_fare#11, tolls#12, bcf#13, sales_tax#14, congestion_surcharge#15, airport_fee#16, tips#17, driver_pay#18, pickup_ts#96L, dropoff_ts#97L, unix_timestamp(on_scene_datetime#4, yyyy-MM-dd HH:mm:ss, Some(Asia/Seoul), true) AS scene_ts#98L]
      :        :     :                                      +- Project [hvfhs_license_num#0, request_datetime#3, on_scene_datetime#4, pickup_datetime#5, dropoff_datetime#6, PULocationID#7, DOLocationID#8, trip_miles#9, trip_time#10L, base_passenger_fare#11, tolls#12, bcf#13, sales_tax#14, congestion_surcharge#15, airport_fee#16, tips#17, driver_pay#18, pickup_ts#96L, unix_timestamp(dropoff_datetime#6, yyyy-MM-dd HH:mm:ss, Some(Asia/Seoul), true) AS dropoff_ts#97L]
      :        :     :                                         +- Project [hvfhs_license_num#0, request_datetime#3, on_scene_datetime#4, pickup_datetime#5, dropoff_datetime#6, PULocationID#7, DOLocationID#8, trip_miles#9, trip_time#10L, base_passenger_fare#11, tolls#12, bcf#13, sales_tax#14, congestion_surcharge#15, airport_fee#16, tips#17, driver_pay#18, unix_timestamp(pickup_datetime#5, yyyy-MM-dd HH:mm:ss, Some(Asia/Seoul), true) AS pickup_ts#96L]
      :        :     :                                            +- Project [hvfhs_license_num#0, request_datetime#3, on_scene_datetime#4, pickup_datetime#5, dropoff_datetime#6, PULocationID#7, DOLocationID#8, trip_miles#9, trip_time#10L, base_passenger_fare#11, tolls#12, bcf#13, sales_tax#14, congestion_surcharge#15, airport_fee#16, tips#17, driver_pay#18]
      :        :     :                                               +- Union false, false
      :        :     :                                                  :- Sample 0.0, 0.1, false, 42
      :        :     :                                                  :  +- Relation [hvfhs_license_num#0,dispatching_base_num#1,originating_base_num#2,request_datetime#3,on_scene_datetime#4,pickup_datetime#5,dropoff_datetime#6,PULocationID#7,DOLocationID#8,trip_miles#9,trip_time#10L,base_passenger_fare#11,tolls#12,bcf#13,sales_tax#14,congestion_surcharge#15,airport_fee#16,tips#17,driver_pay#18,shared_request_flag#19,shared_match_flag#20,access_a_ride_flag#21,wav_request_flag#22,wav_match_flag#23] parquet
      :        :     :                                                  :- Sample 0.0, 0.1, false, 42
      :        :     :                                                  :  +- Relation [hvfhs_license_num#24,dispatching_base_num#25,originating_base_num#26,request_datetime#27,on_scene_datetime#28,pickup_datetime#29,dropoff_datetime#30,PULocationID#31,DOLocationID#32,trip_miles#33,trip_time#34L,base_passenger_fare#35,tolls#36,bcf#37,sales_tax#38,congestion_surcharge#39,airport_fee#40,tips#41,driver_pay#42,shared_request_flag#43,shared_match_flag#44,access_a_ride_flag#45,wav_request_flag#46,wav_match_flag#47] parquet
      :        :     :                                                  :- Sample 0.0, 0.1, false, 42
      :        :     :                                                  :  +- Relation [hvfhs_license_num#48,dispatching_base_num#49,originating_base_num#50,request_datetime#51,on_scene_datetime#52,pickup_datetime#53,dropoff_datetime#54,PULocationID#55,DOLocationID#56,trip_miles#57,trip_time#58L,base_passenger_fare#59,tolls#60,bcf#61,sales_tax#62,congestion_surcharge#63,airport_fee#64,tips#65,driver_pay#66,shared_request_flag#67,shared_match_flag#68,access_a_ride_flag#69,wav_request_flag#70,wav_match_flag#71] parquet
      :        :     :                                                  +- Sample 0.0, 0.1, false, 42
      :        :     :                                                     +- Relation [hvfhs_license_num#72,dispatching_base_num#73,originating_base_num#74,request_datetime#75,on_scene_datetime#76,pickup_datetime#77,dropoff_datetime#78,PULocationID#79,DOLocationID#80,trip_miles#81,trip_time#82L,base_passenger_fare#83,tolls#84,bcf#85,sales_tax#86,congestion_surcharge#87,airport_fee#88,tips#89,driver_pay#90,shared_request_flag#91,shared_match_flag#92,access_a_ride_flag#93,wav_request_flag#94,wav_match_flag#95] parquet
      :        :     +- SubqueryAlias pu_zones
      :        :        +- Relation [LocationID#126,Borough#127,Zone#128,service_zone#129] csv
      :        +- SubqueryAlias do_zones
      :           +- Relation [LocationID#132,Borough#133,Zone#134,service_zone#135] csv
      +- Project [year#155, month#156, day#157, max#168, min#169, precipitation#167]
         +- Project [year#155, month#156, day#157, max#168, round((cast(((cast(min#159 as bigint) - cast(32 as bigint)) * cast(5 as bigint)) as double) / cast(9 as double)), 1) AS min#169, precipitation1#164, precipitation2#165, precipitation3#166, precipitation#167]
            +- Project [year#155, month#156, day#157, round((cast(((cast(max#158 as bigint) - cast(32 as bigint)) * cast(5 as bigint)) as double) / cast(9 as double)), 1) AS max#168, min#159, precipitation1#164, precipitation2#165, precipitation3#166, precipitation#167]
               +- Project [year#155, month#156, day#157, max#158, min#159, precipitation1#164, precipitation2#165, precipitation3#166, ((precipitation1#164 + precipitation2#165) + precipitation3#166) AS precipitation#167]
                  +- Project [year#155, month#156, day#157, max#158, min#159, precipitation1#164, precipitation2#165, cast(regexp_replace(precipitation3#162, T, 0, 1) as float) AS precipitation3#166]
                     +- Project [year#155, month#156, day#157, max#158, min#159, precipitation1#164, cast(regexp_replace(precipitation2#161, T, 0, 1) as float) AS precipitation2#165, precipitation3#162]
                        +- Project [year#155, month#156, day#157, max#158, min#159, cast(regexp_replace(precipitation1#160, T, 0, 1) as float) AS precipitation1#164, precipitation2#161, precipitation3#162]
                           +- Relation [year#155,month#156,day#157,max#158,min#159,precipitation1#160,precipitation2#161,precipitation3#162] csv


: 

: 