In [12]:
from pathlib import Path
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session in local mode; "local[*]" is the master URL
# passed to the builder below.
spark = (
    SparkSession.builder
    .appName("NYC Taxi Analysis")
    .master("local[*]")
    .getOrCreate()
)

data_path = Path("../data/raw")
all_files = sorted(data_path.glob("yellow_tripdata_2016-*.parquet"))

# Split the monthly files by purpose. The comparison on file names is
# lexicographic, so it relies on the zero-padded month in the file name:
# everything up to and including 2016-05 is training data, 2016-11 is the
# evaluation month and 2016-12 is the hold-out month.
train_files = [str(f) for f in all_files if f.name <= "yellow_tripdata_2016-05.parquet"]
eval_files = [str(f) for f in all_files if f.name == "yellow_tripdata_2016-11.parquet"]
holdout_files = [str(f) for f in all_files if f.name == "yellow_tripdata_2016-12.parquet"]

# Fail early with a readable message instead of letting Spark raise an opaque
# error when it is asked to read parquet with no input paths (e.g. the notebook
# was started from a different working directory or the data is missing).
if not train_files:
    raise FileNotFoundError(
        "No training files (yellow_tripdata_2016-*.parquet up to 2016-05) "
        f"found in {data_path.resolve()}"
    )

# Read the DataFrames. The evaluation and hold-out sets are optional and are
# None when their file is absent.
train = spark.read.parquet(*train_files)
# NOTE(review): `eval` shadows the Python builtin of the same name; the name is
# kept here because later cells may reference it.
eval = spark.read.parquet(*eval_files) if eval_files else None
holdout = spark.read.parquet(*holdout_files) if holdout_files else None

# Sanity check: show which files landed in each split and the training schema.
print("Train files:", train_files)
print("Eval files:", eval_files)
print("Holdout files:", holdout_files)
spark.conf.set("spark.sql.debug.maxToStringFields", 200)  # raise the limit on columns shown in output
train.printSchema()


Train files: ['../data/raw/yellow_tripdata_2016-01.parquet', '../data/raw/yellow_tripdata_2016-02.parquet', '../data/raw/yellow_tripdata_2016-03.parquet', '../data/raw/yellow_tripdata_2016-04.parquet', '../data/raw/yellow_tripdata_2016-05.parquet']
Eval files: ['../data/raw/yellow_tripdata_2016-11.parquet']
Holdout files: ['../data/raw/yellow_tripdata_2016-12.parquet']
root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)

In [14]:
# Bring a 10-row slice of the training DataFrame to the driver as a pandas
# frame, then display its first five rows as the cell output.
train_pd = (
    train
    .limit(10)
    .toPandas()
)
train_pd.head(n=5)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2016-01-01 00:12:22,2016-01-01 00:29:14,1,3.2,1,N,48,262,1,14.0,0.5,0.5,3.06,0.0,0.3,18.36,,
1,1,2016-01-01 00:41:31,2016-01-01 00:55:10,2,1.0,1,N,162,48,2,9.5,0.5,0.5,0.0,0.0,0.3,10.8,,
2,1,2016-01-01 00:53:37,2016-01-01 00:59:57,1,0.9,1,N,246,90,2,6.0,0.5,0.5,0.0,0.0,0.3,7.3,,
3,1,2016-01-01 00:13:28,2016-01-01 00:18:07,1,0.8,1,N,170,162,2,5.0,0.5,0.5,0.0,0.0,0.3,6.3,,
4,1,2016-01-01 00:33:04,2016-01-01 00:47:14,1,1.8,1,N,161,140,2,11.0,0.5,0.5,0.0,0.0,0.3,12.3,,
