In [4]:
!pip install -q holidays

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
# Global seaborn look for every figure in this notebook.
sns.set(
    style="ticks",
    color_codes=True,
)

In [2]:
# Input locations, relative to the notebook's working directory.
# Directory with the zone geometry files.
path_to_geoinfo = 'shape/'
# bz2-compressed CSV of green-taxi trips for June 2019 (per the file name).
path_to_rawdata = 'data/green_taxi/2019/green_tripdata_2019-06.csv.bz2'

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import pandas as pd
import numpy as np
import holidays
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from IPython.display import clear_output

In [6]:
from filter import filter_data
from load import load_data

In [7]:
# Memory given to the Spark driver.
MEMORY_SIZE = '5g'

# Reuse an already running session if there is one; otherwise start a new
# one named 'spark-taxi' with the driver memory configured above.
session_builder = SparkSession.builder.config('spark.driver.memory', MEMORY_SIZE)
spark = session_builder.appName('spark-taxi').getOrCreate()

In [8]:
# Read the raw trips and pass them straight through the project's filter.
# load_data / filter_data come from the local `load` and `filter` modules;
# what exactly they parse and drop is defined there, not in this notebook.
df = filter_data(load_data(spark, path_to_rawdata))

In [9]:
# Number of trips per pickup zone, collected to pandas (one row per zone).
count_by_zones = df.groupBy('PULocationID').count().toPandas()

# Ids of the ten busiest pickup zones, busiest first.
busiest_zones = count_by_zones.sort_values('count', ascending=False).head(10)
busiest_zones['PULocationID'].to_list()

[74, 75, 41, 82, 7, 166, 42, 95, 97, 129]

In [9]:
# Within each pickup zone, order trips chronologically so that lag() yields
# the previous pickup that happened in the same zone.
w = Window.partitionBy("PULocationID").orderBy("lpep_pickup_datetime")

# Timestamp of the previous pickup in the zone (null for the zone's first trip).
lpep_pickup_datetime_lag = lag(col("lpep_pickup_datetime"), 1).over(w)

# Gap between consecutive pickups in the zone, in seconds.
pause_secs = col("lpep_pickup_datetime").cast("long") - lpep_pickup_datetime_lag.cast("long")

# duration_wait is that gap in minutes. dropna() then removes every row that
# has a null in any column, which includes each zone's first trip (no lag).
df = (
    df
    .withColumn("duration_wait", pause_secs / 60)
    .withColumn('lpep_pickup_datetime_lag', lpep_pickup_datetime_lag)
    .dropna()
)

In [10]:
# Keys of the US holiday calendar for 2017-2019, as a plain list.
holiday_calendar = holidays.UnitedStates(years=[2017, 2018, 2019])
us_holidays = list(holiday_calendar.keys())

Actions: (ехать в район i, ждать заказ в данном районе)

- s >> ехать в район i >> (s', r)
- s >> ждать заказ >> p(s', r)

In [11]:
def prep_state(df, place, data_col, name):
    """Add a ``(location, hour, weekday)`` state struct built from one timestamp column.

    Args:
        df: Spark DataFrame that contains the ``place`` and ``data_col`` columns.
        place: Name of the zone-id column; it becomes the struct field ``location``.
        data_col: Name of the timestamp column the time features are derived from.
        name: Name of the struct column to add.

    Returns:
        The DataFrame with the struct column ``name`` and a 0/1 ``holiday``
        column added. The helper columns ``hour`` and ``weekday`` are dropped
        again before returning.

    Note:
        ``holiday`` is not part of the struct and is rewritten on every call,
        so after several calls it reflects only the last ``data_col`` used.
    """
    state_cols = ['hour', 'weekday']

    # Compare calendar dates, not timestamps: the holiday calendar is keyed by
    # date, so testing the raw timestamp would only match at exactly midnight.
    is_holiday = to_date(col(data_col)).isin(us_holidays)

    df = (
        df
        .withColumn("weekday", dayofweek(col(data_col)))
        .withColumn('holiday', when(is_holiday, 1).otherwise(0))
        .withColumn("hour", hour(col(data_col)))
    )

    # Field order in the struct is: location, hour, weekday.
    df = df.withColumn(name, struct([col(place).alias('location')] + state_cols))

    return df.drop(*state_cols)

In [12]:
# (zone column, timestamp column, resulting state column), applied in this order.
state_specs = [
    ('PULocationID', 'lpep_pickup_datetime', 'state_pickup'),
    ('DOLocationID', 'lpep_dropoff_datetime', 'state_dropoff'),
    ('PULocationID', 'lpep_pickup_datetime_lag', 'state_pickup_lag'),
]
for zone_col, time_col, state_name in state_specs:
    df = prep_state(df, zone_col, time_col, state_name)

# Dropping the source columns is left disabled: the selects that build df1 and
# df2 still read lpep_pickup_datetime and lpep_pickup_datetime_lag.
#drop_cols = ['lpep_pickup_datetime', 'lpep_dropoff_datetime', 'PULocationID', 'DOLocationID', 'lpep_pickup_datetime_lag']
#df = df.drop(*drop_cols)

In [13]:
# Mark df for caching (it is read by both selects below) and preview five rows.
# cache() returns the same DataFrame, so the calls can be chained.
df.cache().limit(5).toPandas()

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,tip_amount,tolls_amount,...,payment_type,trip_type,duration,speed,duration_wait,lpep_pickup_datetime_lag,holiday,state_pickup,state_dropoff,state_pickup_lag
0,2019-06-01 01:13:33,2019-06-01 01:25:55,1,243,241,1,2.57,11.0,0.0,0.0,...,2,1,12,12.85,37.016667,2019-06-01 00:36:32,0,"(243, 1, 7)","(241, 1, 7)","(243, 0, 7)"
1,2019-06-01 01:16:22,2019-06-01 01:32:17,1,243,248,1,5.09,17.5,0.0,0.0,...,2,2,15,20.360001,2.816667,2019-06-01 01:13:33,0,"(243, 1, 7)","(248, 1, 7)","(243, 1, 7)"
2,2019-06-01 03:21:28,2019-06-01 03:30:46,1,243,116,1,1.72,8.5,0.0,0.0,...,2,1,9,11.466667,125.1,2019-06-01 01:16:22,0,"(243, 3, 7)","(116, 3, 7)","(243, 1, 7)"
3,2019-06-01 04:13:37,2019-06-01 04:41:01,1,243,4,1,12.34,35.5,3.0,0.0,...,1,1,27,27.422223,52.15,2019-06-01 03:21:28,0,"(243, 4, 7)","(4, 4, 7)","(243, 3, 7)"
4,2019-06-01 05:07:01,2019-06-01 05:10:21,1,243,243,1,0.86,4.5,0.0,0.0,...,2,1,3,17.2,53.4,2019-06-01 04:13:37,0,"(243, 5, 7)","(243, 5, 7)","(243, 4, 7)"


### Остаться в этом районе

In [14]:
# Cost of driving the trip distance: 1.60934 converts miles to km; 0.1 and 0.67
# are hard-coded factors (presumably fuel use per km and fuel price - TODO confirm).
trip_cost = col('trip_distance') * 1.60934 * 0.1 * 0.67

# "Stay and wait" transitions: the state is the previous pickup in the same
# zone, the action is that same zone, and the next state is where the
# following trip from the zone ended.
df1 = df.select(
    col('lpep_pickup_datetime_lag').alias('datetime'),
    col('state_pickup_lag').alias('state'),
    col('state_pickup_lag').location.alias('action'),
    # Fare minus a charge of 5.0 per waited minute minus the driving cost.
    (col('fare_amount') - col('duration_wait') * 5.0 - trip_cost).alias('reward'),
    col('state_dropoff').alias('next_state'),
    # Trip duration plus waiting time (duration_wait is in minutes;
    # `duration` is presumably minutes too - verify in load/filter).
    (col('duration') + col('duration_wait')).alias('duration_trip'),
    (col('fare_amount') - trip_cost).alias('fare'),
)

### Ехать в другой район

In [15]:
# Cost of driving the trip distance: 1.60934 converts miles to km; 0.1 and 0.67
# are hard-coded factors (presumably fuel use per km and fuel price - TODO confirm).
trip_cost = col('trip_distance') * 1.60934 * 0.1 * 0.67

# "Drive to another zone" transitions: only trips that end in a different zone
# than they started; the action is the destination zone.
trips_between_zones = df.filter('state_pickup.location != state_dropoff.location')

df2 = trips_between_zones.select(
    col('lpep_pickup_datetime').alias('datetime'),
    col('state_pickup').alias('state'),
    col('state_dropoff').location.alias('action'),
    # NOTE(review): the reward is minus the full total_amount, while `fare`
    # below is only minus the driving cost - confirm this asymmetry is intended.
    (col('total_amount') * -1).alias('reward'),
    col('state_dropoff').alias('next_state'),
    col('duration').alias('duration_trip'),
    (trip_cost * -1).alias('fare'),
)

In [16]:
# Stack the "stay and wait" (df1) and "drive to another zone" (df2) transitions.
# union() matches columns by position; both selects produce the same seven
# aliases in the same order (datetime, state, action, reward, next_state,
# duration_trip, fare).
data = df1.union(df2)

In [17]:
# Persist the transition table as parquet, repartitioned into 8 partitions.
# NOTE(review): no save mode is set, so re-running this cell presumably fails
# once the directory exists - confirm, and add .mode(...) if re-runs are expected.
parquet_writer = data.repartition(8).write
parquet_writer.parquet('data/green_taxi_prepare___/')