In [1]:
# Install the `holidays` package, which is imported below and used to build the US holiday list.
# NOTE(review): the version is not pinned, so a fresh environment may resolve to a different release.
!pip install -q holidays

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
# Global seaborn theme ("ticks" style) for any figures drawn in this notebook.
sns.set(style="ticks", color_codes=True)

In [3]:
# bz2-compressed CSV of green-taxi trips (file name says May 2017); passed to load_data() below.
path_to_rawdata = 'data/green_taxi/green_tripdata_2017-05.csv.bz2'
# Presumably the directory holding taxi-zone shape files; it is not referenced in the cells shown here.
path_to_geoinfo = 'shape/'

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import pandas as pd
import numpy as np
import holidays
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from IPython.display import clear_output

In [5]:
from filter import filter_data
from load import load_data

In [6]:
# Value handed to Spark as `spark.driver.memory`.
MEMORY_SIZE = '5g'

# Build the 'spark-taxi' session (getOrCreate reuses an already running session if there is one).
spark = (
    SparkSession.builder
    .appName('spark-taxi')
    .config('spark.driver.memory', MEMORY_SIZE)
    .getOrCreate()
)

In [7]:
# Read the raw trip file with the project's load_data helper, then pass it through filter_data.
# NOTE(review): both helpers come from local modules (load / filter) that are not visible here;
# judging by the schema echoed below, the result already carries `duration` and `speed` — confirm.
df = load_data(spark, path_to_rawdata)
df = filter_data(df)
# Mark the frame for caching; being the cell's last expression, the returned DataFrame is echoed.
df.cache()

DataFrame[lpep_pickup_datetime: timestamp, lpep_dropoff_datetime: timestamp, RatecodeID: tinyint, PULocationID: int, DOLocationID: int, passenger_count: tinyint, trip_distance: float, fare_amount: float, tip_amount: float, tolls_amount: float, total_amount: float, payment_type: tinyint, trip_type: tinyint, duration: int, speed: double]

Погода:

In [8]:
def load_weather_series(path, name, city='New York'):
    """Read one weather CSV and return a two-column DataFrame for a single city.

    Args:
        path: Location of a comma-separated file with a header row, a
            ``datetime`` column and one column per city.
        name: Name given to the selected city column in the result
            (e.g. ``'temp'``).
        city: Header of the city column to keep. Defaults to ``'New York'``,
            the only city this notebook uses.

    Returns:
        DataFrame with ``datetime`` cast to timestamp and ``name`` cast to
        float; rows containing a null in either column are dropped.
    """
    return (
        spark.read
        .format('csv')
        .option('header', 'true')
        .option('delimiter', ',')
        .load(path)
        .select([col('datetime').cast(TimestampType()),
                 col(city).alias(name).cast(FloatType())])
        .dropna()
    )


# The three measurements share one file layout, so a single reader replaces three copy-pasted chains.
temp = load_weather_series('data/weather/temperature.csv', 'temp')
humidity = load_weather_series('data/weather/humidity.csv', 'humidity')
pressure = load_weather_series('data/weather/pressure.csv', 'pressure')


In [9]:
# Keep only the timestamps for which all three measurements are present (inner joins on `datetime`).
weather = (
    temp
    .join(humidity, on='datetime', how='inner')
    .join(pressure, on='datetime', how='inner')
)
weather.show(5)

+-------------------+---------+--------+--------+
|           datetime|     temp|humidity|pressure|
+-------------------+---------+--------+--------+
|2012-10-01 13:00:00|   288.22|    58.0|  1012.0|
|2012-10-01 14:00:00|288.24768|    57.0|  1012.0|
|2012-10-01 15:00:00|288.32693|    57.0|  1012.0|
|2012-10-01 16:00:00| 288.4062|    57.0|  1012.0|
|2012-10-01 17:00:00|288.48547|    57.0|  1012.0|
+-------------------+---------+--------+--------+
only showing top 5 rows



In [10]:
def norm_weather(column, df=None):
    """Return the weather table with ``column`` replaced by its z-score.

    Each value becomes ``(value - mean) / stddev``, with both statistics
    computed over the whole table.

    Args:
        column: Name of the numeric column to standardise.
        df: DataFrame to normalise. When omitted, the notebook-level
            ``weather`` table is used (looked up at call time), so existing
            calls such as ``norm_weather('temp')`` behave as before.

    Returns:
        A new DataFrame with the same columns; ``column`` holds the z-scores.

    Raises:
        ValueError: If the standard deviation is missing, NaN or zero
            (empty table, a single row, or a constant column). Dividing by
            such a value would silently turn the whole column into nulls.
    """
    if df is None:
        df = weather

    # One Spark job yields both statistics, instead of two select(...).toPandas() round trips.
    stats = df.agg(mean(column).alias('mu'), stddev(column).alias('sigma')).first()
    mu, sigma = stats['mu'], stats['sigma']

    # `sigma != sigma` is true only for NaN.
    if sigma is None or sigma != sigma or sigma == 0:
        raise ValueError(
            "cannot normalise column %r: standard deviation is %r" % (column, sigma)
        )

    return df.withColumn(column, (col(column) - mu) / sigma)

In [11]:
# Standardise the measurements one after another; every call reads the current `weather`
# table, so the result has to be assigned back before the next column is processed.
for _weather_col in ('temp', 'humidity', 'pressure'):
    weather = norm_weather(_weather_col)

In [12]:
# Preview the first rows of the weather table after the three columns were standardised.
weather.show(5)

+-------------------+-------------------+--------------------+-------------------+
|           datetime|               temp|            humidity|           pressure|
+-------------------+-------------------+--------------------+-------------------+
|2012-10-01 13:00:00|0.26846161172475763| -0.4356924169388144|-0.4967712982180166|
|2012-10-01 14:00:00|0.27116760939632845|-0.48602596911661466|-0.4967712982180166|
|2012-10-01 15:00:00|0.27891565344601904|-0.48602596911661466|-0.4967712982180166|
|2012-10-01 16:00:00|0.28666369749570964|-0.48602596911661466|-0.4967712982180166|
|2012-10-01 17:00:00| 0.2944147250047956|-0.48602596911661466|-0.4967712982180166|
+-------------------+-------------------+--------------------+-------------------+
only showing top 5 rows



In [13]:
#count_by_zones = df.groupBy('PULocationID').count().toPandas()

In [14]:
# Within each pickup zone, order trips by pickup time so each trip can see the previous pickup there.
w = Window.partitionBy("PULocationID").orderBy("lpep_pickup_datetime")

# Timestamp of the previous pickup in the same zone (null for the first trip of a zone).
lpep_pickup_datetime_lag = lag(col("lpep_pickup_datetime"), 1).over(w)

# Gap between the two consecutive pickups, in seconds.
pause_secs = col("lpep_pickup_datetime").cast("long") - lpep_pickup_datetime_lag.cast("long")

df = (
    df
    .withColumn("duration_wait", (pause_secs / 60))  # seconds -> minutes
    .withColumn('lpep_pickup_datetime_lag', lpep_pickup_datetime_lag)
    .dropna()  # drops every row with a null in any column, including the first trip per zone
    .cache()
)

In [15]:
# Keys of the `holidays.UnitedStates` mapping for 2017-2019, materialised as a list;
# prep_state() below uses this list to derive the `holiday` flag.
us_holidays = list(holidays.UnitedStates(years=[2017, 2018, 2019]).keys())

Действия (actions): ехать в район i или ждать заказ в данном районе.

- s → «ехать в район i» → (s', r)
- s → «ждать заказ» → p(s', r)

In [16]:
def prep_state(df, place, data_col, name):
    """Add a struct column describing the state at one place and moment.

    The struct has the fields ``location``, ``hour``, ``weekday``,
    ``holiday``, ``temp``, ``humidity`` and ``pressure`` (in that order).
    The weather fields are taken from the notebook-level ``weather`` table
    for the hour that contains the timestamp; the holiday flag is looked up
    in the notebook-level ``us_holidays`` list.

    Args:
        df: Trips DataFrame containing the ``place`` and ``data_col`` columns.
        place: Name of the column stored as the ``location`` field.
        data_col: Name of the timestamp column the time and weather features
            are derived from.
        name: Name of the struct column to add.

    Returns:
        ``df`` with the extra column ``name``; the temporary feature columns
        are removed again, so the function can be applied repeatedly.
    """
    state_cols = ['hour', 'weekday', 'holiday', 'temp', 'humidity', 'pressure']

    moment = col(data_col)

    # The holiday list holds calendar dates while `data_col` is a timestamp, so the
    # timestamp is reduced to its date first. Comparing the raw timestamp would only
    # match at exactly 00:00:00 and leave the flag at 0 for practically every trip.
    is_holiday = to_date(moment).isin(us_holidays)

    df = df.withColumn("weekday", dayofweek(moment)) \
            .withColumn('holiday', when(is_holiday, 1).otherwise(0)) \
            .withColumn("hour", hour(moment))

    # Left outer join: trips without a weather record for their hour are kept, with null weather fields.
    df = df.join(weather, on=[date_trunc('hour', col(data_col)) == weather.datetime], how='leftouter').drop('datetime')

    df = df.withColumn(name, struct([col(place).alias('location')] + state_cols))

    # Only the struct is kept; the flat helper columns would clash on the next call.
    df = df.drop(*state_cols)

    return df

In [17]:
# One state struct per (location column, timestamp column) pair:
#   state_pickup     - pickup zone at pickup time
#   state_dropoff    - drop-off zone at drop-off time
#   state_pickup_lag - pickup zone at the time of the previous pickup in that zone
for _place_col, _time_col, _state_name in (
    ('PULocationID', 'lpep_pickup_datetime', 'state_pickup'),
    ('DOLocationID', 'lpep_dropoff_datetime', 'state_dropoff'),
    ('PULocationID', 'lpep_pickup_datetime_lag', 'state_pickup_lag'),
):
    df = prep_state(df, _place_col, _time_col, _state_name)
#drop_cols = ['lpep_pickup_datetime', 'lpep_dropoff_datetime', 'PULocationID', 'DOLocationID', 'lpep_pickup_datetime_lag']
#df = df.drop(*drop_cols)

In [18]:
# Mark the frame with the three state structs for caching (it is a new DataFrame,
# distinct from the one cached earlier), then pull five rows into pandas for display.
df.cache()
df.limit(5).toPandas()

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,tip_amount,tolls_amount,total_amount,payment_type,trip_type,duration,speed,duration_wait,lpep_pickup_datetime_lag,state_pickup,state_dropoff,state_pickup_lag
0,2017-05-06 15:13:44,2017-05-06 15:23:12,1,243,235,1,1.88,9.0,0.0,0.0,9.8,2,1,9,12.533333,10.366667,2017-05-06 15:03:22,"(243, 15, 7, 0, 0.40337364558190403, -0.385358...","(235, 15, 7, 0, 0.40337364558190403, -0.385358...","(243, 15, 7, 0, 0.40337364558190403, -0.385358..."
1,2017-05-06 15:21:50,2017-05-06 15:38:28,1,243,143,1,6.53,20.5,4.26,0.0,25.559999,1,1,16,24.487501,8.1,2017-05-06 15:13:44,"(243, 15, 7, 0, 0.40337364558190403, -0.385358...","(143, 15, 7, 0, 0.40337364558190403, -0.385358...","(243, 15, 7, 0, 0.40337364558190403, -0.385358..."
2,2017-05-06 15:23:04,2017-05-06 15:41:13,1,243,236,1,6.28,20.0,4.16,0.0,24.959999,1,1,18,20.933334,1.233333,2017-05-06 15:21:50,"(243, 15, 7, 0, 0.40337364558190403, -0.385358...","(236, 15, 7, 0, 0.40337364558190403, -0.385358...","(243, 15, 7, 0, 0.40337364558190403, -0.385358..."
3,2017-05-06 15:24:03,2017-05-06 15:34:31,1,243,74,1,4.51,14.5,3.06,0.0,18.360001,1,1,10,27.060001,0.983333,2017-05-06 15:23:04,"(243, 15, 7, 0, 0.40337364558190403, -0.385358...","(74, 15, 7, 0, 0.40337364558190403, -0.3853588...","(243, 15, 7, 0, 0.40337364558190403, -0.385358..."
4,2017-05-06 15:38:28,2017-05-06 15:54:15,1,243,74,1,4.58,16.5,2.7,0.0,20.0,1,1,15,18.32,14.416667,2017-05-06 15:24:03,"(243, 15, 7, 0, 0.40337364558190403, -0.385358...","(74, 15, 7, 0, 0.40337364558190403, -0.3853588...","(243, 15, 7, 0, 0.40337364558190403, -0.385358..."


### Остаться в этом районе

In [19]:
# Transitions built from trips that start and end in the same zone.
# The transition starts at the previous pickup in that zone and ends at the drop-off:
#   reward        = fare minus a charge of 0.5 * 0.1 per minute of `duration_wait`
#   duration_trip = trip duration plus `duration_wait`
# NOTE(review): the meaning of the 0.5 and 0.1 factors is not stated in the notebook — confirm.
df1 = (
    df
    .filter(col('state_pickup.location') == col('state_dropoff.location'))
    .select(
        col('lpep_pickup_datetime_lag').alias('datetime'),
        col('state_pickup_lag').alias('state'),
        col('state_pickup_lag')['location'].alias('action'),
        (col('fare_amount') - col('duration_wait') * 0.5 * 0.1).alias('reward'),
        col('state_dropoff').alias('next_state'),
        (col('duration') + col('duration_wait')).alias('duration_trip'),
    )
)

### Ехать в другой район

In [20]:
# Transitions built from trips whose drop-off zone differs from the pickup zone.
# The action is the destination zone and the reward is negative: minus 0.1 of the fare.
# NOTE(review): the 0.1 factor is not explained in the notebook — confirm its meaning.
df2 = (
    df
    .filter(col('state_pickup.location') != col('state_dropoff.location'))
    .select(
        col('lpep_pickup_datetime').alias('datetime'),
        col('state_pickup').alias('state'),
        col('state_dropoff')['location'].alias('action'),
        (col('fare_amount') * -1. * 0.1).alias('reward'),
        col('state_dropoff').alias('next_state'),
        col('duration').alias('duration_trip'),
    )
)

In [21]:
# Stack both transition sets. union() matches columns by position, which works here because
# both selects emit datetime, state, action, reward, next_state, duration_trip in the same order.
data = df1.union(df2)

In [22]:
# Preview the first rows of the combined transition table.
data.show(5)

+-------------------+--------------------+------+------------------+--------------------+------------------+
|           datetime|               state|action|            reward|          next_state|     duration_trip|
+-------------------+--------------------+------+------------------+--------------------+------------------+
|2017-05-06 15:42:23|[243, 15, 7, 0, 0...|   243|4.3133333333333335|[243, 15, 7, 0, 0...| 6.733333333333333|
|2017-05-06 15:00:12|[255, 15, 7, 0, 0...|   255|5.4816666666666665|[255, 15, 7, 0, 0...| 5.366666666666666|
|2017-05-06 15:14:50|[255, 15, 7, 0, 0...|   255| 9.485833333333334|[255, 15, 7, 0, 0...|14.283333333333333|
|2017-05-06 15:28:04|[255, 15, 7, 0, 0...|   255| 4.413333333333333|[255, 15, 7, 0, 0...| 5.733333333333333|
|2017-05-06 15:30:57|[255, 15, 7, 0, 0...|   255|5.4158333333333335|[255, 15, 7, 0, 0...| 7.683333333333334|
+-------------------+--------------------+------+------------------+--------------------+------------------+
only showing top 5 

In [23]:
# Persist the transition table as parquet, split into 8 partitions.
# mode('overwrite') keeps the notebook re-runnable: with the default save mode the write
# raises as soon as the target directory exists, i.e. on every run after the first one.
data.repartition(8).write.mode('overwrite').parquet('data/green_taxi_prepare_weather/')