# Query (for experiment)

In [1]:
from pyspark.sql import SparkSession, functions as F

# Spark session used by every cell below.  The options, in order: render
# DataFrames eagerly when they are a cell's last expression, cache Parquet
# metadata, pin the SQL session time zone to UTC, and set driver memory,
# executor memory and the maximum size of results collected to the driver.
session_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "5g",
    "spark.executor.memory": "3g",
    "spark.driver.maxResultSize": "3g",
}

session_builder = SparkSession.builder.appName("query")
for option_name, option_value in session_options.items():
    session_builder = session_builder.config(option_name, option_value)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = session_builder.getOrCreate()

import pandas as pd

23/08/19 15:51:18 WARN Utils: Your hostname, Didis-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.13.100.27 instead (on interface en0)
23/08/19 15:51:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/19 15:51:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/08/19 15:51:19 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/08/19 15:51:19 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/08/19 15:51:19 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [2]:
# Read the three curated Parquet inputs in one pass: the train and test trip
# tables and the PLUTO table that the later join uses as `landuse`.
train, test, landuse = [
    spark.read.parquet(parquet_path)
    for parquet_path in (
        "../data/curated/FHV_train.parquet",
        "../data/curated/FHV_test.parquet",
        "../data/curated/PLUTO.parquet",
    )
]

# Preview a single training record, one field per line.
train.show(n=1, truncate=100, vertical=True)

                                                                                

-RECORD 0--------------------------
 PULocationID | 148                
 DOLocationID | 249                
 trip_miles   | 1.981              
 trip_time    | 1000               
 driver_pay   | 11.8               
 PUhour       | 3                  
 wait_time    | 301                
 pay_per_min  | 0.5441967717140661 
 pay_per_mile | 5.956587582029278  
only showing top 1 row



In [3]:
def summarise_by_pickup(trips, landuse):
    """Average trip metrics per (PULocationID, PUhour) and attach land-use columns.

    Parameters
    ----------
    trips : pyspark.sql.DataFrame
        Trip-level rows; must contain ``PULocationID``, ``PUhour``,
        ``trip_miles``, ``pay_per_min``, ``wait_time`` and ``trip_time``.
    landuse : pyspark.sql.DataFrame
        Land-use rows keyed by ``LocationID``.

    Returns
    -------
    pyspark.sql.DataFrame
        One row per (PULocationID, PUhour) group that has a matching
        ``LocationID`` in ``landuse`` (inner join), holding the four averages
        followed by the land-use columns.  ``LocationID`` is dropped because
        it duplicates ``PULocationID`` after the join.
    """
    hourly_means = trips.groupBy("PULocationID", "PUhour").agg(
        F.mean("trip_miles").alias("avg_trip_mile"),
        F.mean("pay_per_min").alias("avg_ppmin"),
        F.mean("wait_time").alias("avg_wait"),
        F.mean("trip_time").alias("avg_trip_time"),
    )

    # Build the join condition from the aggregated frame's own column rather
    # than from the un-aggregated parent frame, so the condition refers to a
    # frame that actually takes part in the join.
    same_zone = hourly_means["PULocationID"] == landuse["LocationID"]

    return hourly_means.join(landuse, on=same_zone, how="inner").drop("LocationID")


# Train and test go through exactly the same pipeline.
agg_train = summarise_by_pickup(train, landuse)
agg_test = summarise_by_pickup(test, landuse)

agg_train.show(1, vertical=True, truncate=100)




-RECORD 0-------------------------------
 PULocationID     | 207                 
 PUhour           | 4                   
 avg_trip_mile    | 3.6907680112382093  
 avg_ppmin        | 0.7761222604687213  
 avg_wait         | 217.19064820389323  
 avg_trip_time    | 579.7672085089304   
 total_bldg       | 394205.0            
 prop_commertial  | 0.5248081581918037  
 prop_residential | 0.18443449474258317 
 prop_office      | 0.01433264418259535 
 prop_retail      | 0.0                 
only showing top 1 row



                                                                                

`Variance of average pay per minute (avg_ppmin), grouped by PUhour`

In [12]:
# Make the aggregated training frame queryable from Spark SQL as `query`.
agg_train.createOrReplaceTempView("query")

# For each pickup hour: variance of avg_ppmin over the rows of `query` that
# share that hour, listed from the most to the least variable hour.
variance_by_hour_query = """
SELECT 
    PUhour, 
    VARIANCE(avg_ppmin)
FROM
    query
GROUP BY
    PUhour
ORDER BY
    VARIANCE(avg_ppmin) DESC
"""
sql = spark.sql(variance_by_hour_query)

# Last expression of the cell, so the result is rendered as a table.
sql

                                                                                

PUhour,variance(avg_ppmin)
4,0.013698220835308618
5,0.011220198412336858
23,0.008964814713425926
22,0.008400018593361681
6,0.007872875085357476
0,0.007812362984964895
7,0.00691578594707718
3,0.006571439153772...
1,0.006370500743186889
2,0.006153918532038...


`Variance of average pay per minute (avg_ppmin), grouped by PULocationID`

In [13]:
# (Re-)register the aggregated training frame as the SQL view `query`.
agg_train.createOrReplaceTempView("query")

# For each pickup location: variance of avg_ppmin over the rows of `query`
# belonging to that location, listed from the most to the least variable.
variance_by_location_query = """
SELECT 
    PULocationID, 
    VARIANCE(avg_ppmin)
FROM
    query
GROUP BY
    PULocationID
ORDER BY
    VARIANCE(avg_ppmin) DESC
"""
sql = spark.sql(variance_by_location_query)

# Last expression of the cell, so the result is rendered as a table.
sql

                                                                                

PULocationID,variance(avg_ppmin)
253,0.0227067114180567
93,0.0185403973938183
199,0.0145571509018009
211,0.014427393761874
144,0.0138522390948444
114,0.0119599610404303
80,0.0109082091317764
125,0.0104417751169538
249,0.0104112558489464
138,0.0103783410884712


`Locations with the highest proportion of each land use`

In [23]:
# Pickup location with the largest mean `prop_residential` over its rows in
# the `query` view.  LIMIT 1 keeps a single row; the query itself defines no
# tie-break between locations with equal values.
top_residential_query = """
SELECT 
    PULocationID, 
    AVG(prop_residential)
FROM
    query
GROUP BY 
    PULocationID
ORDER BY
    AVG(prop_residential) DESC
LIMIT 1
"""
sql_plot = spark.sql(top_residential_query)
sql_plot

                                                                                

PULocationID,avg(prop_residential)
38,0.9775249591068744


In [25]:
# Pickup location with the largest mean `prop_office` over its rows in the
# `query` view.  LIMIT 1 keeps a single row; the query itself defines no
# tie-break between locations with equal values.
top_office_query = """
SELECT 
    PULocationID, 
    AVG(prop_office)
FROM
    query
GROUP BY 
    PULocationID
ORDER BY
    AVG(prop_office) DESC
LIMIT 1
"""
sql_plot = spark.sql(top_office_query)
sql_plot

                                                                                

PULocationID,avg(prop_office)
261,0.2631093078546747


In [26]:
# Pickup location with the largest mean `prop_retail` over its rows in the
# `query` view.  LIMIT 1 keeps a single row; the query itself defines no
# tie-break between locations with equal values.
top_retail_query = """
SELECT 
    PULocationID, 
    AVG(prop_retail)
FROM
    query
GROUP BY 
    PULocationID
ORDER BY
    AVG(prop_retail) DESC
LIMIT 1
"""
sql_plot = spark.sql(top_retail_query)
sql_plot

                                                                                

PULocationID,avg(prop_retail)
100,0.0542281676551941


In [27]:
# Pickup location with the largest mean commercial proportion over its rows in
# the `query` view.  The column really is spelled `prop_commertial` in the
# data, so the query must keep that spelling.  LIMIT 1 keeps a single row; the
# query itself defines no tie-break between locations with equal values.
top_commercial_query = """
SELECT 
    PULocationID, 
    AVG(prop_commertial)
FROM
    query
GROUP BY 
    PULocationID
ORDER BY
    AVG(prop_commertial) DESC
LIMIT 1
"""
sql_plot = spark.sql(top_commercial_query)
sql_plot

                                                                                

PULocationID,avg(prop_commertial)
110,1.0


23/08/19 19:11:17 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 906296 ms exceeds timeout 120000 ms
23/08/19 19:11:17 WARN SparkContext: Killing executors is not supported by current scheduler.
23/08/19 19:11:24 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:322)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:117)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:116)
	at org.apache.spark.storage.B