Puesta en marcha

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

# Create (or reuse) a Spark session that runs locally; 'local[*]' is the
# master URL passed to the builder.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

# Keep a handle to the session's underlying SparkContext.
sc = spark.sparkContext

# Last expression of the cell: display the session object.
spark

In [3]:
# Report the memory of the machine running the notebook.
from psutil import virtual_memory
# `.total` is divided by 1e9, i.e. the value is expressed in decimal gigabytes.
# NOTE(review): the message below says "available", but the value read here
# is `virtual_memory().total`, not `virtual_memory().available`.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

Your runtime has 8.0 gigabytes of available RAM



Además, podemos consultar el grado de paralelismo por defecto, que en modo `local[*]` coincide con el número de núcleos disponibles en la máquina.

In [4]:
# Display the default parallelism level reported by the session's SparkContext.
spark.sparkContext.defaultParallelism

16

In [5]:
import pyarrow.parquet as pq
# Load the trip records from the Parquet file. Parquet files carry their own
# schema, so the CSV-only reader options `header` / `inferSchema` are not
# needed here and have been dropped.
dataset_parquet = spark.read.parquet('dataset/yellow_tripdata_2017-01.parquet')
# TODO: review repartition() and coalesce()

# Expose the DataFrame to Spark SQL under the name `tabledataset`.
dataset_parquet.createOrReplaceTempView('tabledataset')

# Full contents of the view as a DataFrame, used by the inspection cells.
dfsql = spark.sql("SELECT * FROM tabledataset")

In [6]:
# Preview the first 5 rows of the DataFrame.
dfsql.show(5)


+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2017-01-01 00:32:05|  2017-01-01 00:37:48|              1|          1.2|         1|                 N|         140|         236|           2|        6.5|  0.5|    0.5|       0.

In [7]:
# Count the rows of the `tabledataset` view, then print the schema to check
# the data type of each column.
filas = spark.sql("SELECT COUNT(*) as Filas FROM tabledataset")
filas.show()
dfsql.printSchema()

+-------+
|  Filas|
+-------+
|9710820|
+-------+

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: integer (nullable = true)
 |-- airport_fee: integer (nullable = true)



In [24]:
# Check null values: build a single-row result that holds, for every column of
# the `tabledataset` view, a label column (the column name as a string literal,
# aliased to that same name) followed by ContCol<N>, the number of NULLs found
# in the N-th column (1-based, in schema order).
#
# The SELECT list is generated from the view's columns instead of being written
# out by hand, so it stays in sync with the schema.
# Assumes column names contain no quote or backtick characters.
null_check_columns = spark.table('tabledataset').columns

null_check_select_list = ",\n       ".join(
    "'{name}' AS `{name}`, "
    "SUM(CASE WHEN `{name}` IS NULL THEN 1 ELSE 0 END) AS ContCol{position}".format(
        name=column_name, position=position
    )
    for position, column_name in enumerate(null_check_columns, start=1)
)

null_counts = spark.sql(
    "SELECT {select_list}\nFROM tabledataset".format(select_list=null_check_select_list)
)

In [25]:
# Display the single-row result: each label column is followed by its
# ContCol<N> null count.
null_counts.show()


+--------+--------+--------------------+--------+---------------------+--------+---------------+--------+-------------+--------+----------+--------+------------------+--------+------------+--------+------------+--------+------------+---------+-----------+---------+-----+---------+-------+---------+----------+---------+------------+---------+---------------------+---------+------------+---------+--------------------+---------+-----------+---------+
|VendorID|ContCol1|tpep_pickup_datetime|ContCol2|tpep_dropoff_datetime|ContCol3|passenger_count|ContCol4|trip_distance|ContCol5|RatecodeID|ContCol6|store_and_fwd_flag|ContCol7|PULocationID|ContCol8|DOLocationID|ContCol9|payment_type|ContCol10|fare_amount|ContCol11|extra|ContCol12|mta_tax|ContCol13|tip_amount|ContCol14|tolls_amount|ContCol15|improvement_surcharge|ContCol16|total_amount|ContCol17|congestion_surcharge|ContCol18|airport_fee|ContCol19|
+--------+--------+--------------------+--------+---------------------+--------+---------------+

In [28]:
# According to the null check, every value of congestion_surcharge and
# airport_fee is NULL. A SELECT cannot remove columns from the view, so the
# query below lists only the columns to keep and leaves those two out.
dfsqlreduce = spark.sql("""SELECT VendorID,
                                  tpep_pickup_datetime,
                                  tpep_dropoff_datetime,
                                  passenger_count,
                                  trip_distance,
                                  RatecodeID,
                                  store_and_fwd_flag,
                                  PULocationID,
                                  DOLocationID,
                                  payment_type,
                                  fare_amount,
                                  extra,
                                  mta_tax,
                                  tip_amount,
                                  tolls_amount,
                                  improvement_surcharge,
                                  total_amount
                            FROM tabledataset    
                        """)

Velocidad media de los taxis en función de la hora

In [68]:
# Derive per-trip metrics from the pickup and drop-off timestamps:
#   difference_datetime - trip duration in seconds (drop-off minus pickup)
#   velocity            - trip_distance divided by the duration in hours
#   pickup_hour         - hour extracted from tpep_pickup_datetime
#   dropoff_hour        - hour extracted from tpep_dropoff_datetime
#
# Rows whose drop-off precedes the pickup are discarded. Zero-duration trips
# pass the `>=` filter, so the divisor is wrapped in NULLIF(..., 0): their
# velocity is NULL instead of a division by zero, which raises an error when
# Spark's ANSI mode is enabled.
#
# NOTE: `SELECT *` reads from the full `tabledataset` view, so every column of
# the view is carried into the result and this assignment replaces any earlier
# value of `dfsqlreduce`.
dfsqlreduce = spark.sql("""
    SELECT *,
           (unix_timestamp(tpep_dropoff_datetime) - unix_timestamp(tpep_pickup_datetime)) AS difference_datetime,
           trip_distance / NULLIF(
               ((unix_timestamp(tpep_dropoff_datetime) - unix_timestamp(tpep_pickup_datetime)) / 60) / 60,
               0
           ) AS velocity,
           HOUR(tpep_pickup_datetime) AS pickup_hour,
           HOUR(tpep_dropoff_datetime) AS dropoff_hour
    FROM tabledataset
    WHERE tpep_dropoff_datetime >= tpep_pickup_datetime
""")

In [69]:
# Preview 2 rows to check the derived columns.
dfsqlreduce.show(2)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+-------------------+-----------------+-----------+------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|difference_datetime|         velocity|pickup_hour|dropoff_hour|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+-------------------+-----------------+-----------+------------

In [71]:
# Register the DataFrame with the derived columns as the temp view
# `tabledataset2`, so the average velocity per hour can be computed with SQL.
dfsqlreduce.createOrReplaceTempView('tabledataset2')

In [72]:
# Average velocity per pickup hour, read from the `tabledataset2` view.
# avg() skips NULL velocities. ORDER BY makes the rows come back sorted by
# hour; a GROUP BY on its own gives no ordering guarantee.
dfsqlreduce_avg = spark.sql("""
    SELECT pickup_hour,
           avg(velocity) AS hourly_average
    FROM tabledataset2
    GROUP BY pickup_hour
    ORDER BY pickup_hour
""")