# Imports

In [1]:
import sys, os
# Detect a conda environment by checking for a `conda-meta` directory under
# the interpreter prefix.
conda_meta_dir = os.path.join(sys.prefix, 'conda-meta')
is_conda = os.path.exists(conda_meta_dir)

# Outside conda, findspark is initialised before pyspark gets imported
# (presumably so that the pyspark package can be located).
if not is_conda:
    import findspark
    findspark.init()

from pyspark.sql import SparkSession
# import pandas as pd
# import numpy as np
# import matplotlib as plt
from datetime import datetime
from pyspark.sql.functions import col, datediff, unix_timestamp

# Lectura de datos

In [2]:
# Start (or reuse) a Spark session named "taxis" with the local[*] master.
session_builder = SparkSession.builder.appName("taxis").master("local[*]")
spark = session_builder.getOrCreate()

# Load the trip records; the first CSV row is treated as the header and the
# column types are inferred from the data.
df = spark.read.csv(
    './tripdata_2017_01.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Print the inferred column types, then collect the full dataset into a
# pandas DataFrame so it can be inspected locally.
df.printSchema()
dfP = df.toPandas()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)



# Limpieza de datos

In [4]:
# Show the raw records followed by the transposed per-column summary statistics.
display(dfP, dfP.describe().T)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,1,2017-01-09 11:13:28,2017-01-09 11:25:45,1,3.30,1,N,263,161,1,12.5,0.0,0.5,2.00,0.00,0.3,15.30
1,1,2017-01-09 11:32:27,2017-01-09 11:36:01,1,0.90,1,N,186,234,1,5.0,0.0,0.5,1.45,0.00,0.3,7.25
2,1,2017-01-09 11:38:20,2017-01-09 11:42:05,1,1.10,1,N,164,161,1,5.5,0.0,0.5,1.00,0.00,0.3,7.30
3,1,2017-01-09 11:52:13,2017-01-09 11:57:36,1,1.10,1,N,236,75,1,6.0,0.0,0.5,1.70,0.00,0.3,8.50
4,2,2017-01-01 00:00:00,2017-01-01 00:00:00,1,0.02,2,N,249,234,2,52.0,0.0,0.5,0.00,0.00,0.3,52.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
971005,2,2017-01-17 13:16:22,2017-01-17 13:35:14,1,3.43,1,N,264,264,2,14.5,0.0,0.5,0.00,0.00,0.3,15.30
971006,2,2017-01-17 13:16:23,2017-01-17 13:23:01,1,0.78,1,N,161,237,1,6.0,0.0,0.5,1.36,0.00,0.3,8.16
971007,2,2017-01-17 13:16:23,2017-01-17 13:23:51,1,0.95,1,N,237,229,2,6.5,0.0,0.5,0.00,0.00,0.3,7.30
971008,2,2017-01-17 13:16:23,2017-01-17 13:32:45,1,1.68,1,N,170,48,1,11.5,0.0,0.5,2.46,0.00,0.3,14.76


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
VendorID,971010.0,1.556951,0.496746,1.0,1.0,2.0,2.0,2.0
passenger_count,971010.0,1.685989,1.291705,0.0,1.0,1.0,2.0,9.0
trip_distance,971010.0,3.031285,3.785847,0.0,1.0,1.7,3.25,151.7
RatecodeID,971010.0,1.044729,0.461631,1.0,1.0,1.0,1.0,99.0
PULocationID,971010.0,160.378031,67.938198,1.0,113.0,161.0,231.0,265.0
DOLocationID,971010.0,158.556712,72.276821,1.0,100.0,161.0,233.0,265.0
payment_type,971010.0,1.371974,0.503326,1.0,1.0,1.0,2.0,4.0
fare_amount,971010.0,13.108188,546.736933,-120.0,6.5,9.0,14.0,538579.2
extra,971010.0,0.209993,0.261634,-1.0,0.0,0.0,0.5,55.54
mta_tax,971010.0,0.497214,0.040586,-0.5,0.5,0.5,0.5,0.5


### Elementos extraños en el dataset

Lista de comportamientos extraños en los datos, y por tanto, inválidos a la hora de utilizar datos que deberían ser coherentes basándonos en la información de cada campo proporcionada por la [documentación](https://www1.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf)

* Existen carreras en las que la distancia es 0
* Existen propinas negativas
* "extra" con valores diferentes a 0 (ya que puede no haber extras), 0.5 y 1
* Existen viajes con un precio final negativo
* "MTA_tax" debe valer siempre 0.50. Valores diferentes son erróneos, y por tanto puede que el resto de la información también
    * De forma similar, "Improvement_surcharge" no debe valer menos de 0.30
* Carreras cuya fecha de fin sea igual o anterior a la fecha de inicio
* Existen tarifas con valores negativos. No tiene sentido ya que la tarifa va en función del tiempo y la distancia recorridas
* "Improvement_surcharge" es un valor en desuso, por lo que debería valer como mínimo 0, no -0.3

### Elementos extraños PERO posibles

* Número de pasajeros es 0. Dado que es un valor que introduce el propio conductor, muy probablemente le dé bastante igual introducir bien el valor.
* Un viaje empieza y acaba en la misma zona.



### Limpieza realizada

A partir de los comportamientos observados se ha procedido a eliminar las carreras que cumplen las siguientes condiciones:

- Campo "tip_amount" con valores menor a 0
- Campo "total_amount" con valores menor o igual a 0
- Campo "trip_distance" con valores menor o igual a 0
- Campo "fare_amount" con valores menor o igual a 0
- Campo "extra" con valores diferentes de 0, 0.5 y 1
- Campo "MTA_tax" con valor distinto de 0.5
- Campo "Improvement_surcharge" con valor menor que 0
- Campo "tpep_dropoff_datetime" es anterior o igual a "tpep_pickup_datetime"

In [5]:
# Convert the pickup/dropoff datetime strings into Unix timestamps and store
# their difference, which makes later filtering and calculations easier.
#
# NOTE (original author): this step is extremely slow; there should be a
# simpler way of doing it.
pickup_seconds = unix_timestamp(col("tpep_pickup_datetime").cast("timestamp"))
dropoff_seconds = unix_timestamp(col("tpep_dropoff_datetime").cast("timestamp"))

df = (
    df
    .withColumn("tpep_pickup_timestamp", pickup_seconds)
    .withColumn("tpep_dropoff_timestamp", dropoff_seconds)
    .withColumn("time_diff", col("tpep_dropoff_timestamp") - col("tpep_pickup_timestamp"))
)

# Register the enriched DataFrame so it can be queried with Spark SQL.
df.createOrReplaceTempView('datosCarreras')

In [16]:
# Keep only the trips that pass every sanity check: non-negative tip, positive
# total/distance/fare, `extra` restricted to 0, 0.5 or 1, `mta_tax` equal to
# 0.5, non-negative improvement surcharge and a positive duration.
#
# The result is cached because it is consumed several times (the count, the
# pandas export below and the later analyses built on `datosLimpios` and on the
# `datosCarrerasLimpios` view); without caching, each of those re-reads the CSV
# and re-applies the timestamp conversion and the filter.
datosLimpios = spark.sql("""
    SELECT * FROM datosCarreras WHERE
        tip_amount >= 0 AND
        total_amount > 0 AND
        trip_distance > 0 AND
        fare_amount > 0 AND
        (extra == 0 OR extra == 0.5 OR extra == 1) AND
        mta_tax == 0.5 AND
        improvement_surcharge >= 0 AND
        time_diff > 0
""").cache()

# count() is the first action on the cached DataFrame, so it also fills the cache.
print(datosLimpios.count())
datosLimpios.createOrReplaceTempView('datosCarrerasLimpios')
datosLimpiosP = datosLimpios.toPandas()

961150


In [7]:
# Show the cleaned records followed by the transposed per-column summary statistics.
display(datosLimpiosP, datosLimpiosP.describe().T)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,tpep_pickup_timestamp,tpep_dropoff_timestamp,time_diff
0,1,2017-01-09 11:13:28,2017-01-09 11:25:45,1,3.30,1,N,263,161,1,12.5,0.0,0.5,2.00,0.00,0.3,15.30,1483956808,1483957545,737
1,1,2017-01-09 11:32:27,2017-01-09 11:36:01,1,0.90,1,N,186,234,1,5.0,0.0,0.5,1.45,0.00,0.3,7.25,1483957947,1483958161,214
2,1,2017-01-09 11:38:20,2017-01-09 11:42:05,1,1.10,1,N,164,161,1,5.5,0.0,0.5,1.00,0.00,0.3,7.30,1483958300,1483958525,225
3,1,2017-01-09 11:52:13,2017-01-09 11:57:36,1,1.10,1,N,236,75,1,6.0,0.0,0.5,1.70,0.00,0.3,8.50,1483959133,1483959456,323
4,1,2017-01-01 00:00:02,2017-01-01 00:03:50,1,0.50,1,N,48,48,2,4.0,0.5,0.5,0.00,0.00,0.3,5.30,1483225202,1483225430,228
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
961145,2,2017-01-17 13:16:22,2017-01-17 13:35:14,1,3.43,1,N,264,264,2,14.5,0.0,0.5,0.00,0.00,0.3,15.30,1484655382,1484656514,1132
961146,2,2017-01-17 13:16:23,2017-01-17 13:23:01,1,0.78,1,N,161,237,1,6.0,0.0,0.5,1.36,0.00,0.3,8.16,1484655383,1484655781,398
961147,2,2017-01-17 13:16:23,2017-01-17 13:23:51,1,0.95,1,N,237,229,2,6.5,0.0,0.5,0.00,0.00,0.3,7.30,1484655383,1484655831,448
961148,2,2017-01-17 13:16:23,2017-01-17 13:32:45,1,1.68,1,N,170,48,1,11.5,0.0,0.5,2.46,0.00,0.3,14.76,1484655383,1484656365,982


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
VendorID,961150.0,1.557943,0.496632,1.0,1.0,2.0,2.0,2.0
passenger_count,961150.0,1.687589,1.293372,0.0,1.0,1.0,2.0,6.0
trip_distance,961150.0,3.016129,3.696633,0.01,1.0,1.7,3.25,139.17
RatecodeID,961150.0,1.026247,0.188597,1.0,1.0,1.0,1.0,6.0
PULocationID,961150.0,160.309,67.864522,1.0,113.0,161.0,231.0,265.0
DOLocationID,961150.0,158.6092,71.978933,1.0,100.0,161.0,232.0,265.0
payment_type,961150.0,1.368578,0.498094,1.0,1.0,1.0,2.0,4.0
fare_amount,961150.0,12.33191,10.246143,0.2,6.5,9.0,14.0,350.0
extra,961150.0,0.2111528,0.255421,0.0,0.0,0.0,0.5,1.0
mta_tax,961150.0,0.5,0.0,0.5,0.5,0.5,0.5,0.5


## Extracción de información

Ahora que ya hemos limpiado los datos y tenemos entradas coherentes, se puede proceder a extraer información de los mismos. 

La información que se va a extraer es:

* Velocidad media de los taxis en función de la hora.
* Viajes en taxi más comunes
* Registros financieros (propinas, personas, etc.)
    * Timos a turistas
    * Propinas en función de la hora
    * Identificar pasajeros borrachos
* Zonas con poca cobertura



### Velocidad media de los taxis

En este apartado se analiza la velocidad media de los taxis. Para ello, la distancia se convierte de millas a metros (1 milla = 1609.344 metros) y se divide entre la diferencia de tiempo en segundos (`time_diff`) calculada previamente, de modo que la velocidad resultante queda expresada en metros por segundo.

In [17]:
# Mean speed: trip distance scaled by 1609.344 (metres per mile) and divided
# by the trip duration stored in `time_diff`.
mean_speed_expr = col("trip_distance") * 1609.344 / col("time_diff")
dfMTS = datosLimpios.withColumn("mean_speed", mean_speed_expr)

In [18]:
# Collect to pandas, then show the 50 trips with the highest mean speed
# followed by the transposed summary statistics.
dfMTSP = dfMTS.toPandas()
fastest_trips = dfMTSP.sort_values(by=["mean_speed"], ascending=False).head(50)
display(fastest_trips, dfMTSP.describe().T)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,tpep_pickup_timestamp,tpep_dropoff_timestamp,time_diff,mean_speed
691398,1,2017-01-01 23:02:31,2017-01-01 23:02:32,1,20.5,2,N,79,79,1,...,0.0,0.5,13.2,0.0,0.3,66.0,1483308151,1483308152,1,32991.552
725483,1,2017-01-16 16:35:18,2017-01-16 16:35:19,1,19.6,1,N,63,63,2,...,1.0,0.5,0.0,0.0,0.3,4.3,1484580918,1484580919,1,31543.1424
571491,1,2017-01-15 19:39:26,2017-01-15 19:39:27,1,19.1,1,N,150,150,2,...,0.0,0.5,0.0,0.0,0.3,3.3,1484505566,1484505567,1,30738.4704
308536,1,2017-01-14 22:59:52,2017-01-14 22:59:53,1,17.4,1,N,164,164,1,...,0.5,0.5,0.0,0.0,0.3,3.8,1484431192,1484431193,1,28002.5856
825394,1,2017-01-16 23:48:05,2017-01-16 23:48:08,1,27.3,2,N,148,148,1,...,0.0,0.5,10.55,0.0,0.3,63.35,1484606885,1484606888,3,14645.0304
669527,1,2017-01-01 20:50:01,2017-01-01 20:50:05,1,26.9,2,N,142,142,2,...,0.0,0.5,0.0,0.0,0.3,52.8,1483300201,1483300205,4,10822.8384
87233,1,2017-01-01 17:01:09,2017-01-01 17:01:12,1,19.1,1,N,151,151,3,...,0.0,0.5,0.0,0.0,0.3,3.3,1483286469,1483286472,3,10246.1568
688967,1,2017-01-01 22:46:25,2017-01-01 22:46:27,1,12.0,1,N,132,132,3,...,0.0,0.5,0.0,0.0,0.3,3.3,1483307185,1483307187,2,9656.064
597025,1,2017-01-15 21:22:57,2017-01-15 21:23:00,1,17.4,1,N,132,132,3,...,0.5,0.5,0.0,0.0,0.3,3.8,1484511777,1484511780,3,9334.1952
525894,1,2017-01-15 16:57:12,2017-01-15 16:57:15,1,17.4,1,N,132,132,3,...,1.0,0.5,0.0,0.0,0.3,4.3,1484495832,1484495835,3,9334.1952


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
VendorID,961150.0,1.557943,0.496632,1.0,1.0,2.0,2.0,2.0
passenger_count,961150.0,1.687589,1.293372,0.0,1.0,1.0,2.0,6.0
trip_distance,961150.0,3.016129,3.696633,0.01,1.0,1.7,3.25,139.17
RatecodeID,961150.0,1.026247,0.188597,1.0,1.0,1.0,1.0,6.0
PULocationID,961150.0,160.309,67.864522,1.0,113.0,161.0,231.0,265.0
DOLocationID,961150.0,158.6092,71.978933,1.0,100.0,161.0,232.0,265.0
payment_type,961150.0,1.368578,0.498094,1.0,1.0,1.0,2.0,4.0
fare_amount,961150.0,12.33191,10.246143,0.2,6.5,9.0,14.0,350.0
extra,961150.0,0.2111528,0.255421,0.0,0.0,0.0,0.5,1.0
mta_tax,961150.0,0.5,0.0,0.5,0.5,0.5,0.5,0.5


Las velocidades medias obtenidas no son realistas y, como se puede ver en la tabla, esto se debe a que el time_diff es muy bajo, probablemente por un error en los tiempos almacenados por los taxistas. Por lo tanto, se volverá a realizar la consulta eliminando los viajes de menos de 3 minutos y se comprobarán de nuevo las velocidades medias.


In [19]:
# Keep only cleaned trips whose `time_diff` is at least 180 (the 3-minute
# threshold described above) and recompute the mean speed on that subset.
min_duration_query = "SELECT * FROM datosCarrerasLimpios where time_diff >= 180"
datosLimpiosSinVelocidades = spark.sql(min_duration_query)

filtered_speed_expr = col("trip_distance") * 1609.344 / col("time_diff")
dfMTS = datosLimpiosSinVelocidades.withColumn("mean_speed", filtered_speed_expr)

In [20]:
# Collect the filtered data to pandas, then show the 50 trips with the highest
# mean speed followed by the transposed summary statistics.
dfMTSP = dfMTS.toPandas()
fastest_filtered_trips = dfMTSP.sort_values(by=["mean_speed"], ascending=False).head(50)
display(fastest_filtered_trips, dfMTSP.describe().T)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,tpep_pickup_timestamp,tpep_dropoff_timestamp,time_diff,mean_speed
695619,2,2017-01-16 17:18:50,2017-01-16 17:23:34,1,24.58,2,N,132,50,1,...,0.0,0.5,5.0,5.54,0.3,63.34,1484583530,1484583814,284,139.28759
6000,1,2017-01-01 08:33:03,2017-01-01 08:38:35,1,26.8,1,N,132,132,3,...,0.0,0.5,0.0,0.0,0.3,5.8,1483255983,1483256315,332,129.910901
659906,2,2017-01-02 02:18:28,2017-01-02 02:26:54,1,27.47,1,N,76,148,2,...,0.5,0.5,0.0,0.0,0.3,125.3,1483319908,1483320414,506,87.368932
423928,1,2017-01-15 12:30:22,2017-01-15 12:33:25,1,8.0,1,N,37,37,3,...,0.0,0.5,0.0,0.0,0.3,4.8,1484479822,1484480005,183,70.353836
168363,1,2017-01-01 06:16:23,2017-01-01 06:19:54,1,8.9,1,N,196,196,3,...,0.0,0.5,0.0,0.0,0.3,4.8,1483247783,1483247994,211,67.882282
682256,2,2017-01-16 16:24:39,2017-01-16 16:32:56,2,20.61,2,N,132,130,1,...,0.0,0.5,14.58,5.54,0.3,72.92,1484580279,1484580776,497,66.737585
784668,2,2017-01-17 01:00:33,2017-01-17 01:03:36,1,6.08,1,N,237,116,2,...,0.5,0.5,0.0,0.0,0.3,59.3,1484611233,1484611416,183,53.468915
880463,1,2017-01-17 11:33:42,2017-01-17 11:37:51,1,8.2,1,N,165,165,2,...,0.0,0.5,0.0,15.0,0.3,20.3,1484649222,1484649471,249,52.998477
343612,2,2017-01-15 02:19:35,2017-01-15 03:05:08,1,73.8,5,N,170,260,2,...,0.0,0.5,0.0,5.54,0.3,156.34,1484443175,1484445908,2733,43.457588
736883,1,2017-01-16 19:53:17,2017-01-16 20:01:49,1,12.5,1,N,35,35,1,...,1.0,0.5,0.0,0.0,0.3,8.3,1484592797,1484593309,512,39.290625


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
VendorID,908566.0,1.557853,0.496642,1.0,1.0,2.0,2.0,2.0
passenger_count,908566.0,1.691012,1.294333,0.0,1.0,1.0,2.0,6.0
trip_distance,908566.0,3.157709,3.748656,0.01,1.1,1.8,3.4,139.17
RatecodeID,908566.0,1.026851,0.188084,1.0,1.0,1.0,1.0,5.0
PULocationID,908566.0,159.8529,67.616472,1.0,113.0,161.0,231.0,265.0
DOLocationID,908566.0,158.028,71.94497,1.0,100.0,161.0,231.0,265.0
payment_type,908566.0,1.360882,0.494393,1.0,1.0,1.0,2.0,4.0
fare_amount,908566.0,12.79407,10.279671,1.05,6.5,9.5,14.5,350.0
extra,908566.0,0.2111707,0.25544,0.0,0.0,0.0,0.5,1.0
mta_tax,908566.0,0.5,0.0,0.5,0.5,0.5,0.5,0.5


Como se puede observar, la mayoría de las velocidades entre las 50 más rápidas superan el límite de velocidad nacional para zonas de carretera (24.72222 m/s): solo las 5 últimas lo cumplen o, en otras palabras, las 45 primeras infringen la ley.

Por otro lado, se puede ver que los 8 primeros tienen velocidades mayores a 52 metros por segundo, lo que equivale a más de 187.2 km/h. Esto puede deberse a algún tipo de fallo en el tiempo registrado o a que realmente se circulase a velocidades demasiado altas.

Por último, cabe mencionar que los 6 primeros tienen velocidades mayores a 65 m/s, lo cual ya debe de ser debido a un fallo, accidental o adrede, por parte del conductor.

### Registros financieros

In [None]:
# Cleaned trips whose tip is 100 or more, with their summary statistics.
large_tips_query = "SELECT * FROM datosCarrerasLimpios where tip_amount >= 100"
resultsTip = spark.sql(large_tips_query).toPandas()
display(resultsTip, resultsTip.describe().T)

In [None]:
# Cleaned trips whose tolls exceed 100, with their summary statistics.
large_tolls_query = "SELECT * FROM datosCarrerasLimpios where tolls_amount > 100"
resultsTolls = spark.sql(large_tolls_query).toPandas()
display(resultsTolls, resultsTolls.describe().T)

In [None]:
# Cleaned trips that start and end in the same zone, collected to pandas.
same_zone_query = "SELECT * FROM datosCarrerasLimpios where PULocationID == DOLocationID"
resultsTimos = spark.sql(same_zone_query).toPandas()

In [None]:
# Show the 20 longest same-zone trips followed by the transposed summary statistics.
longest_same_zone = resultsTimos.sort_values(by=["trip_distance"], ascending=False).head(20)
display(longest_same_zone, resultsTimos.describe().T)

### Zonas de poca cobertura

### Velocidad media de los taxis

# Fin

In [None]:
# Left disabled on purpose: uncomment to stop the Spark session once the analysis is finished.
#spark.stop()

IDEAS

propinas / hora

Timos

- Vueltas de mas en misma zona
- Tolls valores raros
- Diferencias exageradas de distancias para pares de datos con mismo origen y destino

Velocidad media de los taxis en función de la hora.

Viajes en taxi más comunes

Registros financieros (propinas, personas, etc.)

Zonas sin cobertura a partir del parámetro Store_and_fwd_flag

Fare_amount frente a time_diff y trip distance, infracciones de ley
