# Imports

In [30]:
import os
import sys

# A conda environment is detected by the presence of <sys.prefix>/conda-meta.
is_conda = os.path.exists(os.path.join(sys.prefix, 'conda-meta'))

# Outside conda, findspark is initialised before pyspark is imported.
# NOTE(review): presumably the conda install already exposes pyspark on
# sys.path, which is why the step is skipped there - confirm.
if not is_conda:
    import findspark
    findspark.init()

from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
# The plotting API lives in the pyplot submodule; aliasing the top-level
# `matplotlib` package as `plt` would make calls such as plt.plot() fail.
import matplotlib.pyplot as plt
from datetime import datetime
from pyspark.sql.functions import col

# Lectura de datos

In [2]:
# Create (or reuse) a Spark session named "test" with a local[*] master.
spark = (
    SparkSession.builder
    .appName("test")
    .master("local[*]")
    .getOrCreate()
)

# Read the trip CSV: the first row is the header and column types are inferred.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('./tripdata_2017_01.csv')
)

In [3]:
# Print the column names and types of the Spark DataFrame.
df.printSchema()

# Collect the whole Spark DataFrame into a pandas DataFrame.
dfP = df.toPandas()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)



# Limpieza de datos

In [4]:
# Show the raw data followed by its transposed summary statistics.
display(dfP, dfP.describe().T)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,1,2017-01-09 11:13:28,2017-01-09 11:25:45,1,3.30,1,N,263,161,1,12.5,0.0,0.5,2.00,0.00,0.3,15.30
1,1,2017-01-09 11:32:27,2017-01-09 11:36:01,1,0.90,1,N,186,234,1,5.0,0.0,0.5,1.45,0.00,0.3,7.25
2,1,2017-01-09 11:38:20,2017-01-09 11:42:05,1,1.10,1,N,164,161,1,5.5,0.0,0.5,1.00,0.00,0.3,7.30
3,1,2017-01-09 11:52:13,2017-01-09 11:57:36,1,1.10,1,N,236,75,1,6.0,0.0,0.5,1.70,0.00,0.3,8.50
4,2,2017-01-01 00:00:00,2017-01-01 00:00:00,1,0.02,2,N,249,234,2,52.0,0.0,0.5,0.00,0.00,0.3,52.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
971005,2,2017-01-17 13:16:22,2017-01-17 13:35:14,1,3.43,1,N,264,264,2,14.5,0.0,0.5,0.00,0.00,0.3,15.30
971006,2,2017-01-17 13:16:23,2017-01-17 13:23:01,1,0.78,1,N,161,237,1,6.0,0.0,0.5,1.36,0.00,0.3,8.16
971007,2,2017-01-17 13:16:23,2017-01-17 13:23:51,1,0.95,1,N,237,229,2,6.5,0.0,0.5,0.00,0.00,0.3,7.30
971008,2,2017-01-17 13:16:23,2017-01-17 13:32:45,1,1.68,1,N,170,48,1,11.5,0.0,0.5,2.46,0.00,0.3,14.76


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
VendorID,971010.0,1.556951,0.496746,1.0,1.0,2.0,2.0,2.0
passenger_count,971010.0,1.685989,1.291705,0.0,1.0,1.0,2.0,9.0
trip_distance,971010.0,3.031285,3.785847,0.0,1.0,1.7,3.25,151.7
RatecodeID,971010.0,1.044729,0.461631,1.0,1.0,1.0,1.0,99.0
PULocationID,971010.0,160.378031,67.938198,1.0,113.0,161.0,231.0,265.0
DOLocationID,971010.0,158.556712,72.276821,1.0,100.0,161.0,233.0,265.0
payment_type,971010.0,1.371974,0.503326,1.0,1.0,1.0,2.0,4.0
fare_amount,971010.0,13.108188,546.736933,-120.0,6.5,9.0,14.0,538579.2
extra,971010.0,0.209993,0.261634,-1.0,0.0,0.0,0.5,55.54
mta_tax,971010.0,0.497214,0.040586,-0.5,0.5,0.5,0.5,0.5


In [5]:
# Register the raw trips as a temporary view so they can be queried with SQL.
df.createOrReplaceTempView('datosCarreras')

# Three suspicious subsets: very large fares, non-positive fares and
# non-positive trip distances. Each one is collected into pandas.
resultsGrandes = spark.sql(
    "SELECT * FROM datosCarreras where fare_amount >= 10000"
).toPandas()
resultsPequeños = spark.sql(
    "SELECT * FROM datosCarreras where fare_amount <= 0"
).toPandas()
resultsDistacia = spark.sql(
    "SELECT * FROM datosCarreras where trip_distance <= 0"
).toPandas()

# For every subset: a heading, the rows, then the transposed summary.
print("Grandes")
display(resultsGrandes, resultsGrandes.describe().T)

print("\n\nPequeños")
display(resultsPequeños, resultsPequeños.describe().T)

print("\n\nDistanica = 0")
display(resultsDistacia, resultsDistacia.describe().T)

Grandes


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,1,2017-01-01 02:15:10,2017-01-01 02:57:09,1,0.0,1,N,232,243,3,538579.2,0.0,0.5,0.0,0.0,0.3,538580.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
VendorID,1.0,1.0,,1.0,1.0,1.0,1.0,1.0
passenger_count,1.0,1.0,,1.0,1.0,1.0,1.0,1.0
trip_distance,1.0,0.0,,0.0,0.0,0.0,0.0,0.0
RatecodeID,1.0,1.0,,1.0,1.0,1.0,1.0,1.0
PULocationID,1.0,232.0,,232.0,232.0,232.0,232.0,232.0
DOLocationID,1.0,243.0,,243.0,243.0,243.0,243.0,243.0
payment_type,1.0,3.0,,3.0,3.0,3.0,3.0,3.0
fare_amount,1.0,538579.2,,538579.2,538579.2,538579.2,538579.2,538579.2
extra,1.0,0.0,,0.0,0.0,0.0,0.0,0.0
mta_tax,1.0,0.5,,0.5,0.5,0.5,0.5,0.5




Pequeños


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2017-01-01 07:02:33,2017-01-01 07:07:55,1,0.86,1,N,236,140,3,-5.0,0.0,-0.5,0.0,0.0,-0.3,-5.8
1,2,2017-01-01 07:15:54,2017-01-01 07:16:01,1,0.00,1,N,263,263,3,-2.5,0.0,-0.5,0.0,0.0,-0.3,-3.3
2,1,2017-01-01 07:17:44,2017-01-01 07:18:29,1,19.00,5,N,33,33,3,0.0,0.0,0.0,0.0,0.0,0.3,0.3
3,1,2017-01-01 07:21:28,2017-01-01 07:23:34,1,0.50,5,N,83,83,3,0.0,0.0,0.0,0.0,0.0,0.3,0.3
4,1,2017-01-01 07:24:20,2017-01-01 07:26:23,1,0.60,5,N,83,260,3,0.0,0.0,0.0,0.0,0.0,0.3,0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
767,2,2017-01-17 12:47:14,2017-01-17 12:48:38,1,0.02,2,N,246,246,4,-52.0,0.0,-0.5,0.0,0.0,-0.3,-52.8
768,2,2017-01-17 12:57:50,2017-01-17 12:58:20,1,0.02,1,N,170,170,3,-2.5,0.0,-0.5,0.0,0.0,-0.3,-3.3
769,1,2017-01-17 12:58:05,2017-01-17 12:58:42,1,0.00,5,N,234,234,2,0.0,0.0,0.0,0.0,0.0,0.3,0.3
770,1,2017-01-17 13:05:34,2017-01-17 13:09:30,1,0.50,5,N,265,265,2,0.0,0.0,0.0,0.0,0.0,0.3,0.3


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
VendorID,772.0,1.712435,0.452921,1.0,1.0,2.0,2.0,2.0
passenger_count,772.0,1.738342,1.253346,0.0,1.0,1.0,2.0,6.0
trip_distance,772.0,1.437565,4.040759,0.0,0.01,0.2,0.7,37.7
RatecodeID,772.0,2.419689,3.933299,1.0,1.0,1.0,5.0,99.0
PULocationID,772.0,160.417098,70.712051,1.0,113.0,161.0,231.0,265.0
DOLocationID,772.0,169.227979,72.928352,1.0,129.75,164.0,237.0,265.0
payment_type,772.0,3.011658,0.75792,1.0,3.0,3.0,4.0,4.0
fare_amount,772.0,-5.848834,13.347022,-120.0,-4.5,-2.5,0.0,0.0
extra,772.0,-0.165155,0.239413,-1.0,-0.5,0.0,0.0,0.0
mta_tax,772.0,-0.329016,0.238701,-0.5,-0.5,-0.5,0.0,0.5




Distanica = 0


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2017-01-01 00:00:15,2017-01-01 00:00:17,1,0.0,5,N,14,14,1,32.8,0.0,0.5,0.00,0.00,0.3,33.60
1,1,2017-01-01 00:00:34,2017-01-01 00:02:33,1,0.0,1,N,249,249,3,3.0,0.5,0.5,0.00,0.00,0.3,4.30
2,1,2017-01-01 00:00:50,2017-01-01 00:01:31,1,0.0,1,N,138,138,3,2.5,0.5,0.5,0.00,0.00,0.3,3.80
3,1,2017-01-01 00:01:39,2017-01-01 00:01:59,2,0.0,5,N,49,49,1,48.8,0.0,0.0,9.80,0.00,0.3,58.90
4,2,2017-01-01 06:53:23,2017-01-01 06:53:28,2,0.0,5,N,264,42,1,65.0,0.0,0.5,0.00,0.00,0.0,65.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,1,2017-01-17 13:10:55,2017-01-17 13:10:55,2,0.0,1,N,137,264,2,2.5,0.0,0.5,0.00,0.00,0.3,3.30
6266,1,2017-01-17 13:11:15,2017-01-17 13:11:15,1,0.0,1,N,211,264,2,2.5,0.0,0.5,0.00,0.00,0.3,3.30
6267,2,2017-01-17 13:11:29,2017-01-17 13:11:56,2,0.0,2,N,163,163,1,52.0,0.0,0.5,11.67,5.54,0.3,70.01
6268,1,2017-01-17 13:11:57,2017-01-17 13:11:57,1,0.0,1,N,264,264,2,23.0,0.0,0.5,0.00,0.00,0.3,23.80


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
VendorID,6270.0,1.44386,0.496878,1.0,1.0,1.0,2.0,2.0
passenger_count,6270.0,1.431738,1.04044,0.0,1.0,1.0,1.0,9.0
trip_distance,6270.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RatecodeID,6270.0,2.512121,4.581977,1.0,1.0,1.0,5.0,99.0
PULocationID,6270.0,172.944338,76.097562,1.0,132.0,164.0,246.0,265.0
DOLocationID,6270.0,175.530303,78.016479,1.0,132.0,170.0,261.0,265.0
payment_type,6270.0,1.741148,0.76041,1.0,1.0,2.0,2.0,4.0
fare_amount,6270.0,109.305552,6802.404689,-120.0,2.5,6.0,44.0,538579.2
extra,6270.0,0.136922,0.236858,-0.5,0.0,0.0,0.5,1.0
mta_tax,6270.0,0.386683,0.225308,-0.5,0.5,0.5,0.5,0.5


### Limpieza realizada
Para limpiar este dataframe se han aplicado los siguientes filtros:

- Limpieza de valores de pago grandes
- Limpieza de valores de pago menores o iguales a 0
- Limpieza de valores de distancia recorrida menores o iguales a 0
- Limpieza de valores de mta_tax distintos de 0.5
- Limpieza de valores de número de pasajeros menor que 1

In [41]:
# Keep only plausible trips from the raw `datosCarreras` view.
# The upper fare bound is strict on purpose: the exploratory query labels
# fares of 10000 or more as "Grandes", so exactly that set is excluded here.
datosLimpios = spark.sql("""
    SELECT *
    FROM datosCarreras
    WHERE fare_amount > 0
      AND fare_amount < 10000
      AND trip_distance > 0
      AND mta_tax = 0.5
      AND passenger_count > 0
""")

# TODO: tpep_pickup_datetime and tpep_dropoff_datetime are still plain strings
# (see the printSchema output). An earlier attempt passed those strings
# directly to the `datetime` constructor, which expects integer components,
# and referenced names that were never defined, so it was dropped.
# NOTE(review): converting with Spark's own timestamp type (for example
# pyspark.sql.functions.to_timestamp) looks like the intended approach.

# Collect the cleaned trips into pandas and expose them to SQL as a new view.
datosLimpiosP = datosLimpios.toPandas()
datosLimpios.createOrReplaceTempView('datosCarrerasLimpios')

In [10]:
# Show the cleaned data followed by its transposed summary statistics.
display(datosLimpiosP, datosLimpiosP.describe().T)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,1,2017-01-09 11:13:28,2017-01-09 11:25:45,1,3.30,1,N,263,161,1,12.5,0.0,0.5,2.00,0.00,0.3,15.30
1,1,2017-01-09 11:32:27,2017-01-09 11:36:01,1,0.90,1,N,186,234,1,5.0,0.0,0.5,1.45,0.00,0.3,7.25
2,1,2017-01-09 11:38:20,2017-01-09 11:42:05,1,1.10,1,N,164,161,1,5.5,0.0,0.5,1.00,0.00,0.3,7.30
3,1,2017-01-09 11:52:13,2017-01-09 11:57:36,1,1.10,1,N,236,75,1,6.0,0.0,0.5,1.70,0.00,0.3,8.50
4,2,2017-01-01 00:00:00,2017-01-01 00:00:00,1,0.02,2,N,249,234,2,52.0,0.0,0.5,0.00,0.00,0.3,52.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
961163,2,2017-01-17 13:16:22,2017-01-17 13:35:14,1,3.43,1,N,264,264,2,14.5,0.0,0.5,0.00,0.00,0.3,15.30
961164,2,2017-01-17 13:16:23,2017-01-17 13:23:01,1,0.78,1,N,161,237,1,6.0,0.0,0.5,1.36,0.00,0.3,8.16
961165,2,2017-01-17 13:16:23,2017-01-17 13:23:51,1,0.95,1,N,237,229,2,6.5,0.0,0.5,0.00,0.00,0.3,7.30
961166,2,2017-01-17 13:16:23,2017-01-17 13:32:45,1,1.68,1,N,170,48,1,11.5,0.0,0.5,2.46,0.00,0.3,14.76


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
VendorID,961168.0,1.557947,0.496631,1.0,1.0,2.0,2.0,2.0
passenger_count,961168.0,1.687614,1.293397,1.0,1.0,1.0,2.0,6.0
trip_distance,961168.0,3.016284,3.696891,0.01,1.0,1.7,3.25,139.17
RatecodeID,961168.0,1.026237,0.188439,1.0,1.0,1.0,1.0,6.0
PULocationID,961168.0,160.308426,67.864081,1.0,113.0,161.0,231.0,265.0
DOLocationID,961168.0,158.60903,71.978685,1.0,100.0,161.0,232.0,265.0
payment_type,961168.0,1.368582,0.498095,1.0,1.0,1.0,2.0,4.0
fare_amount,961168.0,12.332373,10.247179,0.2,6.5,9.0,14.0,350.0
extra,961168.0,0.211165,0.255497,0.0,0.0,0.0,0.5,4.5
mta_tax,961168.0,0.5,0.0,0.5,0.5,0.5,0.5,0.5


In [11]:
# Cleaned trips whose tip is 100 or more, collected into pandas.
resultsTip = spark.sql(
    "SELECT * FROM datosCarrerasLimpios where tip_amount >= 100"
).toPandas()

# Rows first, then the transposed summary statistics.
display(resultsTip, resultsTip.describe().T)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2017-01-01 10:45:39,2017-01-01 11:43:13,1,33.23,1,N,100,265,1,91.5,0.0,0.5,102.1,5.54,0.3,199.94
1,2,2017-01-01 02:46:12,2017-01-01 03:16:17,1,16.85,1,N,125,183,1,47.0,0.5,0.5,200.0,5.0,0.3,253.3
2,2,2017-01-01 03:07:59,2017-01-01 03:28:25,1,6.5,1,N,48,42,1,21.5,0.5,0.5,108.0,0.0,0.3,130.8
3,1,2017-01-01 04:43:47,2017-01-01 05:22:40,1,7.9,1,N,186,36,1,30.0,0.5,0.5,110.0,0.0,0.3,141.3
4,2,2017-01-14 19:23:23,2017-01-14 19:29:02,5,1.06,1,N,249,231,1,6.0,0.0,0.5,100.0,0.0,0.3,106.8
5,2,2017-01-15 02:36:50,2017-01-15 02:47:27,5,2.86,1,N,79,141,1,10.5,0.5,0.5,366.0,0.0,0.3,377.8
6,2,2017-01-15 03:38:21,2017-01-15 04:09:18,5,18.41,1,N,232,19,1,51.5,0.5,0.5,306.0,0.0,0.3,358.8
7,2,2017-01-15 18:20:51,2017-01-15 18:41:07,1,6.07,1,N,88,230,1,21.5,0.0,0.5,100.0,0.0,0.3,122.3
8,2,2017-01-01 17:44:03,2017-01-01 18:17:32,1,1.92,1,N,50,233,1,19.5,0.0,0.5,300.0,0.0,0.3,320.3
9,2,2017-01-16 19:03:39,2017-01-16 19:37:42,1,26.74,2,N,135,265,1,52.0,0.0,0.5,100.0,5.54,0.3,158.34


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
VendorID,12.0,1.916667,0.2886751,1.0,2.0,2.0,2.0,2.0
passenger_count,12.0,2.0,1.809068,1.0,1.0,1.0,2.0,5.0
trip_distance,12.0,10.735833,10.70147,0.51,2.625,6.64,17.24,33.23
RatecodeID,12.0,1.083333,0.2886751,1.0,1.0,1.0,1.0,2.0
PULocationID,12.0,134.166667,72.42279,48.0,85.75,112.5,197.0,249.0
DOLocationID,12.0,159.916667,92.07256,19.0,76.5,184.5,231.5,265.0
payment_type,12.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
fare_amount,12.0,31.833333,24.84345,6.0,17.25,23.25,48.125,91.5
extra,12.0,0.25,0.2611165,0.0,0.0,0.25,0.5,0.5
mta_tax,12.0,0.5,0.0,0.5,0.5,0.5,0.5,0.5


In [12]:
# Cleaned trips whose tolls exceed 100, collected into pandas.
resultsTolls = spark.sql(
    "SELECT * FROM datosCarrerasLimpios where tolls_amount > 100"
).toPandas()

# Rows first, then the transposed summary statistics.
display(resultsTolls, resultsTolls.describe().T)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,1,2017-01-14 22:23:50,2017-01-14 22:44:18,2,7.8,1,Y,164,196,4,23.5,0.5,0.5,0.0,905.54,0.3,930.34
1,1,2017-01-14 23:10:17,2017-01-14 23:27:57,1,4.9,1,Y,158,195,3,18.0,0.5,0.5,0.0,400.07,0.3,419.37
2,1,2017-01-16 15:57:40,2017-01-16 16:31:57,1,11.2,1,N,264,264,2,34.0,0.0,0.5,0.0,345.34,0.3,380.14


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
VendorID,3.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
passenger_count,3.0,1.333333,0.57735,1.0,1.0,1.0,1.5,2.0
trip_distance,3.0,7.966667,3.153305,4.9,6.35,7.8,9.5,11.2
RatecodeID,3.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
PULocationID,3.0,195.333333,59.542702,158.0,161.0,164.0,214.0,264.0
DOLocationID,3.0,218.333333,39.551654,195.0,195.5,196.0,230.0,264.0
payment_type,3.0,3.0,1.0,2.0,2.5,3.0,3.5,4.0
fare_amount,3.0,25.166667,8.129166,18.0,20.75,23.5,28.75,34.0
extra,3.0,0.333333,0.288675,0.0,0.25,0.5,0.5,0.5
mta_tax,3.0,0.5,0.0,0.5,0.5,0.5,0.5,0.5


In [20]:
# Cleaned trips whose pickup and drop-off location IDs are the same.
resultsTimos = spark.sql(
    "SELECT * FROM datosCarrerasLimpios where PULocationID == DOLocationID"
).toPandas()

In [21]:
# The 20 longest same-zone trips, followed by summary statistics for all of them.
display(
    resultsTimos.sort_values(by="trip_distance", ascending=False).head(20),
    resultsTimos.describe().T,
)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
18697,2,2017-01-14 21:46:25,2017-01-14 23:19:05,4,39.2,2,N,132,132,2,52.0,0.0,0.5,0.0,0.0,0.3,52.8
26296,2,2017-01-15 10:07:26,2017-01-15 10:41:35,6,39.12,2,N,132,132,2,52.0,0.0,0.5,0.0,0.0,0.3,52.8
42570,2,2017-01-01 22:28:10,2017-01-01 23:33:14,2,37.17,2,N,132,132,2,52.0,0.0,0.5,0.0,0.0,0.3,52.8
36314,2,2017-01-15 20:09:09,2017-01-15 21:10:43,1,36.02,1,N,264,264,1,96.0,0.5,0.5,0.0,0.0,0.3,97.3
50205,1,2017-01-16 22:34:33,2017-01-16 23:32:47,4,35.7,2,N,132,132,2,52.0,0.0,0.5,0.0,0.0,0.3,52.8
2981,2,2017-01-01 13:35:44,2017-01-01 15:00:50,1,35.51,2,N,132,132,2,52.0,0.0,0.5,0.0,0.0,0.3,52.8
47445,2,2017-01-16 18:53:04,2017-01-16 20:10:38,1,34.88,2,N,132,132,2,52.0,0.0,0.5,0.0,0.0,0.3,52.8
49263,2,2017-01-16 20:59:30,2017-01-16 22:03:37,2,34.63,2,N,132,132,2,52.0,0.0,0.5,0.0,5.54,0.3,58.34
7955,1,2017-01-01 02:31:37,2017-01-01 03:32:14,1,34.0,1,N,216,216,1,92.5,0.5,0.5,0.0,0.0,0.3,93.8
1803,1,2017-01-01 11:49:19,2017-01-01 13:31:51,4,31.7,1,N,264,264,2,98.5,0.0,0.5,0.0,0.0,0.3,99.3


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
VendorID,59750.0,1.492803,0.4999524,1.0,1.0,1.0,2.0,2.0
passenger_count,59750.0,1.620251,1.227284,1.0,1.0,1.0,2.0,6.0
trip_distance,59750.0,1.365137,2.402353,0.01,0.46,0.7,1.19,39.2
RatecodeID,59750.0,1.017138,0.184617,1.0,1.0,1.0,1.0,6.0
PULocationID,59750.0,185.464285,78.3636,1.0,132.0,230.0,263.0,265.0
DOLocationID,59750.0,185.464285,78.3636,1.0,132.0,230.0,263.0,265.0
payment_type,59750.0,1.472619,0.5404941,1.0,1.0,1.0,2.0,4.0
fare_amount,59750.0,7.391777,7.844194,0.2,4.0,5.0,7.0,260.0
extra,59750.0,0.209531,0.2550441,0.0,0.0,0.0,0.5,1.0
mta_tax,59750.0,0.5,0.0,0.5,0.5,0.5,0.5,0.5


Timo a turistas

# Fin

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()

IDEAS

propinas / hora

Timos

- Vueltas de más en la misma zona
- Valores raros en los peajes (tolls)
- Diferencias exageradas de distancia entre viajes con el mismo origen y destino

Velocidad media de los taxis en función de la hora.

Viajes en taxi más comunes

Registros financieros (propinas, personas, etc.)
