In [1]:
import os

pyspark_submit_args= ' --master local[3] pyspark-shell'
os.environ["PYSPARK_SUBMIT_ARGS"]=pyspark_submit_args

# spark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.functions import col
import pyspark.sql.functions as fn
from pyspark.sql.catalog import Catalog
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import IntegerType, StringType, FloatType

In [2]:
# Start the SparkSession
conf = SparkConf()
conf.set('spark.executor.memory','16g')
conf.set('spark.driver.memory','8g')
conf.set('spark.driver.cores','4')
conf.set('spark.num.executors','10')
conf.set('spark.executor.cores','4')

spark = (SparkSession
    .builder
    .config(conf=conf)
    .appName("taxis trips data")
    .getOrCreate()
)

In [3]:
import geopandas as gpd
from shapely.geometry import Point



shp_path = os.path.join('./', 'nyc-taxi-zone.shp')
nyc_taxi_zones = gpd.GeoDataFrame.from_file(shp_path)
nyc_taxi_zones.to_crs(epsg=4326,inplace=True)
nyc_taxi_zones['point'] = nyc_taxi_zones.representative_point()



nyc_taxi_zones.head()

Unnamed: 0,borough,location_i,objectid,shape_area,shape_leng,zone,geometry,point
0,EWR,1.0,1.0,0.000782,0.116357,Newark Airport,"POLYGON ((-74.18445 40.69500, -74.18449 40.695...",POINT (-74.17678 40.68951)
1,Queens,2.0,2.0,0.004866,0.43347,Jamaica Bay,"MULTIPOLYGON (((-73.82338 40.63899, -73.82277 ...",POINT (-73.82614 40.62572)
2,Bronx,3.0,3.0,0.000314,0.084341,Allerton/Pelham Gardens,"POLYGON ((-73.84793 40.87134, -73.84725 40.870...",POINT (-73.84948 40.86587)
3,Manhattan,4.0,4.0,0.000112,0.043567,Alphabet City,"POLYGON ((-73.97177 40.72582, -73.97179 40.725...",POINT (-73.97702 40.72415)
4,Staten Island,5.0,5.0,0.000498,0.092146,Arden Heights,"POLYGON ((-74.17422 40.56257, -74.17349 40.562...",POINT (-74.18994 40.55034)


In [4]:
#import re

#def get_jwd(point):
#    fs = re.findall('\((.*)\)',str(point))
#    if fs:
#        lon,lat = fs[0].strip(' ').split(' ')
#        lon = float(lon)
#        lat = float(lat)
#        return [lon,lat]


#nyc_taxi_zones['lon'] = nyc_taxi_zones['point'].apply(lambda x:get_jwd(x)[0])
#nyc_taxi_zones['lat'] = nyc_taxi_zones['point'].apply(lambda x:get_jwd(x)[1])

#nyc_taxi_zones.head()

In [5]:
long_min = -74.10
long_max = -73.70
lat_min = 40.58
lat_max = 40.90


global mark

mark = 0


#def get_locationId(longitude,latitude):
#    global mark
#    mark += 1
#    if mark % 10000 == 0:
#        print(mark/100000)
#    if longitude >= long_min and longitude <= long_max and latitude >= lat_min and latitude <= lat_max:
#        pnts = Point(longitude,latitude)
#        for i,j in zip(nyc_taxi_zones['location_i'].values,nyc_taxi_zones['geometry'].values):
#            if pnts.within(j):
#                return i
#    return -1

def get_locationId(longitude,latitude):
    global mark
    mark += 1
    if mark % 10000 == 0:
        print(mark/100000)
    pnts = Point(longitude,latitude)
    for i,j in zip(nyc_taxi_zones['location_i'].values,nyc_taxi_zones['geometry'].values):
        if pnts.within(j):
            return int(i)
    return -1

udf_get_locationId = fn.udf(lambda x,y : get_locationId(x,y), IntegerType())

In [6]:
import pandas as pd
#加工15年数据


#df201507 = pd.read_csv('yellow_tripdata_2015-07.csv')

df201507 = spark.read\
                .format('csv')\
                .option("header","true")\
                .option("mode","FAILFAST")\
                .option("inferSchema", "true")\
                .option("sep",",")\
                .load('yellow_tripdata_2015-07.csv')

df201507.show()

+--------+--------------------+---------------------+---------------+-------------+------------------+------------------+----------+------------------+------------------+------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|  pickup_longitude|   pickup_latitude|RatecodeID|store_and_fwd_flag| dropoff_longitude|  dropoff_latitude|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|
+--------+--------------------+---------------------+---------------+-------------+------------------+------------------+----------+------------------+------------------+------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+
|       1| 2015-07-01 00:00:00|  2015-07-01 00:15:26|              1|          3.5|-73.99415588378906| 40.75112533569336|         1|    

In [7]:
import time
t1 = time.time()
#df201507 = df201507.withColumn('PULocationID', udf_get_locationId(col('pickup_longitude'), col('pickup_latitude')))
#df201507 = df201507.withColumn('DOLocationID', udf_get_locationId(col('dropoff_longitude'), col('dropoff_latitude')))

df201507 = df201507.withColumn('PULocationID', udf_get_locationId(df201507['pickup_longitude'], df201507['pickup_latitude']))
df201507 = df201507.withColumn('DOLocationID', udf_get_locationId(df201507['dropoff_longitude'], df201507['dropoff_latitude']))


t2 = time.time()
print(str(int(t2-t1)) + 'sec')
#df201507.head()
df201507.show()
#df201507.to_csv('yellow_tripdata_2015-07_add.csv')

0sec
+--------+--------------------+---------------------+---------------+-------------+------------------+------------------+----------+------------------+------------------+------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+------------+------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|  pickup_longitude|   pickup_latitude|RatecodeID|store_and_fwd_flag| dropoff_longitude|  dropoff_latitude|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|PULocationID|DOLocationID|
+--------+--------------------+---------------------+---------------+-------------+------------------+------------------+----------+------------------+------------------+------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+------------+------------+
|       1| 2015-07-01 00:00:00|  2015-07-01 00:15:26|

In [8]:
#nyc_taxi_zones = nyc_taxi_zones[['LocationID','zone','borough','lon','lat']]
nyc_taxi_zones = nyc_taxi_zones[['location_i','zone','borough']]
schema = StructType([
StructField("location_i", FloatType(), True),
StructField("zone", StringType(), True),
StructField("borough", StringType(), True)])
nyc_taxi_zones = spark.createDataFrame(nyc_taxi_zones, schema)

In [9]:
nyc_taxi_zones.show()

+----------+--------------------+-------------+
|location_i|                zone|      borough|
+----------+--------------------+-------------+
|       1.0|      Newark Airport|          EWR|
|       2.0|         Jamaica Bay|       Queens|
|       3.0|Allerton/Pelham G...|        Bronx|
|       4.0|       Alphabet City|    Manhattan|
|       5.0|       Arden Heights|Staten Island|
|       6.0|Arrochar/Fort Wad...|Staten Island|
|       7.0|             Astoria|       Queens|
|       8.0|        Astoria Park|       Queens|
|       9.0|          Auburndale|       Queens|
|      24.0|        Bloomingdale|    Manhattan|
|      10.0|        Baisley Park|       Queens|
|      11.0|          Bath Beach|     Brooklyn|
|      12.0|        Battery Park|    Manhattan|
|      13.0|   Battery Park City|    Manhattan|
|      18.0|        Bedford Park|        Bronx|
|      25.0|         Boerum Hill|     Brooklyn|
|      14.0|           Bay Ridge|     Brooklyn|
|      15.0|Bay Terrace/Fort ...|       

In [None]:
#df201507_pickup = pd.merge(left=df201507,right=nyc_taxi_zones,how='inner',left_on='PULocationID',right_on='location_i')
#df201507_pickup['pickup_zone'] = df201507_pickup['zone']
#df201507_pickup['pickup_borough'] = df201507_pickup['borough']
#df201507_pickup.head()
df201507=df201507.limit(1000)
join_rule = df201507.PULocationID == nyc_taxi_zones.location_i
df201507_pickup = df201507.join(nyc_taxi_zones,join_rule , "inner").show()


In [None]:
df201507_pickup.withColumnRenamed("zone", "pickup_zone")
df201507_pickup.withColumnRenamed("borough", "pickup_borough")
df201507_pickup.show()

In [None]:
pick_up = list(df201507_pickup.columns)
pick_up.remove('location_i')
pick_up.remove('zone')
pick_up.remove('borough')
#pick_up.remove('lon')
#pick_up.remove('lat')
print(pick_up)

In [None]:
df201507 = df201507_pickup[pick_up]

In [None]:
df201507_dropoff = pd.merge(left=df201507,right=nyc_taxi_zones,how='inner',left_on='DOLocationID',right_on='location_i')
#df201507_dropoff['dropoff_longitude'] = df201507_dropoff['lon']
#df201507_dropoff['dropoff_latitude'] = df201507_dropoff['lat']
df201507_dropoff['dropoff_zone'] = df201507_dropoff['zone']
df201507_dropoff['dropoff_borough'] = df201507_dropoff['borough']
df201507_dropoff.head()

In [None]:
dropoff = list(df201507_dropoff.columns)
dropoff.remove('location_i')
dropoff.remove('zone')
dropoff.remove('borough')
#dropoff.remove('lon')
#dropoff.remove('lat')
print(dropoff)

In [None]:
df201507 = df201507_dropoff[dropoff]
df201507.to_csv('yellow_tripdata_2015-07_traite.csv')

In [None]:
df201807 = pd.read_csv('yellow_tripdata_2018-07.csv')
df201807.head()

In [None]:
df201807_pickup = pd.merge(left=df201807,right=nyc_taxi_zones,how='inner',left_on='PULocationID',right_on='location_i')
#df201807_pickup['pickup_longitude'] = df201807_pickup['lon']
#df201807_pickup['pickup_latitude'] = df201807_pickup['lat']
df201807_pickup['pickup_zone'] = df201807_pickup['zone']
df201807_pickup['pickup_borough'] = df201807_pickup['borough']
df201807_pickup.head()

In [None]:
pick_up = list(df201807_pickup.columns)
pick_up.remove('location_i')
pick_up.remove('zone')
pick_up.remove('borough')
#pick_up.remove('lon')
#pick_up.remove('lat')
print(pick_up)

In [None]:
df201807 = df201807_pickup[pick_up]

In [None]:
df201807_dropoff = pd.merge(left=df201807,right=nyc_taxi_zones,how='inner',left_on='DOLocationID',right_on='location_i')
#df201807_dropoff['dropoff_longitude'] = df201807_dropoff['lon']
#df201807_dropoff['dropoff_latitude'] = df201807_dropoff['lat']
df201807_dropoff['dropoff_zone'] = df201807_dropoff['zone']
df201807_dropoff['dropoff_borough'] = df201807_dropoff['borough']
df201807_dropoff.head()

In [None]:
df201807 = df201807_dropoff[dropoff]
df201807.head()

In [None]:
df201807.to_csv('yellow_tripdata_2018-07_traite.csv')