In [1]:
import os
import pandas as pd
import pyspark
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql import types

In [2]:
rootpath = os.path.dirname(os.path.abspath(""))
datapath = os.path.join(rootpath, 'data')
print(f"datapath: {datapath}")

datapath: /home/onur/WORK/DS/repos/DE/de_zoomcamp_nytaxi_spark/data


In [3]:
gcs_connector_fname = "gcs-connector-hadoop3-2.2.5.jar"
lib_path_local = os.path.join(rootpath, "lib", gcs_connector_fname)
lib_path_gs = f'gs://hadoop-lib/gcs/{gcs_connector_fname}'
import subprocess

subprocess.run(["gsutil", "cp", lib_path_gs, lib_path_local])

Copying gs://hadoop-lib/gcs/gcs-connector-hadoop3-2.2.5.jar...
/ [1 files][ 30.1 MiB/ 30.1 MiB]                                                
Operation completed over 1 objects/30.1 MiB.                                     


CompletedProcess(args=['gsutil', 'cp', 'gs://hadoop-lib/gcs/gcs-connector-hadoop3-2.2.5.jar', '/home/onur/WORK/DS/repos/DE/de_zoomcamp_nytaxi_spark/lib/gcs-connector-hadoop3-2.2.5.jar'], returncode=0)

In [10]:
credentials_location = '/home/onur/gcp-keys/ny-taxi-453114-ff980554600b.json'
conf = SparkConf() \
    .setMaster('local[*]') \
    .setAppName('gcs') \
    .set("spark.jars", lib_path_local) \
    .set("spark.hadoop.google.cloud.auth.service.account.enable", "true") \
    .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", credentials_location)

In [11]:
sc = SparkContext(conf=conf)

In [12]:
hadoop_conf = sc._jsc.hadoopConfiguration()

hadoop_conf.set("fs.AbstractFileSystem.gs.impl",  "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hadoop_conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_conf.set("fs.gs.auth.service.account.json.keyfile", credentials_location)
hadoop_conf.set("fs.gs.auth.service.account.enable", "true")

In [13]:
spark = SparkSession.builder \
    .config(conf=sc.getConf()) \
    .getOrCreate()

### Read and Combine Green&Yellow Data

In [14]:
df_green = spark.read.parquet('gs://ny-taxi-spark/pq/green/*/*')

                                                                                

In [15]:
df_green.columns

['VendorID',
 'lpep_pickup_datetime',
 'lpep_dropoff_datetime',
 'store_and_fwd_flag',
 'RatecodeID',
 'PULocationID',
 'DOLocationID',
 'passenger_count',
 'trip_distance',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'ehail_fee',
 'improvement_surcharge',
 'total_amount',
 'payment_type',
 'trip_type',
 'congestion_surcharge']

In [16]:
df_green.count()

                                                                                

2304517

In [17]:
df_green.show()

                                                                                

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|       2| 2020-01-23 14:10:15|  2020-01-23 14:38:16|                 N|         1|          74|         130|              1|        12.77|       36.0|  0.0|    0.

In [18]:
spark.sparkContext.stop()