# Data discovery: Load and query Yellow Taxi data
Download the dataset from [TLC Trip Record Data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page)

In [7]:
# Import lib
from pyspark.sql import SparkSession

## Create the Spark session

In [8]:
# Build a Spark session (or reuse one that already exists) running against a
# local master with a single worker thread ("local[1]").
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("spark-app-version-x")
    .getOrCreate()
)

In [9]:
# Input and output locations for this job. They are hard-coded here but could
# instead be supplied as parameters (e.g. Python command-line arguments).
# NOTE(review): "passanger" in the output path is misspelled; it is left as-is
# because changing it would change where the job writes its results.
input_path = "../Sources/yellow_tripdata_2023-01.parquet"
output_path = "reporting/etl_job_taxis_multi_passanger_trips/"

In [10]:
# Load the source Parquet file into a Spark DataFrame. Parquet files store
# their own schema, so no schema-inference option is needed here
# (`inferSchema` is a CSV/JSON reader option and has no effect on Parquet).
df = spark.read.parquet(input_path)
# Register the DataFrame as a temporary view so that it can be referenced by
# name ('tbl_raw_yellow_taxis') from Spark SQL statements.
df.createOrReplaceTempView('tbl_raw_yellow_taxis')

---

### Let's now query the data with Spark SQL and get the result back as a Spark DataFrame

In [11]:
# SQL text: select VendorID, tpep_pickup_datetime and passenger_count from the
# tbl_raw_yellow_taxis view, keeping rows where total_amount > 1 and
# passenger_count > 2.
multi_passenger_query = '''
                        SELECT VendorID, tpep_pickup_datetime, passenger_count 
                        FROM tbl_raw_yellow_taxis 
                        WHERE total_amount > 1 
                          AND passenger_count > 2
                        '''
# Run the statement through Spark SQL; the result is bound to df_output.
df_output = spark.sql(multi_passenger_query)

In [12]:
# Save the query result as Parquet under output_path, overwriting whatever is
# already there.
df_output.write.parquet(output_path, mode='overwrite')

                                                                                

### Use Spark SQL to join two datasets

In [13]:
# Final step: stop the Spark session that was created earlier in the notebook.
spark.stop()
