### Download data and upload to S3


In [None]:
import os
import requests
import boto3
from os.path import basename
from dotenv import load_dotenv

# Load BUCKET_NAME / S3_RAW_PREFIX (and any AWS settings) from a local .env file.
load_dotenv()

# Source files to copy into the raw area of the bucket, keyed by a short label.
urls = {
    "yellow": "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet",
    "green": "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet",
    "zones": "https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv"
}

bucket_name = os.getenv("BUCKET_NAME")
s3_prefix = os.getenv("S3_RAW_PREFIX")

# Fail early with a clear message instead of a TypeError deep inside the loop.
if not bucket_name:
    raise ValueError("BUCKET_NAME not set in environment")
if s3_prefix is None:
    raise ValueError("S3_RAW_PREFIX not set in environment")

s3 = boto3.client('s3')

# S3 keys always use "/" regardless of the local OS, so the key is built by
# hand rather than with os.path.join. Surrounding slashes on the prefix are
# dropped so "raw" and "raw/" produce the same key.
key_prefix = s3_prefix.strip("/")

for url in urls.values():
    filename = basename(url)
    s3_key = "/".join(part for part in (key_prefix, filename) if part)

    # Stream the download straight into S3 without buffering the whole file in
    # memory. The context manager releases the HTTP connection even when the
    # upload fails, and the timeout prevents hanging forever on a stalled server.
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()

        # Let urllib3 undo any Content-Encoding (gzip/deflate) so the bytes
        # stored in S3 are the actual file contents.
        response.raw.decode_content = True

        s3.upload_fileobj(response.raw, bucket_name, s3_key)

    print(f"Uploaded {filename} to s3://{bucket_name}/{s3_key}")

print("All url was successfully uploaded")

### Create Spark connection with S3

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import os
import findspark

# Package coordinates handed to spark-submit through --packages
# (S3A filesystem support, the AWS SDK bundle, and the PostgreSQL JDBC driver).
spark_packages = [
    "org.apache.hadoop:hadoop-aws:3.3.1",
    "com.amazonaws:aws-java-sdk-bundle:1.11.375",
    "org.postgresql:postgresql:42.2.27",
]

os.environ["PYSPARK_SUBMIT_ARGS"] = f"--packages {','.join(spark_packages)} pyspark-shell"

# Initialise findspark before any Spark session is created below.
# NOTE(review): pyspark is already imported at the top of this cell, i.e. before
# this call runs — confirm that import order is intended.
findspark.init()

def create_spark_session(app_name="S3 CSV Reader", log_level="INFO"):
    """Build (or reuse) a SparkSession configured to access S3 through s3a.

    Args:
        app_name: Application name passed to the session builder.
        log_level: Log level applied to the SparkContext once the session
            exists (e.g. "INFO", "WARN"). Pass None to leave it unchanged.

    Returns:
        The SparkSession returned by ``builder.getOrCreate()``.
    """
    # NOTE(review): hadoop-aws 3.3.1 appears to be built against a newer
    # aws-java-sdk-bundle than 1.11.375 — confirm the two versions are compatible.
    spark_conf = {
        "spark.jars.packages": "org.apache.hadoop:hadoop-aws:3.3.1,"
                              "com.amazonaws:aws-java-sdk-bundle:1.11.375",
        "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
        "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
        "spark.sql.adaptive.enabled": "true",
    }

    builder = SparkSession.builder.appName(app_name)
    for key, value in spark_conf.items():
        builder = builder.config(key, value)

    spark = builder.getOrCreate()

    # The log level is not a Spark configuration key; it has to be set on the
    # SparkContext after the session has been created.
    if log_level:
        spark.sparkContext.setLogLevel(log_level)

    return spark

# Notebook-wide session used by the read and SQL cells that follow.
spark = create_spark_session()

### Check that we can read all data


In [None]:
# Read from the bucket/prefix configured in the environment (the same values the
# upload cell uses) instead of a hard-coded bucket name, so the notebook works
# with any .env configuration. os.environ[...] raises KeyError if one is missing.
yellow_path_parts = [
    os.environ["BUCKET_NAME"],
    os.environ["S3_RAW_PREFIX"].strip("/"),
    "yellow_tripdata_2023-01.parquet",
]
yellow_path = "s3a://" + "/".join(part for part in yellow_path_parts if part)

yellow_df = spark.read.parquet(yellow_path)
yellow_df.show()
yellow_df.columns

In [None]:
# Read from the bucket/prefix configured in the environment (the same values the
# upload cell uses) instead of a hard-coded bucket name, so the notebook works
# with any .env configuration. os.environ[...] raises KeyError if one is missing.
green_path_parts = [
    os.environ["BUCKET_NAME"],
    os.environ["S3_RAW_PREFIX"].strip("/"),
    "green_tripdata_2023-01.parquet",
]
green_path = "s3a://" + "/".join(part for part in green_path_parts if part)

green_df = spark.read.parquet(green_path)
green_df.show()
green_df.columns

In [None]:
# Read from the bucket/prefix configured in the environment (the same values the
# upload cell uses) instead of a hard-coded bucket name, so the notebook works
# with any .env configuration. os.environ[...] raises KeyError if one is missing.
zone_path_parts = [
    os.environ["BUCKET_NAME"],
    os.environ["S3_RAW_PREFIX"].strip("/"),
    "taxi+_zone_lookup.csv",
]
zone_path = "s3a://" + "/".join(part for part in zone_path_parts if part)

# The lookup file is a CSV with a header row; let Spark infer the column types.
taxi_df = spark.read.csv(zone_path, header=True, inferSchema=True)
taxi_df.show()

### Join PULocation, DOLocation with taxi zone


In [None]:
from pyspark.sql import functions as F

# Register the DataFrames as global temp views so the SQL below can reference
# them through the `global_temp` database.
taxi_df.createOrReplaceGlobalTempView('taxi_zone')
green_df.createOrReplaceGlobalTempView('green_taxi')
yellow_df.createOrReplaceGlobalTempView('yellow_taxi')

# Per-route statistics for both taxi types.
#   trips  - green and yellow trips reduced to one common shape; the only
#            difference between the feeds is the lpep_/tpep_ timestamp prefix,
#            so the duration (in minutes) is computed inside each branch.
#   routes - trips enriched with pick-up / drop-off zone names (LEFT JOIN keeps
#            trips whose location id has no match in the lookup).
# The final SELECT aggregates once; taxi_type is part of the grouping key, so
# green and yellow routes are still reported separately.
SQL_QUERY = """
WITH trips AS (
    SELECT
        PULocationID,
        DOLocationID,
        (UNIX_TIMESTAMP(lpep_dropoff_datetime) - UNIX_TIMESTAMP(lpep_pickup_datetime)) / 60 AS trip_duration,
        trip_distance,
        'green' AS taxi_type
    FROM global_temp.green_taxi

    UNION ALL

    SELECT
        PULocationID,
        DOLocationID,
        (UNIX_TIMESTAMP(tpep_dropoff_datetime) - UNIX_TIMESTAMP(tpep_pickup_datetime)) / 60 AS trip_duration,
        trip_distance,
        'yellow' AS taxi_type
    FROM global_temp.yellow_taxi
),
routes AS (
    SELECT
        pu.zone AS pick_up_zone,
        dz.zone AS drop_off_zone,
        CONCAT(t.PULocationID, '_', t.DOLocationID) AS route_id,
        t.trip_duration,
        t.trip_distance,
        t.taxi_type
    FROM trips t
    LEFT JOIN global_temp.taxi_zone pu ON t.PULocationID = pu.LocationID
    LEFT JOIN global_temp.taxi_zone dz ON t.DOLocationID = dz.LocationID
)
SELECT
    pick_up_zone,
    drop_off_zone,
    route_id,
    COUNT(route_id) AS total_trips,
    ROUND(AVG(trip_duration), 2) AS avg_trip_duration,
    ROUND(AVG(trip_distance), 2) AS avg_trip_distance,
    taxi_type
FROM routes
GROUP BY route_id, pick_up_zone, drop_off_zone, taxi_type
"""

transformed_df = spark.sql(SQL_QUERY)
transformed_df.show()
# Random sample of rows as a sanity check across both taxi types.
transformed_df.orderBy(F.rand()).show(10)

### Save the result as Parquet to S3


In [None]:
from dotenv import load_dotenv
load_dotenv()
s3_prefix = os.getenv("S3_PROCESSED_PREFIX")
bucket_name = os.getenv("BUCKET_NAME")

# Fail early instead of silently writing under a literal "None/" prefix.
if not bucket_name:
    raise ValueError("BUCKET_NAME not set in environment")
if s3_prefix is None:
    raise ValueError("S3_PROCESSED_PREFIX not set in environment")

# Build the location once so the write path and the log message cannot drift
# apart; surrounding slashes on the prefix are dropped to avoid "//" in the path.
output_parts = [bucket_name, s3_prefix.strip("/"), "task_3_data_parquet"]
output_location = "/".join(part for part in output_parts if part) + "/"

# "overwrite" keeps the cell idempotent: the output is a full recomputation of
# the aggregate, so re-running with "append" would duplicate every row.
transformed_df.write \
    .mode("overwrite") \
    .parquet(f"s3a://{output_location}")

print(f"Data successfully written to s3://{output_location}")

### Data from Athena
1. Confirm that the number of trips is greater than zero.  

SQL_QUERY =  SELECT pick_up_zone, drop_off_zone, total_trips
            FROM "task_3_glue_db"."task_3_data_parquet"
            WHERE total_trips > 0
            ORDER BY total_trips ASC
            LIMIT 10;


In [None]:
import pandas as pd

# Snapshot of the query result listed above: one tuple per returned row,
# in the order (pick_up_zone, drop_off_zone, total_trips).
result_rows = [
    ("Jackson Heights", "East Chelsea", 1),
    ("Sunset Park East", "Central Harlem North", 1),
    ("Rosedale", "Hillcrest/Pomonok", 1),
    ("Bay Ridge", "Dyker Heights", 1),
    ("Sheepshead Bay", "JFK Airport", 1),
    ("Old Astoria", "Yorkville West", 1),
    ("Brooklyn Heights", "Lincoln Square West", 1),
    ("Briarwood/Jamaica Hills", "JFK Airport", 1),
    ("North Corona", "Briarwood/Jamaica Hills", 1),
    ("Steinway", "East Harlem South", 1),
]
result_columns = ("pick_up_zone", "drop_off_zone", "total_trips")

# Pivot the rows into the column-oriented dict of lists the DataFrame is built from.
data = {
    column: list(values)
    for column, values in zip(result_columns, zip(*result_rows))
}

df = pd.DataFrame(data)
df

### Data from Athena
2. Get statistics like the top 10 zone pairs with the highest trip counts.

SQL_QUERY=  SELECT pick_up_zone, drop_off_zone, total_trips
            FROM "task_3_glue_db"."task_3_data_parquet"
            GROUP BY pick_up_zone, drop_off_zone, total_trips
            ORDER BY total_trips desc
            LIMIT 10;

In [None]:
import pandas as pd

# Snapshot of the query result listed above: one tuple per returned row,
# in the order (pick_up_zone, drop_off_zone, total_trips).
result_rows = [
    ("Upper East Side South", "Upper East Side North", 22303),
    ("Upper East Side North", "Upper East Side South", 18981),
    ("N/A", "N/A", 15354),
    ("Upper East Side North", "Upper East Side North", 14926),
    ("Upper East Side South", "Upper East Side South", 14546),
    ("Upper East Side South", "Midtown Center", 9408),
    ("Midtown Center", "Upper East Side South", 9320),
    ("Midtown Center", "Upper East Side North", 8599),
    ("Lenox Hill West", "Upper East Side North", 8299),
    ("Lincoln Square East", "Upper West Side South", 8198),
]
result_columns = ("pick_up_zone", "drop_off_zone", "total_trips")

# Pivot the rows into the column-oriented dict of lists the DataFrame is built from.
data = {
    column: list(values)
    for column, values in zip(result_columns, zip(*result_rows))
}

df = pd.DataFrame(data)
df