# Introduction

In [4]:
# Restart the kernel before running this notebook
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

# Create (or reuse) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

# Display the session summary as the cell output.
spark

In [5]:
# Trip duration as a timedelta: drop-off timestamp minus pick-up timestamp.
# NOTE(review): `df` is not created in any visible cell (the recorded output of
# this cell is a NameError) - the pandas DataFrame must be loaded before this runs.
df["difference_datetime"] = df["tpep_dropoff_datetime"].sub(df["tpep_pickup_datetime"])

NameError: name 'df' is not defined

In [None]:
# Preview the first ten rows, including the new duration column.
df.head(n=10)

In [None]:
# Express the pick-up/drop-off difference as a plain number of seconds.
df["difference_sec"] = df["difference_datetime"].dt.total_seconds()

In [None]:
# Calculate the velocity in miles per hour.
# Dividing the seconds by 60 twice converts the duration to hours.
df["velocity"] = df["trip_distance"] / ((df["difference_sec"]/60) /60)

In [None]:
# Extract the hour of day of the pickup and the dropoff.
# The vectorised `.dt.hour` accessor replaces a Python-level lambda per row.
df["pickup_hour"] = df["tpep_pickup_datetime"].dt.hour
df["dropoff_hour"] = df["tpep_dropoff_datetime"].dt.hour

In [None]:
# Mean velocity of the trips, grouped by the hour in which they were picked up.
hourly_average = df["velocity"].groupby(df["pickup_hour"]).mean()
hourly_average

As we can see, most of the values are NaN. So, let's find out what is causing this.

In [None]:
# Count the rows whose duration is strictly below zero
# (counted through the non-null entries of "VendorID").
df.loc[df["difference_sec"] < 0, "VendorID"].count()

Four rows have a `difference_sec` value below zero. This may indicate that the subtraction between the two datetimes is being done incorrectly, so let's inspect those rows.

In [None]:
# Show the full rows whose duration is negative.
df.loc[df["difference_sec"].lt(0)]

In [None]:
# A duration of exactly zero breaks the distance / time division,
# so list those rows as well.
df.loc[df["difference_sec"].eq(0)]

As we can see, there are 4 records whose drop-off datetime is earlier than the pick-up datetime, which is incorrect. In addition, 9339 rows have the value zero in the column "difference_sec". So, let's remove them.

In [None]:
# Keep only the trips with a strictly positive duration.
df_cleaned = df.loc[df["difference_sec"].gt(0)]

In [None]:
# Preview the first ten rows that survived the filter.
df_cleaned.head(n=10)

In [None]:
# Recompute the mean velocity per pickup hour on the cleaned trips.
hourly_average = df_cleaned.groupby("pickup_hour")["velocity"].mean()
hourly_average

In [None]:
# Turn the hourly Series into a two-column frame ("pickup_hour", "velocity")
# so it can be handed to the plotting call.
temp = hourly_average.reset_index()
temp

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter, ScalarFormatter

In [None]:
# Velocity at which the diverging palette switches colour; adjust to move
# the centre of the colour transition.
center_value = 10

# A colormap is sampled on [0, 1]. Scale the deviations from `center_value`
# symmetrically so the centre lands on the palette midpoint (0.5) and the
# largest deviation reaches one of the two ends, instead of clipping every
# value below the centre to the same colour.
deviation = temp["velocity"] - center_value
max_deviation = deviation.abs().max()
if max_deviation > 0:
    intensity = 0.5 + deviation / (2 * max_deviation)
else:
    # All bars sit exactly on the centre: give them the neutral midpoint colour.
    intensity = deviation * 0 + 0.5

# Create a custom diverging color palette
custom_palette = sns.diverging_palette(250, 10, as_cmap=True)
bar_colors = [tuple(rgba) for rgba in custom_palette(intensity.to_numpy())]

plt.figure(figsize=(10, 6))
barplot = sns.barplot(data=temp, x="pickup_hour", y="velocity", palette=bar_colors)

# Show the plot
plt.show()

As we can see, the plot shows the average velocity for each hour of the day. The highest velocities occur shortly before and after midnight, with a peak at 5 am. This could be due to the low traffic and the low presence of pedestrians on the streets at those hours.

In [None]:
# Narrow the cleaned trips down to the route and fare columns.
df_filtered = df_cleaned.loc[
    :,
    ["PULocationID", "DOLocationID", "trip_distance", "tolls_amount", "total_amount"],
]
df_filtered.head(n=10)

Before continuing, let's add the names of the zones for a better understanding.

In [None]:
# Load the taxi zone lookup table from the local dataset folder and preview it.
# NOTE(review): `pd` is not imported in any visible cell - confirm pandas is imported earlier.
df_zones = pd.read_csv("dataset/taxi_zone_lookup.csv")
df_zones.head(5)

In [None]:
# Attach the zone lookup columns to each trip by matching the pickup
# location ID; the left join keeps trips without a matching zone.
merged_df = df_filtered.merge(
    df_zones,
    how='left',
    left_on='PULocationID',
    right_on='LocationID',
)

In [None]:
# Give the zone-lookup columns pickup-specific names. The original dict
# literal was a syntax error ('ZonePickUp' had no value) and had the
# old/new names swapped for the zone column.
# NOTE(review): assumes the lookup table's zone-name column is called `Zone` - confirm.
merged_df = merged_df.rename(columns={'Borough': 'BoroughPickUp', 'Zone': 'ZonePickUp'})

In [None]:
# Add the pickup zone name to every trip by mapping each PULocationID through
# a LocationID -> Zone lookup Series. The previous `==` between two
# differently indexed columns cannot be evaluated element-wise, and it tried
# to store a whole DataFrame in a single column of a slice.
# NOTE(review): assumes the lookup table has a `Zone` column and unique LocationIDs - confirm.
zone_by_location = df_zones.set_index("LocationID")["Zone"]
df_filtered = df_filtered.assign(PUZone=df_filtered["PULocationID"].map(zone_by_location))

In [None]:
# Group the trips by (origin, destination) and count them:
# the ten most frequent routes come first.
(
    df_filtered
    .groupby(["PULocationID", "DOLocationID"])["total_amount"]
    .count()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# Same grouping, but summing the total amount: the ten routes with the
# highest accumulated amount.
(
    df_filtered
    .groupby(["PULocationID", "DOLocationID"])["total_amount"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

# Data Analysis

* As Spark Dataframes operations are lazy, we need to use show() to trigger the computation and see the results

Data dictionary of this dataset can be found in the following link:
https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf

In [None]:
# Load the yellow-taxi trip records. Parquet files carry their own schema and
# column names, so the CSV/JSON reader options that were set here
# (inferSchema, header, timestampFormat, mode) had no effect and are dropped.
sdf = spark.read.parquet("dataset/yellow_tripdata_2017-01.parquet")

In [None]:
# Print the name and data type of each column
sdf.printSchema()

In [None]:
# Count the records (this triggers a Spark job over the dataset)
sdf.count()

9'710.820 - 4'856.845

## Null values

In [None]:
from pyspark.sql.functions import col, sum, when, expr

In [None]:
# Build one aggregate expression per column: each row contributes 1 when the
# column is null and 0 otherwise, and the sum is the number of nulls.
# (`sum` here is the Spark aggregate imported from pyspark.sql.functions.)
null_counts = []
for column_name in sdf.columns:
    null_flag = when(col(column_name).isNull(), 1).otherwise(0)
    null_counts.append(sum(null_flag).alias(column_name))

# Unpack the list so every expression is passed to agg() as its own argument,
# then display the single-row result.
sdf.agg(*null_counts).show()

In [None]:
# 'congestion_surcharge' and 'airport_fee' contain only nulls, so drop both.
sdf = sdf.drop("congestion_surcharge", "airport_fee")

## Take a sample from the dataset
Randomly select a subset of the records to limit the size of the dataset, which makes the operations faster to execute.

In [None]:
# Keep roughly half of the rows, sampled without replacement; the fixed seed
# makes the sample reproducible.
sdf = sdf.sample(False, 0.5, 15)

new_size = sdf.count()
print(f"Size of the sample '{new_size}'")

# Estudio #1 -  

# Estudio #2 - 

# Estudio #3 - 

# Estudio #4 - Taxi Velocity average in each hour

In [None]:
# Trip duration: drop-off timestamp minus pick-up timestamp.
sdf = sdf.withColumn(
    "duration",
    sdf["tpep_dropoff_datetime"] - sdf["tpep_pickup_datetime"],
)
sdf.select("duration").show()

### Check duration of trips

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, StringType

# Convert to seconds manually through a user-defined function (UDF)
def interval_to_seconds(interval):
    """Return the whole number of seconds in ``interval``.

    Args:
        interval: an object exposing ``total_seconds()`` (for example a
            ``datetime.timedelta``), or ``None`` for a missing value.

    Returns:
        The duration truncated to an ``int``, or ``None`` when ``interval``
        is ``None`` so that a null input stays null instead of raising.
    """
    if interval is None:
        return None
    return int(interval.total_seconds())

# Wrap the Python function as a Spark UDF that returns an integer column.
interval_to_seconds_udf = udf(interval_to_seconds, returnType=IntegerType())

In [None]:
# Replace the interval-valued "duration" column with its length in seconds.
sdf = sdf.withColumn(
    "duration",
    interval_to_seconds_udf(sdf["duration"]),
)

# Display the converted column.
sdf.select("duration").show()

In [None]:
# Look for anomalies: trips whose duration is negative.
sdf.where("duration < 0").show()

In [None]:
# Trips whose duration is exactly zero seconds.
sdf.where("duration = 0").show()

All the records above correspond to trips that finished at the same time they started.

Because of all these records, we proceed to remove them

In [None]:
# Keep only the trips with a strictly positive duration and report how many remain.
sdf = sdf.where(col("duration") > 0)

size_after_reduction = sdf.count()
print(f"Size of the sample '{size_after_reduction}'")

### Calculate Velocity of each trip

In [None]:
# The duration column is in seconds, so dividing it by 3600 converts it to
# hours; the trip distance divided by those hours is the velocity.
sdf = sdf.withColumn("velocity",col("trip_distance")/(col("duration")/3600))
sdf.select("velocity").show()

*The units of these values are miles per hour

### Calculate avg per hour

In [None]:
from pyspark.sql.functions import hour

In [None]:
# Add the hour of day of the pickup and of the dropoff as new columns.
sdf = (
    sdf
    .withColumn("hour_pickup", hour("tpep_pickup_datetime"))
    .withColumn("hour_dropoff", hour("tpep_dropoff_datetime"))
)
sdf.show()

In [None]:
from pyspark.sql.functions import avg, max, min

In [None]:
# Average velocity per pickup hour, ordered by hour.
result_df = (
    sdf
    .groupBy("hour_pickup")
    .agg(avg("velocity").alias("VelocityAvg"))
    .sort("hour_pickup")
)

# Display up to 24 rows of the result.
result_df.show(n=24)

### Visualize results

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter, ScalarFormatter

In [None]:
# Fetch the highest and lowest hourly averages in a single Spark job. The
# results get their own names: assigning them to `max` / `min` overwrote the
# Spark aggregate functions imported above, so the cell failed when re-run.
max_velocity_avg, min_velocity_avg = result_df.agg(
    max("VelocityAvg"), min("VelocityAvg")
).first()

# Midpoint between the slowest and the fastest hourly average, used as the
# threshold that splits the bars into two colours. (The previous
# `(max/min)/2` was half of a ratio, not a velocity.)
center_value = (max_velocity_avg + min_velocity_avg) / 2

In [None]:
# Define a custom color mapping based on numeric values
def custom_color_palette(x, center=None):
    """Pick a bar colour depending on which side of the threshold ``x`` falls.

    Args:
        x: numeric value to classify, or ``None`` for a missing value.
        center: threshold to compare against. When omitted, the notebook-level
            ``center_value`` is used, as in the original one-argument call.

    Returns:
        ``"lightblue"`` when ``x`` is at or below the threshold,
        ``"lightcoral"`` when it is above, and ``None`` when ``x`` is ``None``.
    """
    if x is None:
        return None
    if center is None:
        center = center_value
    # "lightred" is not a named matplotlib colour; "lightcoral" is the
    # closest valid light red.
    return "lightblue" if x <= center else "lightcoral"

# Expose the colour picker as a string-returning Spark UDF.
custom_color_palette_udf = udf(custom_color_palette, returnType=StringType())

# Tag every hourly average with its colour in a new "Color" column.
result_df = result_df.withColumn(
    "Color",
    custom_color_palette_udf(result_df["VelocityAvg"]),
)

In [None]:
# Bring the aggregated result to the driver as a pandas frame for plotting.
result_df_p = result_df.toPandas()

# Bar chart of the average velocity for each pickup hour.
fig, ax = plt.subplots(figsize=(10, 6))
barplot = sns.barplot(data=result_df_p, x="hour_pickup", y="VelocityAvg", ax=ax)
ax.set_xlabel("Hour")
ax.set_ylabel("Average velocity")

# Render the figure.
plt.show()

# Analisis de rendimiento