In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# Driver configuration: submit in client deploy mode, then attach to the
# running SparkContext if there is one (otherwise start a new one).
cf = SparkConf().set("spark.submit.deployMode", "client")
sc = SparkContext.getOrCreate(cf)

# Session used by every later cell for DataFrame reads and SQL functions.
spark = (
    SparkSession.builder
    .appName("TransitTrends- Taxi - Data Cleaning")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/04/22 01:00:21 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
23/04/22 01:00:21 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
23/04/22 01:00:21 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
23/04/22 01:00:21 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


In [2]:
from pyspark.sql.functions import lit

# Green cabs: read the parquet files, tag each row with its taxi type, drop
# the unneeded "trip_type" column and rename the lpep_* timestamp columns to
# the tpep_* names used by the yellow-cab data.
# (A CSV sample can be read instead with
#  spark.read.csv(path='/shared/TAXI_SAMPLE/GREEN_CABS/', header=True, inferSchema=True).)
green_cabs_df = (
    spark.read.parquet("/shared/TAXI/GREEN_CABS/")
    .withColumn("taxi_type", lit("green"))
    .drop("trip_type")
    .withColumnRenamed("lpep_dropoff_datetime", "tpep_dropoff_datetime")
    .withColumnRenamed("lpep_pickup_datetime", "tpep_pickup_datetime")
)

# Yellow cabs: read the parquet files and tag each row with its taxi type.
# (A CSV sample can be read instead with
#  spark.read.csv(path='/shared/TAXI_SAMPLE/YELLOW_CABS/', header=True, inferSchema=True).)
yellow_cabs_df = (
    spark.read.parquet("/shared/TAXI/YELLOW_CABS/")
    .withColumn("taxi_type", lit("yellow"))
)

# Note: .ipynb checkpoint directories sometimes show up in the green-cabs
# input folder on HDFS; remove them before running this cell.

ERROR:root:Exception while sending command.                         (0 + 0) / 1]
Traceback (most recent call last):
  File "/opt/conda/miniconda3/lib/python3.8/site-packages/py4j/java_gateway.py", line 1207, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/miniconda3/lib/python3.8/site-packages/py4j/java_gateway.py", line 1033, in send_command
    response = connection.send_command(command)
  File "/opt/conda/miniconda3/lib/python3.8/site-packages/py4j/java_gateway.py", line 1211, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while receiving
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:39067)
Traceback (most recent call last):
  File "/opt/conda/miniconda3/lib/python3.8/site-packages/IPyth

Py4JError: An error occurred while calling o77.parquet

In [None]:
# Row count first, then the first 100 rows of the green-cab data.
green_row_count = green_cabs_df.count()
print(green_row_count)
green_cabs_df.show(n=100)

In [None]:
# Row count first, then the first 100 rows of the yellow-cab data.
yellow_row_count = yellow_cabs_df.count()
print(yellow_row_count)
yellow_cabs_df.show(n=100)

In [None]:
# Combine the yellow and green cab data into a single dataset.
# DataFrame.union() pairs columns purely by position, so two frames whose
# columns are ordered differently would be merged with values under the wrong
# column names. unionByName() pairs columns by name instead, and
# allowMissingColumns=True (Spark 3.1+) fills a column that exists on only one
# side with nulls rather than failing.
cabs_df = yellow_cabs_df.unionByName(green_cabs_df, allowMissingColumns=True)

# Drop the column that is not needed for the rest of the cleaning.
cabs_df = cabs_df.drop("store_and_fwd_flag")

In [None]:
print(cabs_df.count())
cabs_df.show(100)

In [None]:
import pyspark.sql.functions as F

In [None]:
# Normalise the header: re-select every column under its name with leading
# and trailing whitespace stripped.
stripped_columns = [F.col(name).alias(name.strip()) for name in cabs_df.columns]
cabs_df = cabs_df.select(*stripped_columns)
cabs_df.show(n=100)

In [None]:
# De-duplicate: two rows are treated as the same trip when all of the
# columns listed below match.
duplicate_key_columns = [
    "taxi_type",
    "total_amount",
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    "pulocationid",
    "dolocationid",
    "passenger_count",
    "vendorid",
    "trip_distance",
]
cabs_df = cabs_df.dropDuplicates(subset=duplicate_key_columns)
print(cabs_df.count())

In [None]:
# Keep only trips picked up in 2019-2021; this removes the last week of 2018
# that is present in the dataset.
pickup_year = F.year("tpep_pickup_datetime")
cabs_df = cabs_df.filter(pickup_year.isin(2019, 2020, 2021))
print(cabs_df.count())

In [None]:
from pyspark.sql.functions import concat

# Cast the pickup time to a timestamp, then derive the calendar month and
# year of each trip from it.
cabs_df = cabs_df.withColumn(
    "pickup_datetime", F.col("tpep_pickup_datetime").cast("timestamp")
)
cabs_df = cabs_df.withColumn("month", F.month(F.col("pickup_datetime")))
cabs_df = cabs_df.withColumn("year", F.year(F.col("pickup_datetime")))

In [None]:
# Narrow the data to the two columns needed for per-month counting.
# NOTE: no combined "month/year" label is derived here; an earlier one-liner
# that built it also replaced cabs_df with a frame holding only that label,
# so it is intentionally not executed.
monthly_records = cabs_df.select(F.col("month"), F.col("year"))

In [None]:
# Preview the first 100 (month, year) pairs.
monthly_records.show(n=100)

In [None]:
from pyspark.sql.functions import count, asc

# Number of trips in each (year, month) bucket, listed chronologically.
trips_per_month = monthly_records.groupBy('year', 'month').agg(
    count('*').alias('count')
)
count_df = trips_per_month.orderBy('year', 'month')
count_df.show(n=100)

In [None]:
import matplotlib.pyplot as plt

# Convert the aggregated PySpark DataFrame to a pandas DataFrame.
# The result is deliberately NOT bound to `pd`: that name is the conventional
# alias for the pandas module, and shadowing it with a DataFrame breaks any
# code that expects `pd` to be the library.
count_pdf = count_df.toPandas()

# One bar per bucket, labelled "<year>-<month>".
year_month_labels = count_pdf['year'].astype(str) + '-' + count_pdf['month'].astype(str)
plt.bar(year_month_labels, count_pdf['count'])

# Set the title and axis labels
plt.title('Trips per Year-Month')
plt.xlabel('Year-Month')
plt.ylabel('Count')

# The x axis carries one label per month; rotate them so they do not overlap.
plt.xticks(rotation=90)

# Show the plot
plt.show()

In [None]:
# Trip count per calendar month, keyed by a "month/year" label such as "1/2019".
# cabs_df only carries separate "month" and "year" columns, so the label is
# built here before grouping; grouping on a "month/year" column that was never
# created raises an AnalysisException.
cabs_monthly_trips_df = (
    cabs_df
    .withColumn(
        "month/year",
        F.concat(
            F.col("month").cast("string"),
            F.lit("/"),
            F.col("year").cast("string"),
        ),
    )
    .groupBy("month/year")
    .count()
)

In [None]:
from pyspark.sql.functions import asc, col, split

# Order the monthly counts chronologically.
# "month/year" is a string, so sorting on it directly is lexicographic
# ("1/2019", "1/2020", "10/2019", ...) and scrambles the time axis of any plot
# built from it. Split the label and sort on the numeric year, then the
# numeric month. Assumes labels of the form "<month>/<year>".
month_year_parts = split(col('month/year'), '/')
cabs_monthly_trips_df = cabs_monthly_trips_df.orderBy(
    month_year_parts.getItem(1).cast('int').asc(),
    month_year_parts.getItem(0).cast('int').asc(),
)

In [None]:
# Preview up to 100 rows of the per-month trip counts.
cabs_monthly_trips_df.show(n=100)

In [None]:
# Number of rows in the per-month summary (one row per distinct "month/year"
# group); left as the cell's last expression so the notebook displays it.
cabs_monthly_trips_df.count()

In [None]:
import matplotlib.pyplot as plt
from pyspark.sql.functions import unix_timestamp, from_unixtime
import pandas as pd


In [None]:
# Bring the small per-month summary to the driver as a pandas DataFrame.
pandas_df = cabs_monthly_trips_df.toPandas()

# Line chart of trip count per "month/year" label, in the row order of the frame.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(pandas_df['month/year'], pandas_df['count'])
ax.set_xlabel('Month/Year')
ax.set_ylabel('Trip Count')
ax.set_title('Monthly Cab Trips')
plt.show()