In [0]:
%python
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Configure Spark for client deploy mode and obtain (or reuse) the SparkContext.
cf = SparkConf().set("spark.submit.deployMode", "client")
sc = SparkContext.getOrCreate(cf)

# Obtain (or reuse) the SparkSession used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("TransitTrends- Taxi - Data Cleaning")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [1]:
%python
from pyspark.sql.functions import lit

# Green cabs: tag every row with its taxi type, drop the unneeded trip_type
# column, and rename the lpep_* timestamp columns to the tpep_* names so they
# match the yellow-cab data.
# Note: an earlier CSV-based variant read from '/shared/TAXI_SAMPLE/GREEN_CABS/'
# with header=True, inferSchema=True.
green_cabs_df = (
    spark.read.parquet("/shared/TAXI/GREEN_CABS/")
    .withColumn("taxi_type", lit("green"))
    .drop("trip_type")
    .withColumnRenamed("lpep_pickup_datetime", "tpep_pickup_datetime")
    .withColumnRenamed("lpep_dropoff_datetime", "tpep_dropoff_datetime")
)

# Yellow cabs: only the taxi-type tag is added.
# Note: an earlier CSV-based variant read from '/shared/TAXI_SAMPLE/YELLOW_CABS/'
# with header=True, inferSchema=True.
yellow_cabs_df = (
    spark.read.parquet("/shared/TAXI/YELLOW_CABS/")
    .withColumn("taxi_type", lit("yellow"))
)

In [2]:
%python
# Sanity check on the green-cab load: total row count, then the first 100 rows.
print(green_cabs_df.count())
green_cabs_df.show(n=100)

In [3]:
%python
# Sanity check on the yellow-cab load: total row count, then the first 100 rows.
print(yellow_cabs_df.count())
yellow_cabs_df.show(n=100)

In [4]:
%python
# Combine the yellow and green cab trips into a single dataset.
# unionByName matches columns by NAME. A plain union() matches by POSITION and
# would silently put values under the wrong column whenever the two sources
# order their columns differently.
# allowMissingColumns=True (requires Spark 3.1+) fills a column that exists in
# only one source with nulls instead of failing.
cabs_df = yellow_cabs_df.unionByName(green_cabs_df, allowMissingColumns=True)

# Drop a column that is not needed downstream.
cabs_df = cabs_df.drop("store_and_fwd_flag")

In [5]:
%python
# Sanity check on the combined dataset: total row count, then the first 100 rows.
print(cabs_df.count())
cabs_df.show(n=100)

In [6]:
%python
import pyspark.sql.functions as F

In [7]:
%python
# Strip leading/trailing whitespace from every column name by re-selecting each
# column under its trimmed alias, then preview the result.
cabs_df = cabs_df.select(
    *(F.col(name).alias(name.strip()) for name in cabs_df.columns)
)
cabs_df.show(n=100)

In [8]:
%python
# De-duplicate trips: two rows count as the same trip when all of the columns
# listed below are equal.
cabs_df = cabs_df.dropDuplicates(
    subset=[
        "taxi_type",
        "total_amount",
        "tpep_pickup_datetime",
        "tpep_dropoff_datetime",
        "pulocationid",
        "dolocationid",
        "passenger_count",
        "vendorid",
        "trip_distance",
    ]
)
cabs_df.count()

In [9]:
%python
# Keep only trips picked up in 2019, 2020 or 2021 (per the original note, this
# removes the last week of 2018 from the dataset).
cabs_df = cabs_df.where(
    F.year(F.col("tpep_pickup_datetime")).isin(2019, 2020, 2021)
)
cabs_df.count()

In [10]:
%python
from pyspark.sql.functions import concat

# Derive a "month/year" label (e.g. "1/2019" - the month is not zero-padded)
# from the pickup time. The helper columns "month" and "year" are temporary
# and are dropped once the label has been built.
cabs_df = (
    cabs_df
    .withColumn("pickup_datetime", F.col("tpep_pickup_datetime").cast("timestamp"))
    .withColumn("month", F.month("pickup_datetime"))
    .withColumn("year", F.year("pickup_datetime"))
    .withColumn("month/year", concat(F.col("month"), F.lit("/"), F.col("year")))
    .drop("month", "year")
)

In [11]:
%python
# Preview the first 100 rows, including the new pickup_datetime and month/year columns.
cabs_df.show(n=100)

In [12]:
%python
# Number of trips per "month/year" label; the result has the columns
# "month/year" and "count".
cabs_monthly_trips_df = (
    cabs_df
    .groupBy("month/year")
    .count()
)

In [13]:
%python
# Show up to 100 rows of the monthly trip counts.
cabs_monthly_trips_df.show(n=100)

In [14]:
%python
# Number of distinct month/year groups in the aggregate.
cabs_monthly_trips_df.count()

In [15]:
%python
import matplotlib.pyplot as plt
from pyspark.sql.functions import unix_timestamp, from_unixtime

# Bar chart of the number of trips per month, in chronological order.
#
# The chain starts from cabs_monthly_trips_df (the per-month aggregate). It
# previously read from `df`, which is not defined anywhere before this cell and
# raised NameError on a fresh kernel.
df = (
    cabs_monthly_trips_df
    # Parse the "month/year" label into a unix timestamp so it can be sorted.
    # The labels are built without zero padding (e.g. "1/2019"), so the pattern
    # uses 'M' (one or two digits); 'MM' expects exactly two digits on Spark 3's
    # parser and would not match single-digit months.
    .withColumn('timestamp', unix_timestamp('month/year', 'M/yyyy'))
    # Render the timestamp as a date string for the x-axis tick labels.
    .withColumn('date', from_unixtime('timestamp', 'yyyy-MM-dd'))
    # Chronological order (sorting the raw labels as strings would not be).
    .sort('timestamp')
)

# Collect both columns in a single Spark job so every label stays paired with
# its own count. Row['count'] is used because Row.count is the tuple method.
rows = df.select('date', 'count').collect()
x = [row['date'] for row in rows]
y = [row['count'] for row in rows]

# plot bar graph
plt.bar(x, y)
plt.xticks(rotation=45)
plt.xlabel('Month/Year')
plt.ylabel('Count')
plt.show()