#### Importing Libraries

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
# Apply one seaborn theme to every figure drawn after this cell: dark grid
# style, the "deep" palette, and a 9x6 default figure size.
sns.set_theme(
    style="darkgrid",
    palette="deep",
    rc={"figure.figsize": (9, 6)},
)

#### Exploratory Data Analysis (EDA)

In [None]:
# Fixed seed so the random sample below is requested with the same seed on
# every run.
SEED = 1234

# NOTE(review): `spark` and `display` are not defined in the visible cells;
# presumably they are injected by the notebook runtime - confirm.
nyc_yellowtaxi_raw_df = spark.read.table("nyc_yellowtaxi_raw")

# Draw a 0.1% sample (with replacement) of the raw table so that the
# exploratory steps work on a much smaller DataFrame.
nyc_yellowtaxi_sampled_df = nyc_yellowtaxi_raw_df.sample(
    withReplacement=True,
    fraction=0.001,
    seed=SEED,
)

# Show the summary statistics of the sampled rows.
sample_summary_df = nyc_yellowtaxi_sampled_df.summary()
display(sample_summary_df)

In [None]:
# Convert the sampled Spark DataFrame to pandas so it can be plotted with
# seaborn/matplotlib.
nyc_yellowtaxi_sampled_pd_df = nyc_yellowtaxi_sampled_df.toPandas()

pickup_ts = nyc_yellowtaxi_sampled_pd_df['tpepPickupDateTime']
dropoff_ts = nyc_yellowtaxi_sampled_pd_df['tpepDropoffDateTime']

# Trip duration in whole minutes (float). The previous
# `.astype('timedelta64[m]')` cast is rejected by pandas >= 2.0, which only
# supports s/ms/us/ns timedelta resolutions. Floor-dividing the total seconds
# keeps the whole-minute values on every pandas version.
nyc_yellowtaxi_sampled_pd_df['tripDuration'] = (dropoff_ts - pickup_ts).dt.total_seconds() // 60

# Both calendar features are derived from the pickup timestamp so that a trip
# crossing midnight does not get its hour from one day and its weekday from
# the next (dayOfWeek was previously taken from the drop-off timestamp).
nyc_yellowtaxi_sampled_pd_df['pickupHour'] = pickup_ts.dt.hour
nyc_yellowtaxi_sampled_pd_df['dayOfWeek'] = pickup_ts.dt.dayofweek

# Keep only rows with a strictly positive duration and a strictly positive
# fare; rows where either value is missing are dropped by the comparison too.
has_positive_duration = nyc_yellowtaxi_sampled_pd_df["tripDuration"] > 0
has_positive_fare = nyc_yellowtaxi_sampled_pd_df["fareAmount"] > 0
nyc_yellowtaxi_sampled_pd_df = nyc_yellowtaxi_sampled_pd_df[has_positive_duration & has_positive_fare]

In [None]:
# Count of sampled trips per pickup hour, one bar per discrete hour value,
# with a kernel density estimate curve overlaid.
hourly_ax = sns.histplot(
    data=nyc_yellowtaxi_sampled_pd_df,
    x="pickupHour",
    stat="count",
    discrete=True,
    kde=True,
)

# Label the Axes seaborn just drew on.
hourly_ax.set_title("Distribution by Hour of the day")
hourly_ax.set_xlabel('Hours')
hourly_ax.set_ylabel('Count of trips')

In [None]:
# Trip distance against trip duration, with points coloured by the number of
# passengers. The current Axes is passed explicitly instead of relying on the
# implicit default.
sns.scatterplot(
    ax=plt.gca(),
    data=nyc_yellowtaxi_sampled_pd_df,
    x="tripDistance",
    y="tripDuration",
    hue="passengerCount",
)

In [None]:
# Side-by-side box plots of trip duration per passenger count: the left panel
# uses every sampled row, the right panel uses the filtered subset built below.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
all_trips_ax, filtered_trips_ax = axes

sns.boxplot(
    ax=all_trips_ax,
    data=nyc_yellowtaxi_sampled_pd_df,
    x="passengerCount",
    y="tripDuration",
)
all_trips_ax.set_title('Distribution of Trip duration by passengerCount')

# Subset for the second panel: at least one passenger and a duration of at
# most 180 (the unit is whatever `tripDuration` was computed in upstream).
has_passengers = nyc_yellowtaxi_sampled_pd_df["passengerCount"] > 0
within_duration_cap = nyc_yellowtaxi_sampled_pd_df["tripDuration"] <= 180
nyc_yellowtaxi_sampled_clean_pd_df = nyc_yellowtaxi_sampled_pd_df.loc[has_passengers & within_duration_cap]

sns.boxplot(
    ax=filtered_trips_ax,
    data=nyc_yellowtaxi_sampled_clean_pd_df,
    x="passengerCount",
    y="tripDuration",
)
filtered_trips_ax.set(title='Distribution of Trip duration by passengerCount (outliers removed)')

In [None]:
# Fare amount against trip duration, drawn twice on a 1x2 grid: coloured by
# payment type on the left and by vendor on the right.
f, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Arguments common to both panels; only the hue column differs.
fare_vs_duration = dict(
    data=nyc_yellowtaxi_sampled_pd_df,
    x="fareAmount",
    y="tripDuration",
)

sns.scatterplot(ax=axes[0], hue="paymentType", **fare_vs_duration)
sns.scatterplot(ax=axes[1], hue="vendorID", **fare_vs_duration)

In [None]:
# Columns included in the correlation matrix.
cols_to_corr = [
    'tripDuration',
    'fareAmount',
    'passengerCount',
    'tripDistance',
    'extra',
    'mtaTax',
    'tollsAmount',
    'improvementSurcharge',
    'tipAmount',
    'pickupHour',
    'dayOfWeek',
]

# Pairwise correlations between the selected columns, drawn as an annotated
# heatmap with each coefficient shown to three decimals.
correlation_matrix = nyc_yellowtaxi_sampled_pd_df[cols_to_corr].corr()
sns.heatmap(data=correlation_matrix, annot=True, fmt='.3f', cmap="Greens")