In [0]:
import pyspark.pandas as ps

In [0]:
# Session configuration for the notebook.
# Reference: https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_ps.html
# Turns on the Arrow setting for PySpark on the ambient `spark` session.
ARROW_PYSPARK_ENABLED_KEY = "spark.sql.execution.arrow.pyspark.enabled"
spark.conf.set(ARROW_PYSPARK_ENABLED_KEY, True)

### Load dataset

In [0]:
# Load every column of the samples.nyctaxi.trips table into a Spark DataFrame.
TRIPS_QUERY = 'SELECT * FROM samples.nyctaxi.trips'
df_spark = spark.sql(TRIPS_QUERY)

In [0]:
# Convert the Spark DataFrame into a local pandas DataFrame.
# Per the PySpark docs, toPandas() loads all rows into the driver's memory,
# so this is only appropriate when the result is expected to be small.
df_pandas = df_spark.toPandas()

### To pandas-spark and back

In [0]:
# From pandas to pandas-on-Spark: wrap the local pandas DataFrame `df_pandas`
# in a pandas-on-Spark DataFrame via ps.from_pandas.
psdf_from_pandas = ps.from_pandas(df_pandas)

In [0]:
# From Spark to pandas-on-Spark: expose the Spark DataFrame `df_spark`
# through the pandas API. `psdf` is the frame the later cells work on.
psdf = df_spark.pandas_api()

In [0]:
# From pandas-on-Spark back to Spark: convert `psdf` to a Spark DataFrame.
spark_df = psdf.to_spark()
# Last expression of the cell, so the resulting type is shown as the cell output.
type(spark_df)

pyspark.sql.dataframe.DataFrame

### Pandas-spark read/write

In [0]:
# Example read/write round-trips, left commented out (uncomment to run).
# NOTE(review): per the pandas-on-Spark docs, to_csv/to_parquet write a directory
# of part files at the given path rather than a single file — confirm before
# relying on the exact path.
# psdf.to_csv('foo.csv')
# ps.read_csv('foo.csv').head(10)

# psdf.to_parquet('bar.parquet')
# ps.read_parquet('bar.parquet').head(10)

pyspark.sql.dataframe.DataFrame

### Pandas-spark introduction

In [0]:
# Quick inspection examples, left commented out (uncomment to run).
# They target `psdf`, the pandas-on-Spark DataFrame built from `df_spark`;
# no variable named `df` is defined in this notebook.
# psdf.dtypes
# psdf.isnull().sum()
# psdf.pickup_zip.value_counts(ascending=False).head()

### Pandas-spark workflow

In [0]:
# Derive three new columns on the pandas-on-Spark DataFrame `psdf`.
# The columns are assigned onto `psdf` itself, so later cells see them.
def classify_trip(distance) -> str:
    """Return 'Long Trip' when distance is greater than 10, else 'Short Trip'.

    The `-> str` return annotation gives pandas-on-Spark the result type up
    front, so Series.apply does not need to run the function on a sample of
    the data just to infer it.
    """
    return 'Long Trip' if distance > 10 else 'Short Trip'


# Fare divided by distance, row by row.
# NOTE(review): rows where trip_distance is 0 are not special-cased here.
psdf['fare_per_mile'] = psdf['fare_amount'] / psdf['trip_distance']
psdf['trip_type'] = psdf['trip_distance'].apply(classify_trip)
# NOTE(review): if tpep_pickup_datetime is already a timestamp column, the
# ps.to_datetime call may be unnecessary — confirm against the table schema.
psdf['pickup_year'] = ps.to_datetime(psdf['tpep_pickup_datetime']).dt.year

In [0]:
# Convert the pandas-on-Spark DataFrame `psdf` (now carrying the derived
# fare_per_mile, trip_type and pickup_year columns) to a Spark DataFrame.
spark_df_new = psdf.to_spark()

In [0]:
# Register the Spark DataFrame as the temporary view "nyctaxi_trips_view" so it
# can be queried with SQL; an existing view of the same name is replaced.
spark_df_new.createOrReplaceTempView("nyctaxi_trips_view")

In [0]:
%sql
-- Preview five rows of the nyctaxi_trips_view temporary view.
SELECT *
FROM nyctaxi_trips_view
LIMIT 5;

tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,fare_amount,pickup_zip,dropoff_zip,fare_per_mile,trip_type,pickup_year
2016-02-16T22:40:45Z,2016-02-16T22:59:25Z,5.35,18.5,10003,11238,3.457943925233645,Short Trip,2016
2016-02-05T16:06:44Z,2016-02-05T16:26:03Z,6.5,21.5,10282,10001,3.3076923076923075,Short Trip,2016
2016-02-08T07:39:25Z,2016-02-08T07:44:14Z,0.9,5.5,10119,10003,6.111111111111111,Short Trip,2016
2016-02-29T22:25:33Z,2016-02-29T22:38:09Z,3.5,13.5,10001,11222,3.857142857142857,Short Trip,2016
2016-02-03T17:21:02Z,2016-02-03T17:23:24Z,0.3,3.5,10028,10028,11.666666666666668,Short Trip,2016


In [0]:
# Save the Spark DataFrame as the catalog table my_database.my_nyctaxi.
# Create the target database first so the write also succeeds on a fresh
# workspace where it does not exist yet; IF NOT EXISTS makes this a no-op
# when the database is already there.
spark.sql("CREATE DATABASE IF NOT EXISTS my_database")
# mode("overwrite") replaces any existing table contents, so the cell can be re-run.
spark_df_new.write.mode("overwrite").saveAsTable("my_database.my_nyctaxi")

In [0]:
%sql
-- Preview five rows of the saved my_database.my_nyctaxi table.
SELECT *
FROM my_database.my_nyctaxi
LIMIT 5;

tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,fare_amount,pickup_zip,dropoff_zip,fare_per_mile,trip_type,pickup_year
2016-02-16T22:40:45Z,2016-02-16T22:59:25Z,5.35,18.5,10003,11238,3.457943925233645,Short Trip,2016
2016-02-05T16:06:44Z,2016-02-05T16:26:03Z,6.5,21.5,10282,10001,3.3076923076923075,Short Trip,2016
2016-02-08T07:39:25Z,2016-02-08T07:44:14Z,0.9,5.5,10119,10003,6.111111111111111,Short Trip,2016
2016-02-29T22:25:33Z,2016-02-29T22:38:09Z,3.5,13.5,10001,11222,3.857142857142857,Short Trip,2016
2016-02-03T17:21:02Z,2016-02-03T17:23:24Z,0.3,3.5,10028,10028,11.666666666666668,Short Trip,2016


In [0]:

# Aggregate trips per (pickup_zip, dropoff_zip) pair from the saved table
# my_database.my_nyctaxi: trip count, average distance and total fare.
# To run the same query against the temporary view instead, replace the
# FROM target with nyctaxi_trips_view.
result_df = spark.sql("""
    SELECT
        pickup_zip,
        dropoff_zip,
        COUNT(*) AS trip_count,
        AVG(trip_distance) AS avg_distance,
        SUM(fare_amount) AS total_fare
    FROM my_database.my_nyctaxi
    GROUP BY pickup_zip, dropoff_zip
    -- Tie-break on the zip pair so rows with equal trip_count return in a stable order
    ORDER BY trip_count DESC, pickup_zip, dropoff_zip
""")

# Show the result
display(result_df)

pickup_zip,dropoff_zip,trip_count,avg_distance,total_fare
10023,10023,143,0.7644755244755245,798.0
10028,10028,135,0.6650370370370372,722.0
10028,10021,131,0.8712977099236643,777.5
10021,10028,120,0.9373333333333332,684.5
10021,10021,113,0.651858407079646,701.5
10065,10021,111,0.8160360360360359,608.5
10003,10011,110,0.9744545454545456,730.0
10003,10009,107,0.8752336448598131,670.0
10011,10011,102,0.6887254901960784,537.0
10023,10162,100,1.0927,612.5
