Extract data

## Extract data
In this notebook we extract the data from the CSV files stored in DBFS on Databricks and write it out in Delta format.

#### Riders

In [0]:
# List the files under the bike-sharing data directory in DBFS
display(dbutils.fs.ls("dbfs:/FileStore/bikeSharing/data"))

path,name,size,modificationTime
dbfs:/FileStore/bikeSharing/data/payments.csv,payments.csv,57666115,1687058985000
dbfs:/FileStore/bikeSharing/data/riders.csv,riders.csv,5594949,1687058944000
dbfs:/FileStore/bikeSharing/data/stations.csv,stations.csv,49552,1687058944000
dbfs:/FileStore/bikeSharing/data/trips.csv,trips.csv,440125504,1687059197000


In [0]:
# Read the riders CSV into a DataFrame.
# The file is read with header=false (the first line is treated as data), so
# the column names are attached positionally with toDF(); the column types
# are inferred by Spark.
riders_df = (
    spark.read
    .option("delimiter", ",")
    .option("header", "false")
    .option("inferSchema", "true")
    .csv("dbfs:/FileStore/bikeSharing/data/riders.csv")
    .toDF(
        "rider_id",
        "first_name",
        "last_name",
        "address",
        "birthday",
        "account_start_date",
        "account_end_date",
        "is_member",
    )
)

# Preview the first two rows as a sanity check
riders_df.show(2)

+--------+----------+---------+-------------------+----------+------------------+----------------+---------+
|rider_id|first_name|last_name|            address|  birthday|account_start_date|account_end_date|is_member|
+--------+----------+---------+-------------------+----------+------------------+----------------+---------+
|    1000|     Diana|    Clark|1200 Alyssa Squares|1989-02-13|        2019-04-23|            null|     true|
|    1001|  Jennifer|    Smith|    397 Diana Ferry|1976-08-10|        2019-11-01|      2020-09-01|     true|
+--------+----------+---------+-------------------+----------+------------------+----------------+---------+
only showing top 2 rows



In [0]:
# Print the riders DataFrame schema (column names, types, nullability)
riders_df.printSchema()

root
 |-- rider_id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- birthday: date (nullable = true)
 |-- account_start_date: date (nullable = true)
 |-- account_end_date: date (nullable = true)
 |-- is_member: boolean (nullable = true)



In [0]:
# Persist the riders DataFrame in Delta format, replacing any data already
# stored at the target path.
# NOTE(review): "byke_sharing" looks like a typo of "bike_sharing"; the path
# is kept unchanged because other notebooks may read from it - confirm.
(
    riders_df.write
    .format("delta")
    .mode("overwrite")
    .save("/delta/byke_sharing/riders")
)

#### Stations

In [0]:
# Read the stations CSV into a DataFrame.
# The file is read with header=false (the first line is treated as data), so
# the column names are attached positionally with toDF(); the column types
# are inferred by Spark.
stations_df = (
    spark.read
    .option("delimiter", ",")
    .option("header", "false")
    .option("inferSchema", "true")
    .csv("dbfs:/FileStore/bikeSharing/data/stations.csv")
    .toDF(
        "station_id",
        "name",
        "latitude",
        "longitude",
    )
)

# Preview the first two rows as a sanity check
stations_df.show(2)

+------------+--------------------+-----------------+------------------+
|  station_id|                name|         latitude|         longitude|
+------------+--------------------+-----------------+------------------+
|         525|Glenwood Ave & To...|        42.012701|-87.66605799999999|
|KA1503000012|  Clark St & Lake St|41.88579466666667|-87.63110066666668|
+------------+--------------------+-----------------+------------------+
only showing top 2 rows



In [0]:
# Print the stations DataFrame schema (column names, types, nullability)
stations_df.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)



In [0]:
# Persist the stations DataFrame in Delta format, replacing any data already
# stored at the target path.
# NOTE(review): "byke_sharing" looks like a typo of "bike_sharing"; the path
# is kept unchanged because other notebooks may read from it - confirm.
(
    stations_df.write
    .format("delta")
    .mode("overwrite")
    .save("/delta/byke_sharing/stations")
)

#### Payments

In [0]:
# Read the payments CSV into a DataFrame.
# The file is read with header=false (the first line is treated as data), so
# the column names are attached positionally with toDF(); the column types
# are inferred by Spark.
payments_df = (
    spark.read
    .option("delimiter", ",")
    .option("header", "false")
    .option("inferSchema", "true")
    .csv("dbfs:/FileStore/bikeSharing/data/payments.csv")
    .toDF(
        "payment_id",
        "date",
        "amount",
        "rider_id",
    )
)

# Preview the first two rows as a sanity check
payments_df.show(2)

+----------+----------+------+--------+
|payment_id|      date|amount|rider_id|
+----------+----------+------+--------+
|         1|2019-05-01|   9.0|    1000|
|         2|2019-06-01|   9.0|    1000|
+----------+----------+------+--------+
only showing top 2 rows



In [0]:
# Print the payments DataFrame schema (column names, types, nullability)
payments_df.printSchema()

root
 |-- payment_id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- amount: double (nullable = true)
 |-- rider_id: integer (nullable = true)



In [0]:
# Persist the payments DataFrame in Delta format, replacing any data already
# stored at the target path.
# NOTE(review): "byke_sharing" looks like a typo of "bike_sharing"; the path
# is kept unchanged because other notebooks may read from it - confirm.
(
    payments_df.write
    .format("delta")
    .mode("overwrite")
    .save("/delta/byke_sharing/payments")
)

#### Trips

In [0]:
# Read the trips CSV into a DataFrame.
# The file is read with header=false (the first line is treated as data), so
# the column names are attached positionally with toDF(); the column types
# are inferred by Spark.
trips_df = (
    spark.read
    .option("delimiter", ",")
    .option("header", "false")
    .option("inferSchema", "true")
    .csv("dbfs:/FileStore/bikeSharing/data/trips.csv")
    .toDF(
        "trip_id",
        "rideable_type",
        "start_at",
        "ended_at",
        "start_station_id",
        "end_station_id",
        "rider_id",
    )
)

# Preview the first two rows as a sanity check
trips_df.show(2)

+----------------+-------------+-------------------+-------------------+----------------+--------------+--------+
|         trip_id|rideable_type|           start_at|           ended_at|start_station_id|end_station_id|rider_id|
+----------------+-------------+-------------------+-------------------+----------------+--------------+--------+
|89E7AA6C29227EFF| classic_bike|2021-02-12 16:14:56|2021-02-12 16:21:43|             525|           660|   71934|
|0FEFDE2603568365| classic_bike|2021-02-14 17:52:38|2021-02-14 18:12:09|             525|         16806|   47854|
+----------------+-------------+-------------------+-------------------+----------------+--------------+--------+
only showing top 2 rows



In [0]:
# Print the trips DataFrame schema (column names, types, nullability)
trips_df.printSchema()

root
 |-- trip_id: string (nullable = true)
 |-- rideable_type: string (nullable = true)
 |-- start_at: timestamp (nullable = true)
 |-- ended_at: timestamp (nullable = true)
 |-- start_station_id: string (nullable = true)
 |-- end_station_id: string (nullable = true)
 |-- rider_id: integer (nullable = true)



In [0]:
# Persist the trips DataFrame in Delta format, replacing any data already
# stored at the target path.
# NOTE(review): "byke_sharing" looks like a typo of "bike_sharing"; the path
# is kept unchanged because other notebooks may read from it - confirm.
(
    trips_df.write
    .format("delta")
    .mode("overwrite")
    .save("/delta/byke_sharing/trips")
)