In [1]:
from pyspark.sql import SparkSession

We create a Spark session. To monitor, visit <http://localhost:4040/jobs/>.  
Then, we load the data and create a temporary view to work with SQL.

In [2]:
# Start a Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Cyclistic cleaning data")
    .getOrCreate()
)

# Read the raw trip data; the first row holds the column names and
# column types are inferred from the file contents.
df = spark.read.csv(
    "Data/cyclistic/2023_cyclistic_tripdata.csv",
    header=True,
    inferSchema=True,
)

# Expose the DataFrame to Spark SQL under the name `cyclistic_data`.
df.createOrReplaceTempView("cyclistic_data")

24/02/11 22:14:33 WARN Utils: Your hostname, DS-A90101.local resolves to a loopback address: 127.0.0.1; using 192.168.100.217 instead (on interface en0)
24/02/11 22:14:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/11 22:14:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/02/11 22:14:52 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

## Checking columns

### `ride_id`
We count the distinct `ride_id` values and compare the result with the total number of rows. The two numbers match, so there are no repeated ids.

In [3]:
# Total number of rows next to the number of distinct ride ids.
ride_id_sql = """
SELECT
    COUNT(*) AS Rows,
    COUNT(DISTINCT ride_id) AS Unique_IDs
FROM
    cyclistic_data;
"""

ride_id_counts = spark.sql(ride_id_sql)
ride_id_counts.toPandas()

24/02/11 22:14:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:14:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:14:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:14:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:14:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:14:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:14:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:14:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:15:03 WARN RowBasedKeyValueBatch: Calling spill() on

Unnamed: 0,Rows,Unique_IDs
0,5719877,5719877


### `rideable_type`

We can see that there are three unique bicycle types. This column is also clean.

In [4]:
# Number of rides per bicycle type, most frequent type first.
rideable_type_sql = """
SELECT
    rideable_type,
    COUNT(*) AS Rides
FROM
    cyclistic_data
GROUP BY
    rideable_type
ORDER BY
    Rides DESC
"""

rides_by_type = spark.sql(rideable_type_sql)
rides_by_type.toPandas()

                                                                                

Unnamed: 0,rideable_type,Rides
0,electric_bike,2945579
1,classic_bike,2696011
2,docked_bike,78287


### Dates

We can see that in both columns, there are entries from the year 2022, and rides that ended in 2024. Since we are interested in data from 2023, we'll remove these entries.


In [5]:
# Earliest and latest values of the start and end timestamps.
date_range_sql = """
SELECT
    MIN(started_at),
    MAX(started_at),
    MIN(ended_at),
    MAX(ended_at)
FROM
    cyclistic_data;
"""

date_range = spark.sql(date_range_sql)
date_range.toPandas()

                                                                                

Unnamed: 0,min(started_at),max(started_at),min(ended_at),max(ended_at)
0,2022-12-31 18:01:58,2023-12-31 17:59:38,2022-12-31 18:02:41,2024-01-01 17:50:51


#### Removing entries that are not from the year 2023

In [6]:
# Keep only the rides that both started and ended in 2023.
#
# The bounds are half-open (>= 2023-01-01, < 2024-01-01) so that a timestamp
# with fractional seconds inside the last second of the year is still kept;
# a closed upper bound of 23:59:59 would drop it.
# String literals use single quotes, the standard SQL form, so the query does
# not depend on Spark treating double-quoted text as a string.
query = """
SELECT
    *
FROM
    cyclistic_data
WHERE
    started_at >= '2023-01-01 00:00:00' AND
    started_at <  '2024-01-01 00:00:00' AND
    ended_at   >= '2023-01-01 00:00:00' AND
    ended_at   <  '2024-01-01 00:00:00'
ORDER BY
    started_at ASC;
"""

# Register the filtered rows as `cyclistic_data_2` for the next cleaning steps.
df2 = spark.sql(query)
df2.createOrReplaceTempView("cyclistic_data_2")

After removing those entries, our table has 5,718,838 rows.

In [7]:
# Number of rows left in `cyclistic_data_2`.
row_count_sql = """
SELECT
    COUNT(*) AS Rows
FROM
    cyclistic_data_2
"""

row_count = spark.sql(row_count_sql)
row_count.toPandas()

                                                                                

Unnamed: 0,Rows
0,5718838


### Station ids and names

We found two issues:
- There is no information about some stations.
- The naming of the stations looks inconsistent in some cases.

#### Removing NAs

We can see that there are rides where we don't have the start or end station information (id, name, latitude, and longitude).  
For simplicity in the analysis, we assume that the available data is not enough to recover the missing values (i.e. we cannot infer the name of a missing station from its latitude and longitude coordinates or from its station id).

This is the data that we want to remove.

In [8]:
# Select every ride in which at least one start or end station field
# (name, id, latitude, longitude) holds the string "NA".
missing_station_sql = """
SELECT
    *
FROM
    cyclistic_data_2
WHERE
    start_station_name = "NA" OR
    start_station_id = "NA" OR
    start_lat = "NA" OR
    start_lng = "NA" OR
    end_station_name = "NA" OR
    end_station_id = "NA" OR
    end_lat = "NA" OR
    end_lng = "NA"
"""

# Register those rides as the view `aux`.
aux_df = spark.sql(missing_station_sql)
aux_df.createOrReplaceTempView("aux")

Now we remove it from the table and save it into a new one.

In [9]:
# Keep the rides of `cyclistic_data_2` whose ride_id does not appear in `aux`.
drop_missing_sql = """
SELECT
    *
FROM
    cyclistic_data_2
WHERE NOT EXISTS (
    SELECT 1
    FROM aux
    WHERE cyclistic_data_2.ride_id = aux.ride_id
);
"""

# Register the remaining rides as `cyclistic_data_3`.
df3 = spark.sql(drop_missing_sql)
df3.createOrReplaceTempView("cyclistic_data_3")

After removing NA rows, 4,330,969 is the new size of our table.

In [10]:
# Number of rows left in `cyclistic_data_3`.
row_count_sql = """
SELECT
    COUNT(*) AS Rows
FROM
    cyclistic_data_3
"""

row_count = spark.sql(row_count_sql)
row_count.toPandas()

                                                                                

Unnamed: 0,Rows
0,4330969


#### Character length

For a few start stations, the name is the same as the id. We will assume that this data is correct and keep it in our table.

In [11]:
# Per start station name, count the rides whose station name equals the
# station id; show up to 10 names that have at least one such ride.
start_same_sql = """
SELECT
    start_station_name,
    SUM(CASE
        WHEN start_station_name = start_station_id THEN 1
        ELSE 0
    END) AS same_name_and_id
FROM
    cyclistic_data_3
GROUP BY
    start_station_name
HAVING
    same_name_and_id > 0
LIMIT
    10;
"""

start_same = spark.sql(start_same_sql)
start_same.toPandas()

24/02/11 22:16:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:16:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:16:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:16:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:16:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:16:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:16:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:16:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:16:31 WARN RowBasedKeyValueBatch: Calling spill() on

Unnamed: 0,start_station_name,same_name_and_id
0,OH Charging Stx - Test,13
1,410,3


Although some stations have long names, they seem to be valid. Therefore, we'll keep them in our data.

In [12]:
# The (up to) 10 longest distinct start station names with their length.
start_name_len_sql = """
SELECT
    start_station_name,
    LENGTH(start_station_name) AS length
FROM
    cyclistic_data_3
GROUP BY
    start_station_name
ORDER BY
    length DESC
LIMIT
    10;
"""

start_name_len = spark.sql(start_name_len_sql)
start_name_len.toPandas()

24/02/11 22:16:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:16:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:16:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:16:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:16:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:16:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:16:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:16:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                

Unnamed: 0,start_station_name,length
0,Public Rack - Penn Elementary School / KIPP As...,64
1,Public Rack - California Ave & Touhy Ave - mid...,51
2,Public Rack - Brighton Park Branch Chicago Lib...,50
3,Public Rack - Austin Branch Chicago Public Lib...,50
4,Public Rack - Cicero Ave & Cortland St - midblock,49
5,Public Rack - Cicero Ave & Le Moyne St - midblock,49
6,Woodlawn & 103rd - Olive Harvey Vaccination Site,48
7,Public Rack - The Montessori School of Englewood,48
8,Public Rack - Sacramento Blvd & Washington Blvd,47
9,Public Rack - Jonathan Y Scammon Public School,46


In [13]:
# The (up to) 10 longest distinct start station ids with their length.
start_id_len_sql = """
SELECT
    start_station_id,
    LENGTH(start_station_id) AS length
FROM
    cyclistic_data_3
GROUP BY
    start_station_id
ORDER BY
    length DESC
LIMIT
    10;
"""

start_id_len = spark.sql(start_id_len_sql)
start_id_len.toPandas()

24/02/11 22:17:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:17:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:17:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:17:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:17:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:17:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:17:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:17:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                

Unnamed: 0,start_station_id,length
0,Hubbard Bike-checking (LBS-WH-TEST),35
1,OH Charging Stx - Test,22
2,6.21.23 OLD HASTINGS,20
3,chargingstx06,13
4,chargingstx07,13
5,TA1308000046,12
6,KA1504000162,12
7,TA1307000134,12
8,TA1305000022,12
9,KA1504000103,12


The same happens with end station information. We also keep this data.

In [14]:
# Per end station name, count the rides whose station name equals the
# station id; show up to 10 names that have at least one such ride.
end_same_sql = """
SELECT
    end_station_name,
    SUM(CASE
        WHEN end_station_name = end_station_id THEN 1
        ELSE 0
    END) AS same_name_and_id
FROM
    cyclistic_data_3
GROUP BY
    end_station_name
HAVING
    same_name_and_id > 0
LIMIT
    10;
"""

end_same = spark.sql(end_same_sql)
end_same.toPandas()

24/02/11 22:17:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:17:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:17:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:17:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:17:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:17:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:17:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:17:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:17:31 WARN RowBasedKeyValueBatch: Calling spill() on

Unnamed: 0,end_station_name,same_name_and_id
0,410,1
1,OH Charging Stx - Test,13


In [15]:
# The (up to) 10 longest distinct end station names with their length.
end_name_len_sql = """
SELECT
    end_station_name,
    LENGTH(end_station_name) AS length
FROM
    cyclistic_data_3
GROUP BY
    end_station_name
ORDER BY
    length DESC
LIMIT
    10;
"""

end_name_len = spark.sql(end_name_len_sql)
end_name_len.toPandas()

24/02/11 22:17:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:17:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:17:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:17:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:17:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:17:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:17:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:17:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                

Unnamed: 0,end_station_name,length
0,Public Rack - Penn Elementary School / KIPP As...,64
1,Public Rack - California Ave & Touhy Ave - mid...,51
2,Public Rack - Brighton Park Branch Chicago Lib...,50
3,Public Rack - Austin Branch Chicago Public Lib...,50
4,Public Rack - Cicero Ave & Cortland St - midblock,49
5,Public Rack - Cicero Ave & Le Moyne St - midblock,49
6,Woodlawn & 103rd - Olive Harvey Vaccination Site,48
7,Public Rack - The Montessori School of Englewood,48
8,Public Rack - Sacramento Blvd & Washington Blvd,47
9,Public Rack - Jonathan Y Scammon Public School,46


In [16]:
# The (up to) 10 longest distinct end station ids with their length.
end_id_len_sql = """
SELECT
    end_station_id,
    LENGTH(end_station_id) AS length
FROM
    cyclistic_data_3
GROUP BY
    end_station_id
ORDER BY
    length DESC
LIMIT
    10;
"""

end_id_len = spark.sql(end_id_len_sql)
end_id_len.toPandas()

24/02/11 22:18:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:18:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:18:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:18:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:18:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:18:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:18:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/02/11 22:18:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                

Unnamed: 0,end_station_id,length
0,DIVVY CASSETTE REPAIR MOBILE STATION,36
1,Hubbard Bike-checking (LBS-WH-TEST),35
2,2059 Hastings Warehouse Station,31
3,OH Charging Stx - Test,22
4,6.21.23 OLD HASTINGS,20
5,chargingstx06,13
6,chargingstx07,13
7,TA1307000134,12
8,KA1504000162,12
9,TA1308000046,12


### `member_casual`

We can see that our labels are correct for this column. We expect only two.

In [17]:
# Number of rides per rider label in the `cyclistic_data` view.
member_casual_sql = """
SELECT
    member_casual,
    COUNT(ride_id) AS Rides
FROM
    cyclistic_data
GROUP BY
    member_casual;
"""

rides_by_label = spark.sql(member_casual_sql)
rides_by_label.toPandas()

                                                                                

Unnamed: 0,member_casual,Rides
0,casual,2059179
1,member,3660698


Now the data is clean and we can export it.  
It is important to note that the unbalanced class problem is still present, but we'll solve it using R.

To export the data, we need to repartition the data to get a single csv file.

In [18]:
# Write the cleaned rides as CSV with a header row.
# `repartition(1)` puts all rows into one partition so that Spark writes a
# single part file into the output directory.
# `mode="overwrite"` replaces an output directory left over from an earlier,
# interrupted run; with the default mode the write fails if the path exists.
df3.repartition(1).write.csv(
    "Data/cyclistic/2023_cyclistic_tripdata_clean",
    header=True,
    mode="overwrite",
)

                                                                                

In [19]:
# Move the CSV file from the Spark output directory to its final file name.
!mv Data/cyclistic/2023_cyclistic_tripdata_clean/*.csv Data/cyclistic/2023_cyclistic_tripdata_clean.csv

In [20]:
# Delete the Spark output directory together with anything still inside it.
!rm -r Data/cyclistic/2023_cyclistic_tripdata_clean/

In [21]:
# List the data directory to check that the cleaned CSV file is there.
!ls Data/cyclistic/

202301-divvy-tripdata.csv         202308-divvy-tripdata.csv
202302-divvy-tripdata.csv         202309-divvy-tripdata.csv
202303-divvy-tripdata.csv         202310-divvy-tripdata.csv
202304-divvy-tripdata.csv         202311-divvy-tripdata.csv
202305-divvy-tripdata.csv         202312-divvy-tripdata.csv
202306-divvy-tripdata.csv         2023_cyclistic_tripdata.csv
202307-divvy-tripdata.csv         2023_cyclistic_tripdata_clean.csv


In [22]:
# Read the exported CSV back and point the `cyclistic_data` view at it, so
# queries against that view now see the cleaned file.
df = spark.read.csv(
    "Data/cyclistic/2023_cyclistic_tripdata_clean.csv",
    header=True,
    inferSchema=True,
)
df.createOrReplaceTempView("cyclistic_data")

                                                                                

In [23]:
# Number of rows in the `cyclistic_data` view.
row_count_sql = """
SELECT
    COUNT(*) AS Rows
FROM
    cyclistic_data;
"""

row_count = spark.sql(row_count_sql)
row_count.toPandas()

Unnamed: 0,Rows
0,4330969


In [24]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()