# University Project II - Big Data

Author : Ophiase

In [71]:
ENABLE_DOWNLOAD = False
ENABLE_UNZIP = False
ENABLE_COMPUTE_ZIP_TO_PARQUET = False

### Dependencies

In [72]:
import shutil
import os
import requests
import pandas as pd
import re
import pyspark.sql.functions as F
import zipfile
from functools import reduce

In [73]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import matplotlib.pyplot as plt
import pyspark
import logging

logging.getLogger("pyspark").setLevel(logging.ERROR)
# os.environ["PYSPARK_SUBMIT_ARGS"] = "--driver-memory 2g"

import pyspark.pandas as ps
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import when, mean, stddev, skewness, kurtosis, expr, date_format
import pyspark.sql.functions as pf
from pyspark.sql.functions import when, col, lit


import altair as alt
import plotly
import plotly.express as px

import scipy
from scipy.stats import skew, kurtosis

import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [74]:
spark = SparkSession.builder \
    .appName("Big Data Project") \
    .config("spark.driver.memory", "3g") \
    .getOrCreate()

## Download the DATA

In [75]:
# from 2014 to 2023
trip_urls = [
    (year, f"https://s3.amazonaws.com/tripdata/{year}-citibike-tripdata.zip")
    for year in range(2014, 2023 + 1)
]

if not os.path.exists(os.path.join('data')) :
    os.makedirs('data')

zip_files = []

for year, url in trip_urls:
    basename = os.path.join('data', str(year) + "_" + 'citibike_tripdata')
    zip_filename = basename + ".zip"
    csv_filename = basename + ".csv"
    zip_files.append((year, zip_filename, csv_filename))

    if not ENABLE_DOWNLOAD : continue
    print(f'Check {basename} ...')

    response = requests.get(url, stream=True)
    total_size = int(response.headers.get('content-length', 0))
    block_size = 1024
    progress = 0

    if not os.path.exists(zip_filename) and not os.path.exists(csv_filename) :
        with open(zip_filename, 'wb') as f:
            for data in response.iter_content(block_size):
                if data:
                    f.write(data)
                    progress += len(data)
                    print(f'\rDownloaded {progress}/{total_size} bytes', end='')

        print(f'\nDownload complete: {zip_filename}')

print("Finished")


Finished


In [76]:
if ENABLE_UNZIP :
    for (year, zip_filename, csv_filename) in zip_files:
        # if year < 2018: continue # WARNING : DISABLE THIS LINE

        if not zipfile.is_zipfile(zip_filename):
            print("Corrupted zip file.")
            break

        if os.path.exists("tmp"):
            shutil.rmtree("tmp")

        print("Unzip : ", zip_filename)
        with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
            zip_ref.extractall("tmp")
        print("Process ..")

        # find the folder in tmp
        items = os.listdir("tmp")
        for folder in items :
            if not os.path.isdir(os.path.join("tmp", folder)) or \
                folder.startswith("__") :
                continue
            
            # find all the folder in this folder
            sub_folders = os.listdir(os.path.join("tmp", folder))
            for sub_folder in sub_folders :
                if not os.path.isdir(os.path.join("tmp", folder, sub_folder)) or \
                    sub_folder.startswith(".") : 
                    continue
                
                sub_item = os.listdir(os.path.join("tmp", folder, sub_folder))
                for leaf in sub_item :
                    # move the csv inside to data
                    from_path = os.path.join("tmp", folder, sub_folder, leaf)
                    dest_path = os.path.join("data", leaf)
                    if os.path.exists(dest_path) : 
                        os.remove(dest_path)

                    shutil.move(from_path, "data")

    if os.path.exists("tmp"):
        shutil.rmtree("tmp")

To optimize disk usage, we could have unziped one file at a time and convert its content instantaneously to `.parquet`.

## Convert the Data

### Raw Analysis

In [77]:
csv_reader = spark.read.option("header", "true") \
            .option("inferSchema", "true").csv

In [78]:
def find_all_csv():
    all_csv = []
    for item in os.listdir("data"):
        if not item.endswith(".csv") :
            continue
        all_csv.append(item)
    return sorted(all_csv)

all_csv = find_all_csv()

if False: # check column_names.txt
    for item in all_csv:    
        df = csv_reader(os.path.join("data", item))
        print(f"item {item} : {df.columns}")

By looking at the previous code output *(cached in `column_names.txt`)*, \
we notice the following columns between 2014-01 $\to$ 2021-01 (included) :
- `['tripduration', 'starttime', 'stoptime', 'start station id', 'start station name', `\
`'start station latitude', 'start station longitude', 'end station id', 'end station name', `\
`'end station latitude', 'end station longitude', 'bikeid', 'usertype', 'birth year', 'gender']`
    - The naming convention is not exactly the same between : `201610-citibike-tripdata_1.csv` $\to$ `201703-citibike-tripdata.csv_1.csv` : \
    `['Trip Duration', 'Start Time', 'Stop Time', 'Start Station ID',` \
    `'Start Station Name', 'Start Station Latitude', 'Start Station Longitude',` \
    `'End Station ID', 'End Station Name', 'End Station Latitude',` \
    `'End Station Longitude', 'Bike ID', 'User Type', 'Birth Year', 'Gender']`

The columns change between 2021-02 $\to$ 2023-12 (included) :
- `['ride_id', 'rideable_type', 'started_at', 'ended_at', 'start_station_name', `\
`'start_station_id', 'end_station_name', 'end_station_id', 'start_lat', `\
`'start_lng', 'end_lat', 'end_lng', 'member_casual']`

We decide the following matching (**O.** as before 2021-02, **N.** as after 2021-02):

- `O.'tripduration'` as function : `stoptime` - `startime`
    - Renamed as `trip_duration`

- `N.'started_at'` $\leftarrow$ `O.'starttime'`
- `N.'ended_at'` $\leftarrow$ `O.'stoptime'` 
- `N.'start_station_id'` $\leftarrow$ `O.'start station id'`
    - type : string
- `N.'start_station_name'` $\leftarrow$ `O.'start station name'`
- `N.'start_lat'` $\leftarrow$ `O.'start station latitude'`
- `N.'start_lng'` $\leftarrow$ `O.'start station longitude'`
- `N.'end_station_id'` $\leftarrow$ `O.'end station id'`
    - type : string
- `N.'end_station_name'` $\leftarrow$ `O.'end station name'`
- `N.'end_lat'` $\leftarrow$ `O.'end station latitude'`
- `N.'end_lng'` $\leftarrow$ `O.'end station longitude'`
`
- `N.'ride_id'` $\leftarrow$ `O.'bikeid'` (format is not the same)
    - Both type of ID can registered as string

- `N.'member_casual'` $\leftarrow$ `O.'usertype'` (format is not the same)
    - Mapping : `O.Subscriber`, `O.Customer` $\to$ `N.member`, `N.casual`

- `O.'birth year` : (None) for elements of N
    - Renamed as `birth_year`
- `O.'gender'` : (None) for elements of N
- `N.'rideable_type'` : (None) for elements of O

- We will also add a binary column `old_format` to indicate if the data comes from `O` or `N` as defined above.

In [79]:
col_mapping_1 = {
    'tripduration': 'trip_duration',
    'usertype': 'member_casual',
    'birth year': 'birth_year',

    'starttime': 'started_at',
    'stoptime': 'ended_at',
    'start station id': 'start_station_id',
    'start station name': 'start_station_name',
    'start station latitude': 'start_lat',
    'start station longitude': 'start_lng',
    'end station id': 'end_station_id',
    'end station name': 'end_station_name',
    'end station latitude': 'end_lat',
    'end station longitude': 'end_lng',
    'bikeid': 'ride_id',
}

col_mapping_2 = {
    'Trip Duration': 'tripduration',
    'Start Time': 'starttime',
    'Stop Time': 'stoptime',
    
    'Start Station ID': 'start station id',
    'Start Station Name': 'start station name',
    'Start Station Latitude': 'start station latitude',
    'Start Station Longitude': 'start station longitude',

    'End Station ID': 'end station id',
    'End Station Name' : 'end station name',
    'End Station Latitude' : 'end station latitude',
    'End Station Longitude' : 'end station longitude',
    
    'Bike ID' : 'bikeid',
    'User Type' : 'usertype',
    'Birth Year' : 'birth year',
    'Gender' : 'gender'
}


In [80]:
def check_unique_values(df, column):
    return df.select(column).dropDuplicates().rdd.map(lambda row: row[0]).collect()

In [81]:
def fast_check():
    df_o = spark.read.csv(os.path.join("data", all_csv[0]), header=True, inferSchema=True)
    df_o = df_o.select(
        [col(old_col).alias(col_mapping_1.get(old_col, old_col)) for old_col in df_o.columns]
        )

    df_n = spark.read.csv(os.path.join("data", all_csv[-1]), header=True, inferSchema=True)
    
    print(check_unique_values(df_o, "member_casual"))
    print(check_unique_values(df_o, "gender"))
    print(check_unique_values(df_n, "rideable_type"))

    df_o.printSchema()
    df_n.printSchema()
    
    # df.show()

fast_check()

                                                                                

['Subscriber', 'Customer']
[1, 2, 0]
['electric_bike', 'classic_bike']
root
 |-- trip_duration: integer (nullable = true)
 |-- started_at: timestamp (nullable = true)
 |-- ended_at: timestamp (nullable = true)
 |-- start_station_id: integer (nullable = true)
 |-- start_station_name: string (nullable = true)
 |-- start_lat: double (nullable = true)
 |-- start_lng: double (nullable = true)
 |-- end_station_id: integer (nullable = true)
 |-- end_station_name: string (nullable = true)
 |-- end_lat: double (nullable = true)
 |-- end_lng: double (nullable = true)
 |-- ride_id: integer (nullable = true)
 |-- member_casual: string (nullable = true)
 |-- birth_year: string (nullable = true)
 |-- gender: integer (nullable = true)

root
 |-- ride_id: string (nullable = true)
 |-- rideable_type: string (nullable = true)
 |-- started_at: timestamp (nullable = true)
 |-- ended_at: timestamp (nullable = true)
 |-- start_station_name: string (nullable = true)
 |-- start_station_id: string (nullable = 

### File format Selection : Parquet

<img src="resources/file-formats.png" alt="Drawing" style="width: 400px;"/>

Our [course](https://stephane-v-boucheron.fr/slides/tbd/slides06_file-formats.html#/title-slide) on Big Data file formats.

Parquet suits our needs for the project:

- Suitable for laptop execution
- 37GB dataset size manageable with Parquet's compression
- Supports splitability for processing subsets of data on minimal configuration
- Compatible with Apache Spark

### CSV $\to$ Parquet

In [82]:
if not os.path.exists(os.path.join('computed')) :
    os.makedirs('computed')

In [83]:
factorial_columns = ["member_casual", "gender", "rideable_type"]

In [84]:
# Lower Case O. part

# add missing columns
def csv_o_process(df):
    df = df.select(
        [col(old_col).alias(col_mapping_1.get(old_col, old_col)) for old_col in df.columns])

    df = df.withColumn("birth_year", 
                       when(col("birth_year") == r"\N", None)
                       .otherwise(col("birth_year"))
                       .cast("integer")
                       )

    df = df.withColumn("start_station_id", col("start_station_id").cast("string"))\
            .withColumn("end_station_id", col("end_station_id").cast("string")) \
            .withColumn("ended_at", col("ended_at").cast("timestamp")) \
            .withColumn("started_at", col("started_at").cast("timestamp")) \
            .withColumn("ride_id", col("ride_id").cast("string"))

    df = df.withColumn("member_casual",
                        when(col("member_casual") == "Subscriber", lit("member")) \
                        .otherwise(lit("casual")))
    
    df = df.withColumn("rideable_type", lit(None).cast('string'))
    df = df.withColumn("old_format", lit(True))
    
    df = df.select(*sorted(df.columns))

    # df.write.mode("append").parquet(parquet_file)
    return df

def csv_to_parquet_part_1(csv_file) :
    df = spark.read.csv(csv_file, header=True, inferSchema=True)
    return csv_o_process(df)

In [85]:
# Upper case O. part

def csv_to_parquet_part_2(csv_file) :
    df = spark.read.csv(csv_file, header=True, inferSchema=True)
    
    df = df.select(
        [col(old_col).alias(col_mapping_2.get(old_col, old_col)) for old_col in df.columns])
    
    return csv_o_process(df)

In [86]:
# N. Part

def csv_to_parquet_part_3(csv_file) :
    df = spark.read.csv(csv_file, header=True, inferSchema=True)

    df = df.withColumn("trip_duration", lit(None).cast("integer")) # TODO
    df = df.withColumn("old_format", lit(False))
    df = df.withColumn("birth_year", lit(None).cast("integer"))
    df = df.withColumn("gender", lit(0)) # TODO verify the default value
    df = df.withColumn("old_format", lit(False))

    df = df.withColumn("start_station_id", col("start_station_id").cast("string"))\
            .withColumn("end_station_id", col("end_station_id").cast("string")) \
            .withColumn("ended_at", col("ended_at").cast("timestamp")) \
            .withColumn("started_at", col("started_at").cast("timestamp")) \
            .withColumn("ride_id", col("ride_id").cast("string"))

    df = df.select(*sorted(df.columns))

    # df.write.mode("append").parquet(parquet_file)
    return df


In [87]:
# Scheduler

schedule = [
    ("201401-citibike-tripdata_1.csv", 1),
    ("201610-citibike-tripdata_1.csv", 2),
    ("201704-citibike-tripdata.csv_1.csv", 1),
    ("202102-citibike-tripdata_1.csv", 3)
]

# The file size are always below 180M
# We can safely load simultaneously 20 files from memory to .parquet
CSV_PER_PARQUET = 10

def reduce_df_array(df_array):
    # if not df_array:
    #     raise ValueError("Empty array.")

    # schema = df_array[0].schema
    # for df in df_array:
    #     if df.schema != schema:
    #         raise ValueError("Incompatibles schema.")

    return reduce(lambda df1, df2: df1.union(df2), df_array)

def process_csv_files(csv_files, schedule, parquet_dir="computed", max_size_gb=1, start_with=None):
    schedule_pointer = 0
    parquet_index = 0

    df_buffer = []

    # ignore = start_with is not None
    for index, csv_file in enumerate(csv_files):
        # if ignore:
        #     if csv_file != start_with : continue
        #     ignore = False
        #     parquet_index = index // CSV_PER_PARQUET

        csv_path = os.path.join("data", csv_file)
        
        print(f"[] {csv_file}")
        if (schedule_pointer < len(schedule) - 1) and (csv_file == schedule[schedule_pointer + 1][0]) :
            schedule_pointer += 1
            print(f"{csv_file} | {schedule_pointer} : {schedule[schedule_pointer]}")

        schedule_mode = schedule[schedule_pointer][1]
        target_function = [csv_to_parquet_part_1, csv_to_parquet_part_2, csv_to_parquet_part_3][schedule_mode - 1]

        df_buffer.append(target_function(csv_path))

        if (index + 1) % CSV_PER_PARQUET == 0 :
            parquet_path = os.path.join(
                parquet_dir, 'precompiled', f'part_{parquet_index:04d}.parquet')
            print(f"Writing ({index}/{len(csv_files)}) : {parquet_dir}")
            print(f"From {csv_files[index-CSV_PER_PARQUET+1]} to {csv_files[index]}")

            df = reduce_df_array(df_buffer)
            df_buffer = []
            parquet_index += 1

            df.write.mode("overwrite").parquet(parquet_path)

if ENABLE_COMPUTE_ZIP_TO_PARQUET:
    process_csv_files(all_csv, schedule) #, start_with="202011-citibike-tripdata_2.csv")

[] 201401-citibike-tripdata_1.csv


                                                                                

[] 201402-citibike-tripdata_1.csv
[] 201403-citibike-tripdata_1.csv


                                                                                

[] 201404-citibike-tripdata_1.csv


                                                                                

[] 201405-citibike-tripdata_1.csv


                                                                                

[] 201406-citibike-tripdata_1.csv


                                                                                

[] 201407-citibike-tripdata_1.csv


                                                                                

[] 201408-citibike-tripdata_1.csv


                                                                                

[] 201409-citibike-tripdata_1.csv


                                                                                

[] 201410-citibike-tripdata_1.csv


                                                                                

Writing (9/256) : computed
From 201401-citibike-tripdata_1.csv to 201410-citibike-tripdata_1.csv


                                                                                

[] 201411-citibike-tripdata_1.csv


                                                                                

[] 201412-citibike-tripdata_1.csv


                                                                                

[] 201501-citibike-tripdata_1.csv
[] 201502-citibike-tripdata_1.csv
[] 201503-citibike-tripdata_1.csv
[] 201504-citibike-tripdata_1.csv


                                                                                

[] 201505-citibike-tripdata_1.csv


                                                                                

[] 201506-citibike-tripdata_1.csv


                                                                                

[] 201507-citibike-tripdata_1.csv


                                                                                

[] 201507-citibike-tripdata_2.csv
Writing (19/256) : computed
From 201411-citibike-tripdata_1.csv to 201507-citibike-tripdata_2.csv


                                                                                

[] 201508-citibike-tripdata_1.csv


                                                                                

[] 201508-citibike-tripdata_2.csv
[] 201509-citibike-tripdata_1.csv


                                                                                

[] 201509-citibike-tripdata_2.csv


                                                                                

[] 201510-citibike-tripdata_1.csv


                                                                                

[] 201510-citibike-tripdata_2.csv
[] 201511-citibike-tripdata_1.csv


                                                                                

[] 201512-citibike-tripdata_1.csv


                                                                                

[] 201601-citibike-tripdata_1.csv


                                                                                

[] 201602-citibike-tripdata_1.csv


                                                                                

Writing (29/256) : computed
From 201508-citibike-tripdata_1.csv to 201602-citibike-tripdata_1.csv


                                                                                

[] 201603-citibike-tripdata_1.csv


                                                                                

[] 201604-citibike-tripdata_1.csv


                                                                                

[] 201604-citibike-tripdata_2.csv
[] 201605-citibike-tripdata_1.csv


                                                                                

[] 201605-citibike-tripdata_2.csv
[] 201606-citibike-tripdata_1.csv


                                                                                

[] 201606-citibike-tripdata_2.csv


                                                                                

[] 201607-citibike-tripdata_1.csv


                                                                                

[] 201607-citibike-tripdata_2.csv
[] 201608-citibike-tripdata_1.csv


                                                                                

Writing (39/256) : computed
From 201603-citibike-tripdata_1.csv to 201608-citibike-tripdata_1.csv


                                                                                

[] 201608-citibike-tripdata_2.csv


                                                                                

[] 201609-citibike-tripdata_1.csv


                                                                                

[] 201609-citibike-tripdata_2.csv


                                                                                

[] 201610-citibike-tripdata_1.csv
201610-citibike-tripdata_1.csv | 1 : ('201610-citibike-tripdata_1.csv', 2)


                                                                                

[] 201610-citibike-tripdata_2.csv


                                                                                

[] 201611-citibike-tripdata_1.csv


                                                                                

[] 201611-citibike-tripdata_2.csv
[] 201612-citibike-tripdata_1.csv


                                                                                

[] 201701-citibike-tripdata.csv_1.csv


                                                                                

[] 201702-citibike-tripdata.csv_1.csv


                                                                                

Writing (49/256) : computed
From 201608-citibike-tripdata_2.csv to 201702-citibike-tripdata.csv_1.csv


                                                                                

[] 201703-citibike-tripdata.csv_1.csv


                                                                                

[] 201704-citibike-tripdata.csv_1.csv
201704-citibike-tripdata.csv_1.csv | 2 : ('201704-citibike-tripdata.csv_1.csv', 1)


                                                                                

[] 201704-citibike-tripdata.csv_2.csv


                                                                                

[] 201705-citibike-tripdata.csv_1.csv


                                                                                

[] 201705-citibike-tripdata.csv_2.csv


                                                                                

[] 201706-citibike-tripdata.csv_1.csv


                                                                                

[] 201706-citibike-tripdata.csv_2.csv


                                                                                

[] 201707-citibike-tripdata.csv_1.csv


                                                                                

[] 201707-citibike-tripdata.csv_2.csv


                                                                                

[] 201708-citibike-tripdata.csv_1.csv


                                                                                

Writing (59/256) : computed
From 201703-citibike-tripdata.csv_1.csv to 201708-citibike-tripdata.csv_1.csv


                                                                                

[] 201708-citibike-tripdata.csv_2.csv


                                                                                

[] 201709-citibike-tripdata.csv_1.csv


                                                                                

[] 201709-citibike-tripdata.csv_2.csv


                                                                                

[] 201710-citibike-tripdata.csv_1.csv


                                                                                

[] 201710-citibike-tripdata.csv_2.csv


                                                                                

[] 201711-citibike-tripdata.csv_1.csv


                                                                                

[] 201711-citibike-tripdata.csv_2.csv


                                                                                

[] 201712-citibike-tripdata.csv_1.csv


                                                                                

[] 201801-citibike-tripdata_1.csv


                                                                                

[] 201802-citibike-tripdata_1.csv


                                                                                

Writing (69/256) : computed
From 201708-citibike-tripdata.csv_2.csv to 201802-citibike-tripdata_1.csv


                                                                                

[] 201803-citibike-tripdata_1.csv


                                                                                

[] 201804-citibike-tripdata_1.csv


                                                                                

[] 201804-citibike-tripdata_2.csv


                                                                                

[] 201805-citibike-tripdata_1.csv


                                                                                

[] 201805-citibike-tripdata_2.csv


                                                                                

[] 201806-citibike-tripdata_1.csv


                                                                                

[] 201806-citibike-tripdata_2.csv


                                                                                

[] 201807-citibike-tripdata_1.csv


                                                                                

[] 201807-citibike-tripdata_2.csv


                                                                                

[] 201808-citibike-tripdata_1.csv


                                                                                

Writing (79/256) : computed
From 201803-citibike-tripdata_1.csv to 201808-citibike-tripdata_1.csv


                                                                                

[] 201808-citibike-tripdata_2.csv


                                                                                

[] 201809-citibike-tripdata_1.csv


                                                                                

[] 201809-citibike-tripdata_2.csv


                                                                                

[] 201810-citibike-tripdata_1.csv


                                                                                

[] 201810-citibike-tripdata_2.csv


                                                                                

[] 201811-citibike-tripdata_1.csv


                                                                                

[] 201811-citibike-tripdata_2.csv
[] 201812-citibike-tripdata_1.csv


                                                                                

[] 201812-citibike-tripdata_2.csv
[] 201901-citibike-tripdata_1.csv


                                                                                

Writing (89/256) : computed
From 201808-citibike-tripdata_2.csv to 201901-citibike-tripdata_1.csv


                                                                                

[] 201902-citibike-tripdata_1.csv


                                                                                

[] 201903-citibike-tripdata_1.csv


                                                                                

[] 201903-citibike-tripdata_2.csv


                                                                                

[] 201904-citibike-tripdata_1.csv


                                                                                

[] 201904-citibike-tripdata_2.csv


                                                                                

[] 201905-citibike-tripdata_1.csv


                                                                                

[] 201905-citibike-tripdata_2.csv


                                                                                

[] 201906-citibike-tripdata_1.csv


                                                                                

[] 201906-citibike-tripdata_2.csv


                                                                                

[] 201906-citibike-tripdata_3.csv
Writing (99/256) : computed
From 201902-citibike-tripdata_1.csv to 201906-citibike-tripdata_3.csv


                                                                                

[] 201907-citibike-tripdata_1.csv


                                                                                

[] 201907-citibike-tripdata_2.csv


                                                                                

[] 201907-citibike-tripdata_3.csv
[] 201908-citibike-tripdata_1.csv


                                                                                

[] 201908-citibike-tripdata_2.csv


                                                                                

[] 201908-citibike-tripdata_3.csv


                                                                                

[] 201909-citibike-tripdata_1.csv


                                                                                

[] 201909-citibike-tripdata_2.csv


                                                                                

[] 201909-citibike-tripdata_3.csv


                                                                                

[] 201910-citibike-tripdata_1.csv


                                                                                

Writing (109/256) : computed
From 201907-citibike-tripdata_1.csv to 201910-citibike-tripdata_1.csv


                                                                                

[] 201910-citibike-tripdata_2.csv


                                                                                

[] 201910-citibike-tripdata_3.csv
[] 201911-citibike-tripdata_1.csv


                                                                                

[] 201911-citibike-tripdata_2.csv


                                                                                

[] 201912-citibike-tripdata_1.csv


                                                                                

[] 202001-citibike-tripdata_1.csv


                                                                                

[] 202001-citibike-tripdata_2.csv


                                                                                

[] 202002-citibike-tripdata_1.csv


                                                                                

[] 202002-citibike-tripdata_2.csv
[] 202003-citibike-tripdata_1.csv


                                                                                

Writing (119/256) : computed
From 201910-citibike-tripdata_2.csv to 202003-citibike-tripdata_1.csv


                                                                                

[] 202003-citibike-tripdata_2.csv
[] 202004-citibike-tripdata_1.csv


                                                                                

[] 202005-citibike-tripdata_1.csv


                                                                                

[] 202005-citibike-tripdata_2.csv


                                                                                

[] 202006-citibike-tripdata_1.csv


                                                                                

[] 202006-citibike-tripdata_2.csv


                                                                                

[] 202007-citibike-tripdata_1.csv


                                                                                

[] 202007-citibike-tripdata_2.csv


                                                                                

[] 202007-citibike-tripdata_3.csv
[] 202008-citibike-tripdata_1.csv


                                                                                

Writing (129/256) : computed
From 202003-citibike-tripdata_2.csv to 202008-citibike-tripdata_1.csv


                                                                                

[] 202008-citibike-tripdata_2.csv


                                                                                

[] 202008-citibike-tripdata_3.csv


                                                                                

[] 202009-citibike-tripdata_1.csv


                                                                                

[] 202009-citibike-tripdata_2.csv


                                                                                

[] 202009-citibike-tripdata_3.csv


                                                                                

[] 202010-citibike-tripdata_1.csv


                                                                                

[] 202010-citibike-tripdata_2.csv


                                                                                

[] 202010-citibike-tripdata_3.csv


                                                                                

[] 202011-citibike-tripdata_1.csv


                                                                                

[] 202011-citibike-tripdata_2.csv


                                                                                

Writing (139/256) : computed
From 202008-citibike-tripdata_2.csv to 202011-citibike-tripdata_2.csv


                                                                                

[] 202012-citibike-tripdata_1.csv


                                                                                

[] 202012-citibike-tripdata_2.csv
[] 202101-citibike-tripdata_1.csv


                                                                                

[] 202101-citibike-tripdata_2.csv
[] 202102-citibike-tripdata_1.csv
202102-citibike-tripdata_1.csv | 3 : ('202102-citibike-tripdata_1.csv', 3)


                                                                                

[] 202103-citibike-tripdata_1.csv


                                                                                

[] 202103-citibike-tripdata_2.csv


                                                                                

[] 202104-citibike-tripdata_1.csv


                                                                                

[] 202104-citibike-tripdata_2.csv


                                                                                

[] 202104-citibike-tripdata_3.csv
Writing (149/256) : computed
From 202012-citibike-tripdata_1.csv to 202104-citibike-tripdata_3.csv


                                                                                

[] 202105-citibike-tripdata_1.csv


                                                                                

[] 202105-citibike-tripdata_2.csv


                                                                                

[] 202105-citibike-tripdata_3.csv


                                                                                

[] 202106-citibike-tripdata_1.csv


                                                                                

[] 202106-citibike-tripdata_2.csv


                                                                                

[] 202106-citibike-tripdata_3.csv


                                                                                

[] 202106-citibike-tripdata_4.csv
[] 202107-citibike-tripdata_1.csv


                                                                                

[] 202107-citibike-tripdata_2.csv


                                                                                

[] 202107-citibike-tripdata_3.csv


                                                                                

Writing (159/256) : computed
From 202105-citibike-tripdata_1.csv to 202107-citibike-tripdata_3.csv


                                                                                

[] 202107-citibike-tripdata_4.csv
[] 202108-citibike-tripdata_1.csv


                                                                                

[] 202108-citibike-tripdata_2.csv


                                                                                

[] 202108-citibike-tripdata_3.csv


                                                                                

[] 202108-citibike-tripdata_4.csv
[] 202109-citibike-tripdata_1.csv


                                                                                

[] 202109-citibike-tripdata_2.csv


                                                                                

[] 202109-citibike-tripdata_3.csv


                                                                                

[] 202109-citibike-tripdata_4.csv
[] 202110-citibike-tripdata_1.csv


                                                                                

Writing (169/256) : computed
From 202107-citibike-tripdata_4.csv to 202110-citibike-tripdata_1.csv


                                                                                

[] 202110-citibike-tripdata_2.csv


                                                                                

[] 202110-citibike-tripdata_3.csv


                                                                                

[] 202110-citibike-tripdata_4.csv
[] 202111-citibike-tripdata_1.csv


                                                                                

[] 202111-citibike-tripdata_2.csv


                                                                                

[] 202111-citibike-tripdata_3.csv
[] 202112-citibike-tripdata_1.csv


                                                                                

[] 202112-citibike-tripdata_2.csv


                                                                                

[] 202201-citibike-tripdata_1.csv


                                                                                

[] 202201-citibike-tripdata_2.csv
Writing (179/256) : computed
From 202110-citibike-tripdata_2.csv to 202201-citibike-tripdata_2.csv


                                                                                

[] 202202-citibike-tripdata_1.csv


                                                                                

[] 202202-citibike-tripdata_2.csv


                                                                                

[] 202203-citibike-tripdata_1.csv


                                                                                

[] 202203-citibike-tripdata_2.csv


                                                                                

[] 202204-citibike-tripdata_1.csv


                                                                                

[] 202204-citibike-tripdata_2.csv


                                                                                

[] 202204-citibike-tripdata_3.csv


                                                                                

[] 202205-citibike-tripdata_1.csv


                                                                                

[] 202205-citibike-tripdata_2.csv


                                                                                

[] 202205-citibike-tripdata_3.csv


                                                                                

Writing (189/256) : computed
From 202202-citibike-tripdata_1.csv to 202205-citibike-tripdata_3.csv


                                                                                

[] 202206-citbike-tripdata_1.csv


                                                                                

[] 202206-citbike-tripdata_2.csv


                                                                                

[] 202206-citbike-tripdata_3.csv


                                                                                

[] 202206-citbike-tripdata_4.csv


                                                                                

[] 202207-citbike-tripdata_1.csv


                                                                                

[] 202207-citbike-tripdata_2.csv


                                                                                

[] 202207-citbike-tripdata_3.csv


                                                                                

[] 202207-citbike-tripdata_4.csv


                                                                                

[] 202208-citibike-tripdata_1.csv


                                                                                

[] 202208-citibike-tripdata_2.csv


                                                                                

Writing (199/256) : computed
From 202206-citbike-tripdata_1.csv to 202208-citibike-tripdata_2.csv


                                                                                

[] 202208-citibike-tripdata_3.csv


                                                                                

[] 202208-citibike-tripdata_4.csv


                                                                                

[] 202209-citibike-tripdata_1.csv


                                                                                

[] 202209-citibike-tripdata_2.csv


                                                                                

[] 202209-citibike-tripdata_3.csv


                                                                                

[] 202209-citibike-tripdata_4.csv


                                                                                

[] 202210-citibike-tripdata_1.csv


                                                                                

[] 202210-citibike-tripdata_2.csv


                                                                                

[] 202210-citibike-tripdata_3.csv


                                                                                

[] 202210-citibike-tripdata_4.csv
Writing (209/256) : computed
From 202208-citibike-tripdata_3.csv to 202210-citibike-tripdata_4.csv


                                                                                

[] 202211-citibike-tripdata_1.csv


                                                                                

[] 202211-citibike-tripdata_2.csv


                                                                                

[] 202211-citibike-tripdata_3.csv


                                                                                

[] 202212-citibike-tripdata_1.csv


                                                                                

[] 202212-citibike-tripdata_2.csv


                                                                                

[] 202301-citibike-tripdata_1.csv


                                                                                

[] 202301-citibike-tripdata_2.csv


                                                                                

[] 202302-citibike-tripdata_1.csv


                                                                                

[] 202302-citibike-tripdata_2.csv


                                                                                

[] 202303-citibike-tripdata_1.csv


                                                                                

Writing (219/256) : computed
From 202211-citibike-tripdata_1.csv to 202303-citibike-tripdata_1.csv


                                                                                

[] 202303-citibike-tripdata_2.csv


                                                                                

[] 202303-citibike-tripdata_3.csv
[] 202304-citibike-tripdata_1.csv


                                                                                

[] 202304-citibike-tripdata_2.csv


                                                                                

[] 202304-citibike-tripdata_3.csv


                                                                                

[] 202305-citibike-tripdata_1.csv


                                                                                

[] 202305-citibike-tripdata_2.csv


                                                                                

[] 202305-citibike-tripdata_3.csv


                                                                                

[] 202305-citibike-tripdata_4.csv


                                                                                

[] 202306-citibike-tripdata_1.csv


                                                                                

Writing (229/256) : computed
From 202303-citibike-tripdata_2.csv to 202306-citibike-tripdata_1.csv


                                                                                

[] 202306-citibike-tripdata_2.csv


                                                                                

[] 202306-citibike-tripdata_3.csv


                                                                                

[] 202306-citibike-tripdata_4.csv


                                                                                

[] 202307-citibike-tripdata_1.csv


                                                                                

[] 202307-citibike-tripdata_2.csv


                                                                                

[] 202307-citibike-tripdata_3.csv


                                                                                

[] 202307-citibike-tripdata_4.csv


                                                                                

[] 202308-citibike-tripdata_1.csv


                                                                                

[] 202308-citibike-tripdata_2.csv


                                                                                

[] 202308-citibike-tripdata_3.csv


                                                                                

Writing (239/256) : computed
From 202306-citibike-tripdata_2.csv to 202308-citibike-tripdata_3.csv


                                                                                

[] 202308-citibike-tripdata_4.csv


                                                                                

[] 202308-citibike-tripdata_5.csv
[] 202309-citibike-tripdata_1.csv


                                                                                

[] 202309-citibike-tripdata_2.csv


                                                                                

[] 202309-citibike-tripdata_3.csv


                                                                                

[] 202309-citibike-tripdata_4.csv


                                                                                

[] 202310-citibike-tripdata_1.csv


                                                                                

[] 202310-citibike-tripdata_2.csv


                                                                                

[] 202310-citibike-tripdata_3.csv


                                                                                

[] 202310-citibike-tripdata_4.csv


                                                                                

Writing (249/256) : computed
From 202308-citibike-tripdata_4.csv to 202310-citibike-tripdata_4.csv


                                                                                

[] 202311-citibike-tripdata_1.csv


                                                                                

[] 202311-citibike-tripdata_2.csv


                                                                                

[] 202311-citibike-tripdata_3.csv


                                                                                

[] 202312-citibike-tripdata_1.csv


                                                                                

[] 202312-citibike-tripdata_2.csv


                                                                                

[] 202312-citibike-tripdata_3.csv


                                                                                

### Verify Parquet is working

In [89]:
# df = spark.read.parquet(os.path.join("computed", "precompiled", "part_0023.parquet"))
df = spark.read.parquet(os.path.join("computed", "precompiled", "*.parquet"))
print(df.count())
df.show()

                                                                                

195252960
+----------+-----------------+------------------+--------------+--------------------+-------------------+------+-------------+----------+----------------+-------------+-----------------+------------------+----------------+--------------------+-------------------+-------------+
|birth_year|          end_lat|           end_lng|end_station_id|    end_station_name|           ended_at|gender|member_casual|old_format|         ride_id|rideable_type|        start_lat|         start_lng|start_station_id|  start_station_name|         started_at|trip_duration|
+----------+-----------------+------------------+--------------+--------------------+-------------------+------+-------------+----------+----------------+-------------+-----------------+------------------+----------------+--------------------+-------------------+-------------+
|      NULL|40.72970805644994| -73.9865979552269|       5746.02|     E 10 St & 2 Ave|2023-09-03 10:24:16|     0|       member|     false|B0A0F1DEFA4B72FC|el

## Analysis

## Monitoring

## Spatial Informations