# University Project II - Big Data

Author : Ophiase

In [None]:
# Notebook switches for the expensive data-preparation steps.
# ENABLE_DOWNLOAD gates the archive download loop below.
ENABLE_DOWNLOAD = False
# ENABLE_UNZIP gates the extraction of the archives into the "data" folder.
ENABLE_UNZIP = False

### Dependencies

In [None]:
import shutil
import os
import requests
import pandas as pd
import re
import pyspark.sql.functions as F
import zipfile

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import matplotlib.pyplot as plt
import pyspark
import logging

logging.getLogger("pyspark").setLevel(logging.ERROR)

import pyspark.pandas as ps
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import when, mean, stddev, skewness, kurtosis, expr, date_format
import pyspark.sql.functions as pf
from pyspark.sql.functions import when, col, lit


import altair as alt
import plotly
import plotly.express as px

import scipy
from scipy.stats import skew, kurtosis

import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [None]:
# Session shared by every cell below; getOrCreate() reuses an already
# running session instead of starting a second one.
spark = SparkSession.builder.getOrCreate()

## Download the DATA

In [None]:
# Yearly archives, from 2014 to 2023 (inclusive).
trip_urls = [
    (year, f"https://s3.amazonaws.com/tripdata/{year}-citibike-tripdata.zip")
    for year in range(2014, 2023 + 1)
]

# exist_ok avoids the race of a separate "exists?" test followed by a create.
os.makedirs('data', exist_ok=True)

# One (year, zip path, csv path) entry per archive. The list is filled even
# when downloads are disabled because the unzip cell iterates over it.
zip_files = []

for year, url in trip_urls:
    basename = os.path.join('data', str(year) + "_" + 'citibike_tripdata')
    zip_filename = basename + ".zip"
    csv_filename = basename + ".csv"
    zip_files.append((year, zip_filename, csv_filename))

    if not ENABLE_DOWNLOAD : continue
    print(f'Check {basename} ...')

    # Decide whether anything must be fetched BEFORE opening a connection, so
    # that already-present archives cost no network round trip.
    if os.path.exists(zip_filename) or os.path.exists(csv_filename):
        continue

    block_size = 1024
    progress = 0
    # Download under a temporary name: an interrupted transfer must not leave
    # a truncated file that the check above would later treat as complete.
    partial_filename = zip_filename + ".part"

    # The context manager releases the streamed connection; the timeout keeps
    # a stalled server from blocking the notebook forever.
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly instead of saving an HTTP error page as a ".zip".
        response.raise_for_status()
        total_size = int(response.headers.get('content-length', 0))

        with open(partial_filename, 'wb') as f:
            for data in response.iter_content(block_size):
                if data:
                    f.write(data)
                    progress += len(data)
                    print(f'\rDownloaded {progress}/{total_size} bytes', end='')

    # Publish the archive only once it has been written completely.
    os.replace(partial_filename, zip_filename)
    print(f'\nDownload complete: {zip_filename}')

print("Finished")


In [None]:
if ENABLE_UNZIP:
    for year, zip_filename, csv_filename in zip_files:
        # if year < 2018: continue # WARNING : DISABLE THIS LINE

        # Stop at the first archive that is not a readable zip file.
        if not zipfile.is_zipfile(zip_filename):
            print("Corrupted zip file.")
            break

        # Always extract into a fresh scratch directory.
        if os.path.exists("tmp"):
            shutil.rmtree("tmp")

        print("Unzip : ", zip_filename)
        with zipfile.ZipFile(zip_filename, 'r') as archive:
            archive.extractall("tmp")
        print("Process ..")

        # Walk tmp/<folder>/<sub_folder>/<leaf> and move every leaf to "data".
        for folder in os.listdir("tmp"):
            folder_path = os.path.join("tmp", folder)
            # Ignore plain files and "__"-prefixed entries.
            if folder.startswith("__") or not os.path.isdir(folder_path):
                continue

            for sub_folder in os.listdir(folder_path):
                sub_folder_path = os.path.join(folder_path, sub_folder)
                # Ignore plain files and hidden entries.
                if sub_folder.startswith(".") or not os.path.isdir(sub_folder_path):
                    continue

                for leaf in os.listdir(sub_folder_path):
                    # Drop a previous copy so the move cannot collide with it.
                    dest_path = os.path.join("data", leaf)
                    if os.path.exists(dest_path):
                        os.remove(dest_path)

                    shutil.move(os.path.join(sub_folder_path, leaf), "data")

    # Remove the scratch directory once every archive has been handled.
    if os.path.exists("tmp"):
        shutil.rmtree("tmp")

## Convert the Data

### Raw Analysis

In [None]:
# Pre-configured CSV loader: call it with a path to get a DataFrame whose
# first line is used as header and whose column types are inferred.
csv_reader = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv
)

In [None]:
def find_all_csv():
    """Return the sorted names of the ``.csv`` entries found in ``data``."""
    return sorted(
        entry for entry in os.listdir("data") if entry.endswith(".csv")
    )

# Sorted list of every CSV file name available in "data".
all_csv = find_all_csv()

# Disabled on purpose: prints the header of every file (slow). Its output is
# what the notebook refers to as column_names.txt.
if False: # check column_names.txt
    for csv_name in all_csv:
        columns = csv_reader(os.path.join("data", csv_name)).columns
        print(f"item {csv_name} : {columns}")

By looking at the previous code output *(cached in `column_names.txt`)*, \
we notice the following columns between 2014-01 $\to$ 2021-01 (included) :
- `['tripduration', 'starttime', 'stoptime', 'start station id', 'start station name', `\
`'start station latitude', 'start station longitude', 'end station id', 'end station name', `\
`'end station latitude', 'end station longitude', 'bikeid', 'usertype', 'birth year', 'gender']`
    - The naming convention is not exactly the same between : `201610-citibike-tripdata_1.csv` $\to$ `201703-citibike-tripdata.csv_1.csv` : \
    `['Trip Duration', 'Start Time', 'Stop Time', 'Start Station ID',` \
    `'Start Station Name', 'Start Station Latitude', 'Start Station Longitude',` \
    `'End Station ID', 'End Station Name', 'End Station Latitude',` \
    `'End Station Longitude', 'Bike ID', 'User Type', 'Birth Year', 'Gender']`

The columns change between 2021-02 $\to$ 2023-12 (included) :
- `['ride_id', 'rideable_type', 'started_at', 'ended_at', 'start_station_name', `\
`'start_station_id', 'end_station_name', 'end_station_id', 'start_lat', `\
`'start_lng', 'end_lat', 'end_lng', 'member_casual']`

We decide on the following mapping (**O.** for files before 2021-02, **N.** for files from 2021-02 onwards):

- `O.'tripduration'` as a function: `stoptime` - `starttime`
    - Renamed as `trip_duration`

- `N.'started_at'` $\leftarrow$ `O.'starttime'`
- `N.'ended_at'` $\leftarrow$ `O.'stoptime'` 
- `N.'start_station_id'` $\leftarrow$ `O.'start station id'`
- `N.'start_station_name'` $\leftarrow$ `O.'start station name'`
- `N.'start_lat'` $\leftarrow$ `O.'start station latitude'`
- `N.'start_lng'` $\leftarrow$ `O.'start station longitude'`
- `N.'end_station_id'` $\leftarrow$ `O.'end station id'`
- `N.'end_station_name'` $\leftarrow$ `O.'end station name'`
- `N.'end_lat'` $\leftarrow$ `O.'end station latitude'`
- `N.'end_lng'` $\leftarrow$ `O.'end station longitude'`

- `N.'ride_id'` $\leftarrow$ `O.'bikeid'` (format is not the same)
    - Both types of ID can be stored as strings

- `N.'member_casual'` $\leftarrow$ `O.'usertype'` (format is not the same)
    - Mapping : `O.Subscriber`, `O.Customer` $\to$ `N.member`, `N.casual`

- `O.'birth year'` : (None) for elements of N
    - Renamed as `birth_year`
- `O.'gender'` : (None) for elements of N
- `N.'rideable_type'` : (None) for elements of O

- We will also add a binary column `old_format` to indicate if the data comes from `O` or `N` as defined above.

In [None]:
def check_unique_values(df, column):
    """Collect the distinct values found in ``column`` of ``df`` as a list."""
    distinct_rows = df.select(column).dropDuplicates()
    # Each row holds a single field; unwrap it before collecting.
    return distinct_rows.rdd.map(lambda row: row[0]).collect()

In [None]:
def fast_check():
    """Print the distinct values of the categorical columns.

    The first CSV file (sorted order) is read as a sample of the old format
    and the last one as a sample of the new format. Nothing is returned; the
    three value lists are printed.
    """
    if not all_csv:
        # Nothing to sample: avoid an IndexError on all_csv[0] / all_csv[-1].
        print("No CSV file found in 'data'.")
        return

    df_o = spark.read.csv(os.path.join("data", all_csv[0]), header=True, inferSchema=True)
    df_n = spark.read.csv(os.path.join("data", all_csv[-1]), header=True, inferSchema=True)

    # The old-format column is read under its raw name ("usertype") so this
    # cell does not depend on the renaming table, which is only defined in a
    # later cell. The printed values are the same as for the renamed
    # "member_casual" column.
    print(check_unique_values(df_o, "usertype"))
    print(check_unique_values(df_o, "gender"))
    print(check_unique_values(df_n, "rideable_type"))

fast_check()

### File format Selection : Parquet

<img src="resources/file-formats.png" alt="Drawing" style="width: 400px;"/>

Our [course](https://stephane-v-boucheron.fr/slides/tbd/slides06_file-formats.html#/title-slide) on Big Data file formats.

Parquet suits our needs for the project:

- Suitable for laptop execution
- 37GB dataset size manageable with Parquet's compression
- Supports splittability for processing subsets of data on minimal configuration
- Compatible with Apache Spark

### CSV $\to$ Parquet

In [None]:
# Output folder for the converted data. exist_ok makes the call idempotent
# without a separate (racy) existence check.
os.makedirs('computed', exist_ok=True)

In [None]:
# Names of the columns whose distinct values fast_check() prints
# (presumably the categorical / "factor" columns — TODO confirm intended use;
# the list is not referenced elsewhere in the visible part of the notebook).
factorial_columns = ["member_casual", "gender", "rideable_type"]

In [None]:
# Lower Case O. part

# Old-format (lower-case) header -> unified column name. The targets reuse the
# new-format names, so the longitude columns must be "start_lng" / "end_lng";
# any other spelling would create extra columns next to the new-format ones.
col_mapping_1 = {
    'tripduration': 'trip_duration',
    'usertype': 'member_casual',
    'birth year': 'birth_year',

    'starttime': 'started_at',
    'stoptime': 'ended_at',
    'start station id': 'start_station_id',
    'start station name': 'start_station_name',
    'start station latitude': 'start_lat',
    'start station longitude': 'start_lng',
    'end station id': 'end_station_id',
    'end station name': 'end_station_name',
    'end station latitude': 'end_lat',
    'end station longitude': 'end_lng',
    'bikeid': 'ride_id',
}

# add missing columns
def csv_o_process(df, parquet_file):
    """Normalise an old-format DataFrame and append it to a Parquet dataset.

    Steps: rename the columns through ``col_mapping_1``, translate the user
    type to the new vocabulary, add the columns that only exist in the new
    format, sort the columns by name and append the result to
    ``parquet_file``.

    Args:
        df: DataFrame read from an old-format CSV with lower-case headers.
        parquet_file: path of the Parquet dataset to append to.
    """
    df = df.select(
        [col(old_col).alias(col_mapping_1.get(old_col, old_col)) for old_col in df.columns])

    # "Subscriber" becomes "member"; every other value (including null)
    # becomes "casual".
    df = df.withColumn(
        "member_casual",
        when(col("member_casual") == "Subscriber", lit("member"))
        .otherwise(lit("casual")))

    # The notebook's mapping stores both kinds of ride/bike identifier as
    # strings, so the old identifier is converted explicitly.
    if "ride_id" in df.columns:
        df = df.withColumn("ride_id", col("ride_id").cast("string"))

    # A bare lit(None) has NullType, which the Parquet writer rejects; give
    # the placeholder column a concrete type.
    df = df.withColumn("rideable_type", lit(None).cast("string"))
    df = df.withColumn("old_format", lit(True))

    # Same (alphabetical) column order for every appended file.
    df = df.select(*sorted(df.columns))

    df.write.mode("append").parquet(parquet_file)

def csv_to_parquet_part_1(csv_file, parquet_file) :
    """Convert an old-format CSV whose header is already lower-case.

    The file is read with its header line and inferred column types, then
    handed to ``csv_o_process`` together with the target Parquet path.
    """
    csv_o_process(
        spark.read.csv(csv_file, header=True, inferSchema=True),
        parquet_file,
    )

In [None]:
# Upper case O. part

# Title-case old-format header -> lower-case old-format header, so that these
# files can afterwards go through the same renaming as the lower-case ones.
col_mapping_2 = dict([
    ('Trip Duration', 'tripduration'),
    ('Start Time', 'starttime'),
    ('Stop Time', 'stoptime'),

    ('Start Station ID', 'start station id'),
    ('Start Station Name', 'start station name'),
    ('Start Station Latitude', 'start station latitude'),
    ('Start Station Longitude', 'start station longitude'),

    ('End Station ID', 'end station id'),
    ('End Station Name', 'end station name'),
    ('End Station Latitude', 'end station latitude'),
    ('End Station Longitude', 'end station longitude'),

    ('Bike ID', 'bikeid'),
    ('User Type', 'usertype'),
    ('Birth Year', 'birth year'),
    ('Gender', 'gender'),
])

def csv_to_parquet_part_2(csv_file, parquet_file) :
    """Convert an old-format CSV whose header uses title-case names.

    The columns are first renamed to their lower-case spelling through
    ``col_mapping_2`` (unknown names are kept as they are); the result is then
    processed by ``csv_o_process``.
    """
    raw = spark.read.csv(csv_file, header=True, inferSchema=True)

    lower_cased = [
        col(name).alias(col_mapping_2.get(name, name)) for name in raw.columns
    ]

    csv_o_process(raw.select(lower_cased), parquet_file)

In [None]:
# N. Part
def csv_to_parquet_part_3(csv_file, parquet_file) :
    """Convert a new-format CSV file and append it to a Parquet dataset.

    The columns that only exist in the old format are added so that both
    formats share the same set of columns, the columns are sorted by name and
    the result is appended to ``parquet_file``.

    Args:
        csv_file: path of a new-format CSV (with ``started_at`` / ``ended_at``).
        parquet_file: path of the Parquet dataset to append to.
    """
    df = spark.read.csv(csv_file, header=True, inferSchema=True)

    # Trip duration is defined in this notebook as end time minus start time.
    # Casting a timestamp to long yields epoch seconds, so the difference is
    # in whole seconds.
    # NOTE(review): assumes the old 'tripduration' column is also expressed in
    # seconds and stored as an integer — confirm against the old files.
    df = df.withColumn(
        "trip_duration",
        (col("ended_at").cast("timestamp").cast("long")
         - col("started_at").cast("timestamp").cast("long")).cast("int"))

    # A bare lit(None) has NullType, which the Parquet writer rejects; the
    # placeholder needs a concrete type.
    # NOTE(review): "int" assumes the old 'birth year' column is inferred as an
    # integer — confirm.
    df = df.withColumn("birth_year", lit(None).cast("int"))
    df = df.withColumn("gender", lit(0)) # TODO verify the default value
    df = df.withColumn("old_format", lit(False))

    # Same (alphabetical) column order for every appended file.
    df = df.select(*sorted(df.columns))

    df.write.mode("append").parquet(parquet_file)


In [None]:
# Scheduler

# Each entry is (file name, mode): starting at that file, the converter with
# that number is used (1 = lower-case old headers, 2 = title-case old headers,
# 3 = new format) until the file of the next entry is reached.
schedule = [
    ("201401-citibike-tripdata_1.csv", 1),
    ("201610-citibike-tripdata_1.csv", 2),
    ("201704-citibike-tripdata.csv_1.csv", 1),
    ("202102-citibike-tripdata_1.csv", 3)
]

def process_csv_files(csv_files, schedule, parquet_dir="computed", max_size_gb=2):
    """Convert every CSV file with the converter selected by ``schedule``.

    Args:
        csv_files: CSV file names (relative to ``data``), in processing order.
        schedule: list of ``(file name, mode)`` pairs. The first entry gives
            the initial mode; each following entry switches the mode when its
            file name is reached. Mode 1, 2 and 3 select
            ``csv_to_parquet_part_1``, ``_part_2`` and ``_part_3``.
        parquet_dir: folder that receives the ``citibike.parquet`` dataset.
        max_size_gb: currently unused; all files are appended to one dataset.
            TODO: split the output into several datasets of bounded size.
    """
    converters = (csv_to_parquet_part_1, csv_to_parquet_part_2, csv_to_parquet_part_3)
    # Loop-invariant: every file is appended to the same dataset.
    parquet_path = os.path.join(parquet_dir, 'citibike.parquet')

    schedule_pointer = 0
    for csv_file in csv_files:
        csv_path = os.path.join("data", csv_file)

        # Advance to the next schedule entry when its file is reached. The
        # comparison uses the file name of the entry (not the whole tuple),
        # and the bounds check keeps the last entry active until the end.
        next_pointer = schedule_pointer + 1
        if next_pointer < len(schedule) and csv_file == schedule[next_pointer][0]:
            schedule_pointer = next_pointer

        schedule_mode = schedule[schedule_pointer][1]
        target_function = converters[schedule_mode - 1]

        target_function(csv_path, parquet_path)

process_csv_files(all_csv, schedule)

### Verify Parquet is working

In [None]:
# Read back the converted dataset. The path helper lives in os.path; there is
# no os.join.
df = spark.read.parquet(os.path.join("computed", "citibike.parquet"))

## Analysis

## Monitoring

## Spatial Information