# Project I - Big Data

Author: Ophiase

In [None]:
# Notebook-wide switches for the slow, side-effecting data preparation steps.
# Both default to False so re-running the notebook does not hit the network
# or rewrite files under data/.

# When True, the download cell fetches the yearly zip archives.
ENABLE_DOWNLOAD = False
# When True, the unzip cell extracts the archives and moves their files to data/.
ENABLE_UNZIP = False

### Dependencies

In [None]:
import shutil
import os
import requests
import pandas as pd
import re
import pyspark.sql.functions as F
import zipfile

In [None]:
import warnings
# Silence FutureWarning globally for the rest of the notebook.
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import matplotlib.pyplot as plt
import pyspark
import logging

# Only show errors from the "pyspark" logger.
logging.getLogger("pyspark").setLevel(logging.ERROR)

import pyspark.pandas as ps
from pyspark.sql import SparkSession
from pyspark.sql import Row
# NOTE(review): `kurtosis` imported here is rebound further down by
# `from scipy.stats import skew, kurtosis`, so after this cell the bare name
# `kurtosis` refers to the SciPy function. Use `pf.kurtosis` for the Spark
# column function.
from pyspark.sql.functions import when, mean, stddev, skewness, kurtosis, expr, date_format
# NOTE(review): same module as the `F` alias imported in the previous cell.
import pyspark.sql.functions as pf

import altair as alt
import plotly
import plotly.express as px

import scipy
# Rebinds `kurtosis` (see the note on the pyspark.sql.functions import above).
from scipy.stats import skew, kurtosis

import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [None]:
# Reuse the active Spark session if one exists, otherwise create one with the
# default configuration. `spark` is the entry point used by the cells below.
spark = SparkSession.builder.getOrCreate()

## Download the Data

In [None]:
# Download the yearly trip archives (2014 to 2023 inclusive) into data/.
#
# Side effects:
#   - creates the data/ directory if it is missing;
#   - defines `zip_files`, a list of (year, zip path, csv path) tuples that the
#     unzip cell iterates over. It is filled in even when ENABLE_DOWNLOAD is
#     False so that the unzip cell can still run on already-downloaded files;
#   - when ENABLE_DOWNLOAD is True, downloads every archive for which neither
#     the zip nor the csv path exists yet.
trip_urls = [
    (year, f"https://s3.amazonaws.com/tripdata/{year}-citibike-tripdata.zip")
    for year in range(2014, 2023 + 1)
]

os.makedirs('data', exist_ok=True)

# 1 MiB chunks: large enough that the progress line is not reprinted for
# every kilobyte of a multi-megabyte archive.
block_size = 1024 * 1024

zip_files = []

for year, url in trip_urls:
    basename = os.path.join('data', str(year) + "_" + 'citibike_tripdata')
    zip_filename = basename + ".zip"
    csv_filename = basename + ".csv"
    zip_files.append((year, zip_filename, csv_filename))

    if not ENABLE_DOWNLOAD:
        continue
    print(f'Check {basename} ...')

    # Decide before touching the network: nothing is requested for archives
    # that are already present.
    if os.path.exists(zip_filename) or os.path.exists(csv_filename):
        continue

    # Write to a side file first so an interrupted download never leaves a
    # truncated archive under the final name (which would be skipped as
    # "already downloaded" on the next run).
    partial_filename = zip_filename + ".part"

    # The context manager releases the connection even if the transfer fails.
    with requests.get(url, stream=True) as response:
        # Fail loudly on 4xx/5xx instead of saving an error page as a zip.
        response.raise_for_status()
        total_size = int(response.headers.get('content-length', 0))
        progress = 0

        with open(partial_filename, 'wb') as f:
            for data in response.iter_content(block_size):
                if data:
                    f.write(data)
                    progress += len(data)
                    print(f'\rDownloaded {progress}/{total_size} bytes', end='')

    os.replace(partial_filename, zip_filename)
    print(f'\nDownload complete: {zip_filename}')

print("Finished")


In [None]:
# Extract every archive listed in `zip_files` and move the extracted files
# into data/.
#
# Each archive is extracted into the scratch directory "tmp", which is then
# walked exactly two levels deep: tmp/<folder>/<sub_folder>/<leaf>. Every
# <leaf> entry is moved to data/<leaf>, replacing an existing entry of the
# same name. Entries at other depths are ignored.
#
# Processing stops at the first path that is not a valid zip file (this also
# covers a missing file). The scratch directory is removed when the cell
# finishes, including when extraction or a move raises.
if ENABLE_UNZIP:
    try:
        for (year, zip_filename, csv_filename) in zip_files:
            if not zipfile.is_zipfile(zip_filename):
                print("Corrupted zip file.")
                break

            # Start every archive from an empty scratch directory so files of
            # the previous archive cannot leak into this pass.
            if os.path.exists("tmp"):
                shutil.rmtree("tmp")

            print("Unzip : ", zip_filename)
            with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
                zip_ref.extractall("tmp")
            print("Process ..")

            for folder in os.listdir("tmp"):
                folder_path = os.path.join("tmp", folder)
                # Skip plain files and "__"-prefixed folders (presumably
                # archive metadata such as __MACOSX -- TODO confirm).
                if not os.path.isdir(folder_path) or folder.startswith("__"):
                    continue

                for sub_folder in os.listdir(folder_path):
                    sub_folder_path = os.path.join(folder_path, sub_folder)
                    # Skip plain files and hidden folders.
                    if not os.path.isdir(sub_folder_path) or \
                            sub_folder.startswith("."):
                        continue

                    for leaf in os.listdir(sub_folder_path):
                        from_path = os.path.join(sub_folder_path, leaf)
                        dest_path = os.path.join("data", leaf)

                        # Replace whatever is already at the destination;
                        # os.remove alone fails on directories.
                        if os.path.isdir(dest_path):
                            shutil.rmtree(dest_path)
                        elif os.path.exists(dest_path):
                            os.remove(dest_path)

                        # Explicit destination: never renames the file to
                        # "data" itself if that directory is missing.
                        shutil.move(from_path, dest_path)
    finally:
        if os.path.exists("tmp"):
            shutil.rmtree("tmp")

## Convert the Data

In [None]:
# Load one extracted CSV file into a Spark DataFrame. The first row is used
# as the header and column types are inferred from the data.
df = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("data/201401-citibike-tripdata_1.csv")
)

In [None]:
# Preview the first rows through the pandas API on Spark.
# `pandas_api()` is the replacement for the deprecated `to_pandas_on_spark()`.
df.pandas_api().head()