## Part 0: Setup
In this part we set up the basic environment and the variables needed for this project.

In [1]:
import os
import re
import bs4
import math
import requests
import warnings
import numpy as np
import pandas as pd
import keplergl as kg
from scipy import stats
import sqlalchemy as db
import geopandas as gpd
import matplotlib.pyplot as plt

# NOTE(review): this suppresses every warning for the rest of the session, not just
# noisy ones, so library warnings raised by later cells will not be shown.
warnings.filterwarnings("ignore")

In [3]:
# ---- Project-wide constants ----

# Input data locations
TAXI_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"  # page listing the taxi trip files
UBER_CSV = "uber_rides_sample.csv"
WEATHER_CSV = [f"weather-{year}.csv" for year in range(2009, 2016)]  # one file per year, 2009 through 2015
ZONE_PATH = "taxi_zones.shp"

# Bounding box used to keep only trips inside New York:
# ((min latitude, min longitude), (max latitude, max longitude))
NY_COORDS = (
    (40.560445, -74.242330),
    (40.908524, -73.717047),
)

# Database connection URL and the file holding the schema
DATABASE = "sqlite:///project.db"
SCHEMA_FILE = "schema.sql"

In [4]:
# Load the taxi-zone polygons and attach the centroid of each zone as
# longitude/latitude columns (EPSG:4326), used later to place trips that only
# carry a zone ID.
Taxi_zone = gpd.read_file(ZONE_PATH)

# Compute the centroids in the CRS the file ships with and only then reproject the
# resulting points: geopandas flags centroids computed directly on geographic
# (lat/lon) coordinates as likely incorrect.
# NOTE(review): presumably the shapefile's native CRS is a projected one -- confirm.
zone_centroids = Taxi_zone.centroid.to_crs(4326)

Taxi_zone = Taxi_zone.to_crs(4326)
Taxi_zone['longitude'] = zone_centroids.x
Taxi_zone['latitude'] = zone_centroids.y

## Part 1: Data Preprocessing
### Yellow Taxi trip data: Downloading, Cleaning, Sampling

In [5]:
def get_taxi_html() -> bytes:
    """Fetch the page at ``TAXI_URL`` and return its raw body.

    Returns:
        The response body as bytes.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so an
            error page is never parsed as if it were the trip-record listing.
        requests.Timeout: if the server does not respond within the timeout.
    """
    # requests has no default timeout; without one a stalled connection blocks forever.
    response = requests.get(TAXI_URL, timeout=30)
    response.raise_for_status()
    return response.content

In [6]:
def find_taxi_parquet_links() -> list:
    """Collect the yellow-taxi file links for 2009-01 through 2015-06.

    Parses the HTML returned by ``get_taxi_html`` and keeps every ``<a href>``
    whose target matches one of the wanted months.

    Returns:
        The matching hrefs in page order, stripped of surrounding whitespace and
        without duplicates.
    """
    pattern = re.compile(r"yellow_tripdata_2009|yellow_tripdata_201[0-4]|yellow_tripdata_2015-0[1-6]")
    soup = bs4.BeautifulSoup(get_taxi_html(), 'html.parser')

    links = []
    seen = set()
    for anchor in soup.find_all("a", href=True):
        # Stray whitespace in an href would otherwise leak into the file name
        # derived from the URL downstream.
        link_text = anchor.get("href").strip()
        if not pattern.search(link_text):
            continue
        # A month linked twice on the page must be processed only once,
        # otherwise it would be sampled twice.
        if link_text in seen:
            continue
        seen.add(link_text)
        links.append(link_text)
    return links

In [7]:
def _zone_centroid_lookup() -> pd.core.frame.DataFrame:
    """Return the zone centroids (``latitude``/``longitude``) keyed by location ID."""
    zones = Taxi_zone
    # Series.map looks values up in the *index* of the mapping Series. The default
    # index of the zone table is the 0-based row number, which is not the zone's
    # location ID, so re-key the table by its ID column when it is available.
    # NOTE(review): assumes the shapefile exposes the ID as "LocationID" -- confirm.
    # Without that column the row index is used, as before.
    if "LocationID" in zones.columns:
        # map() needs a unique index; keep the first row should an ID be repeated.
        zones = zones.drop_duplicates(subset="LocationID").set_index("LocationID")
    return zones[["latitude", "longitude"]]


def monthly_taxi_data_download_clean_sample(
    url: str, sample_size: int = 2564, random_state=None
) -> pd.core.frame.DataFrame:
    """Download one month of yellow-taxi data, clean it and return a random sample.

    Steps: download the parquet file if it is not already on disk, load it, delete
    the file, derive coordinates from zone IDs where only IDs are given, normalize
    column names, drop invalid rows, keep the needed columns, keep trips inside the
    ``NY_COORDS`` bounding box, and sample.

    Args:
        url: Link to a monthly parquet file; the file name is the last path segment.
        sample_size: Number of trips to sample. The default comes from
            200000 Uber data points / 78 months ~ 2564 per month.
        random_state: Forwarded to ``DataFrame.sample``; pass a seed for a
            reproducible sample. ``None`` keeps the sampling unseeded.

    Returns:
        A frame with the columns ``pickup_datetime``, ``pickup_latitude``,
        ``pickup_longitude``, ``dropoff_latitude``, ``dropoff_longitude`` and
        ``tip_amount``, holding at most ``sample_size`` rows (fewer when the
        cleaned month has fewer rows).

    Raises:
        requests.HTTPError: if the download answers with an error status.
    """
    url = url.strip()
    parquet_name = url.split("/")[-1]

    # download if it doesn't exist
    if not os.path.exists(parquet_name):
        print(f"Downloading parquet for {parquet_name[16:23]}.")
        # Stream to a temporary name and rename once complete, so an interrupted or
        # failed download never leaves a truncated file that looks like a cached one.
        partial_name = parquet_name + ".part"
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(partial_name, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        os.replace(partial_name, parquet_name)

    # load data from parquet file
    data = pd.read_parquet(parquet_name)
    print(f"Cleaning data for {parquet_name[16:23]}.")

    # To avoid taking up too much space, each file is deleted right after it is read.
    # TODO: remove this deletion before the final submission.
    os.remove(parquet_name)
    print(f"Parquet for {parquet_name[16:23]} is removed.")

    # looking up the latitude and longitude for some months where only location IDs are given for pickups and dropoffs
    # keep NaNs if exists
    if "PULocationID" in data.columns:
        centroids = _zone_centroid_lookup()
        for prefix, id_column in (("pickup", "PULocationID"), ("dropoff", "DOLocationID")):
            for axis in ("latitude", "longitude"):
                data[f"{prefix}_{axis}"] = data[id_column].map(centroids[axis], na_action="ignore")

    # normalize column names
    rename_dict = {
        "VendorID" : "vendor_id",
        "tpep_pickup_datetime" : "pickup_datetime",
        "tpep_dropoff_datetime" : "dropoff_datetime",
        "RatecodeID" : "rate_code",
        "Trip_Pickup_DateTime" : "pickup_datetime",
        "Trip_Dropoff_DateTime" : "dropoff_datetime",
        "Start_Lon" : "pickup_longitude",
        "Start_Lat" : "pickup_latitude",
        "End_Lon" : "dropoff_longitude",
        "End_Lat" : "dropoff_latitude",
        "Fare_Amt" : "fare_amount",
        "Tip_Amt" : "tip_amount",
        "Tolls_Amt" : "tolls_amount",
        "Total_Amt" : "total_amount"
    }
    data = data.rename(columns=rename_dict)

    coord_columns = ["pickup_latitude", "pickup_longitude", "dropoff_latitude", "dropoff_longitude"]

    # remove the trips whose location IDs are not valid (no coordinates could be found)
    data = data.dropna(subset=coord_columns)

    # remove invalid data points
    data = data[data["total_amount"] > 0]

    # remove unnecessary columns and only keep the columns needed;
    # copy so the assignments below act on an independent frame, not on a slice
    data = data[["pickup_datetime", *coord_columns, "tip_amount"]].copy()

    # normalize and use appropriate column types for the respective data
    data["pickup_datetime"] = pd.to_datetime(data["pickup_datetime"])
    data = data.astype({column: "float64" for column in [*coord_columns, "tip_amount"]})

    # remove trips that start and/or end outside of NY (bounds are inclusive)
    (lat_min, lon_min), (lat_max, lon_max) = NY_COORDS
    inside_ny = (
        data["pickup_latitude"].between(lat_min, lat_max)
        & data["pickup_longitude"].between(lon_min, lon_max)
        & data["dropoff_latitude"].between(lat_min, lat_max)
        & data["dropoff_longitude"].between(lon_min, lon_max)
    )
    data = data[inside_ny]

    # Sampling
    # Never ask for more rows than are left after cleaning; sample() would raise otherwise.
    data = data.sample(n=min(sample_size, len(data)), random_state=random_state)

    return data


### Yellow Taxi trip data: Filling (Distance)

We calculate the distance between pickup location and dropoff location using the Haversine Formula:

![](https://user-images.githubusercontent.com/2789198/27240436-e9a459da-52d4-11e7-8f84-f96d0b312859.png)

where $\lambda$ and $\phi$ are the `longitude` and `latitude` of the two locations (converted to radians), respectively, and $r$ is the radius of the Earth (6371 km is used here, so the distance is in kilometres).

In [8]:
def calculate_distance(pu_coord: pd.core.frame.DataFrame, do_coord: pd.core.frame.DataFrame) -> pd.core.series.Series:
    """Haversine distance between pickup and dropoff points.

    Args:
        pu_coord: Frame with ``pickup_longitude`` and ``pickup_latitude`` in degrees.
        do_coord: Frame with ``dropoff_longitude`` and ``dropoff_latitude`` in degrees.
            Rows are paired with ``pu_coord`` by index label.

    Returns:
        A float64 Series of distances in kilometres (Earth radius taken as 6371 km),
        carrying the index of the inputs.
    """
    # Take the average earth radius (km) as r
    r = 6371

    # numpy ufuncs work on whole columns at once instead of calling math.* per row
    pick_lon = np.radians(pu_coord["pickup_longitude"].astype("float64"))
    pick_lat = np.radians(pu_coord["pickup_latitude"].astype("float64"))
    drop_lon = np.radians(do_coord["dropoff_longitude"].astype("float64"))
    drop_lat = np.radians(do_coord["dropoff_latitude"].astype("float64"))

    delta_lat = drop_lat - pick_lat
    delta_lon = drop_lon - pick_lon

    part_formula = np.sin(delta_lat / 2) ** 2 + np.cos(pick_lat) * np.cos(drop_lat) * np.sin(delta_lon / 2) ** 2
    # Floating-point rounding can push the term marginally outside [0, 1];
    # clip it so the square root and arcsine stay defined.
    part_formula = part_formula.clip(lower=0.0, upper=1.0)
    dist = 2 * r * np.arcsin(np.sqrt(part_formula))

    return dist.astype("float64")

In [9]:
def filling_distance(data: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    """Add a ``distance`` column computed from the pickup and dropoff coordinates.

    The column is written onto ``data`` itself, and that same frame is returned.
    """
    pickup_columns = ["pickup_longitude", "pickup_latitude"]
    dropoff_columns = ["dropoff_longitude", "dropoff_latitude"]

    data["distance"] = calculate_distance(data[pickup_columns], data[dropoff_columns])
    return data

In [10]:
def all_taxi_data(urls: list) -> pd.core.frame.DataFrame:
    """Build the full taxi dataset from a list of monthly file links.

    Each link is downloaded, cleaned and sampled, then given a ``distance``
    column; the monthly frames are concatenated in the order of ``urls``.
    """
    monthly_frames = [
        filling_distance(monthly_taxi_data_download_clean_sample(link))
        for link in urls
    ]
    return pd.concat(monthly_frames)