Understanding Hired Rides in NYC
Project prompt

This scaffolding notebook may be used to help set up your final project. It's totally optional whether you make use of this or not.

If you do use this notebook, everything provided is optional as well - you may remove or add prose and code as you wish.

Anything in italics (prose) or comments (in code) is meant to provide you with guidance. Remove the italic lines and provided comments before submitting the project, if you choose to use this scaffolding. We don't need the guidance when grading.

All code below should be considered "pseudo-code" - not functional by itself, and only a suggestion of the approach.

Project Setup

In [1]:
# TODO: 1. bonus: tests needed; [NOT DONE]
# 2. table types and primary keys should be checked; [DONE]
# 3. documentation needs to be enhanced. [DONE]
# 4. the way the data is downloaded?  [DONE]
# 5. check the correctness of values;  [DONE]
# 6. bonus: sunset table and one more visualization? [NOT DONE]

from typing import List, Dict, Any, Tuple
from tqdm import tqdm 
from sqlalchemy import create_engine
from sqlalchemy.sql import text
from ipywidgets import interact
from scipy.stats import sem, t
from ipywidgets import SelectMultiple
from folium.plugins import HeatMap
import bs4
import matplotlib.pyplot as plt
import plotly.express as px
import pandas as pd
import requests
import sqlalchemy as db
import ipywidgets as widgets
import seaborn as sns
import geopandas as gpd
import re 
import os
import folium
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Project-wide configuration: data sources, local paths, geographic bounds
# and the database connection shared by the cells below.

# TLC page that links to the monthly trip-record parquet files.
TLC_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

# Directory holding the taxi-zone shapefile; "" means the current working directory.
TAXI_ZONES_DIR = ""
# os.path.join keeps the path relative when TAXI_ZONES_DIR is empty; the
# previous f-string produced "/taxi_zones.shp", i.e. a file in the filesystem root.
TAXI_ZONES_SHAPEFILE = os.path.join(TAXI_ZONES_DIR, "taxi_zones.shp")
WEATHER_CSV_DIR = "./weather_data"

# Local cache directory for the downloaded parquet files.
PARQUET_DIR = "parquet_files"

CRS = 4326  # coordinate reference system

# Bounding boxes as ((min_lat, min_lon), (max_lat, max_lon)).
NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))
LGA_BOX_COORDS = ((40.763589, -73.891745), (40.778865, -73.854838))
JFK_BOX_COORDS = ((40.639263, -73.795642), (40.651376, -73.766264))
EWR_BOX_COORDS = ((40.686794, -74.194028), (40.699680, -74.165205))

DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

engine = create_engine(DATABASE_URL)

In [3]:
# Make sure the QUERY_DIRECTORY exists.
# exist_ok=True makes re-running this cell a no-op when the directory is
# already there, without inspecting errno on an arbitrary exception.
os.makedirs(QUERY_DIRECTORY, exist_ok=True)

Part 1: Data Preprocessing

Load Taxi Zones

In [4]:
def load_taxi_zones(shapefile:str='./taxi_zones.shp') -> gpd.GeoDataFrame :
    """
    Load the taxi zone shapefile and return one centroid per zone

    Keyword Arguments:
    shapefile {str} -- the path to the shapefile (default: {'./taxi_zones.shp'})

    Returns:
    gpd.GeoDataFrame -- a frame with the columns LocationID, latitude and
    longitude, where latitude/longitude are the zone centroid expressed in
    the project CRS (EPSG:4326)
    """
    zones = gpd.read_file(shapefile)

    # Centroids are only geometrically meaningful in a projected CRS, so when
    # the file is stored in one, take the centroid first and reproject the
    # resulting points afterwards. Otherwise fall back to the previous
    # behaviour of reprojecting the polygons and then taking the centroid.
    if zones.crs is not None and zones.crs.is_projected:
        centroids = zones.geometry.centroid.to_crs(CRS)
    else:
        centroids = zones.to_crs(CRS).geometry.centroid

    zones['latitude'] = centroids.y
    zones['longitude'] = centroids.x
    return zones[['LocationID', 'latitude', 'longitude']]

In [5]:
# Load the zone centroids once; convert_location_id_to_lat_lon reads this module-level frame.
gdf = load_taxi_zones()

DataSourceError: ./taxi_zones.shp: No such file or directory

Calculate Sample Size

In [None]:
def calculate_sample_size(population: int, z: float = 1.96, e: float = 0.05) -> int:
    """
    calculate the sample size needed for a given population

    Uses n0 = z^2 * p * (1 - p) / e^2 with p = 0.5, then applies the finite
    population correction n = n0 / (1 + (n0 - 1) / population).

    :param population: the size of the population
    :param z: the parameter of confidence level
    :param e: the margin of error

    :return: the sample size needed, truncated to an integer; 0 when the
             population is empty
    """
    # An empty population would otherwise divide by zero in the correction term.
    if population <= 0:
        return 0
    n0 = z**2 * 0.5 * 0.5 / e**2
    n = n0 / (1 + (n0 - 1) / population)
    return int(n)

Common Functions

In [None]:
def convert_location_id_to_lat_lon(df:pd.DataFrame) -> pd.DataFrame:
    """
    attach zone-centroid coordinates for the pickup and dropoff location ids

    Rows whose pickup or dropoff id has no match in the module-level `gdf`
    lookup are dropped.

    Keyword arguments:
    df -- the dataframe to be converted (needs PULocationID and DOLocationID)

    Returns:
    pd.DataFrame -- the dataframe with PULatitude, PULongitude, DOLatitude
    and DOLongitude columns added
    """
    # Pickup side: join on the zone id, relabel the coordinates, drop the join key.
    pickup_joined = df.merge(gdf, left_on='PULocationID', right_on='LocationID', how='left')
    pickup_joined = pickup_joined.rename(columns={'latitude':'PULatitude', 'longitude':'PULongitude'})
    pickup_joined = pickup_joined.drop(columns='LocationID')
    with_pickup = pickup_joined.dropna(subset=['PULatitude', 'PULongitude'])

    # Dropoff side: same steps against the already-filtered frame.
    dropoff_joined = with_pickup.merge(gdf, left_on='DOLocationID', right_on='LocationID', how='left')
    dropoff_joined = dropoff_joined.rename(columns={'latitude':'DOLatitude', 'longitude':'DOLongitude'})
    dropoff_joined = dropoff_joined.drop(columns='LocationID')
    return dropoff_joined.dropna(subset=['DOLatitude', 'DOLongitude'])

In [None]:
def filter_taxi_urls(all_urls:List[str]) -> List[str]:
    """
    filter the urls that contain the yellow taxi data

    Thin wrapper around filter_urls with the yellow-taxi pattern, so the
    matching logic lives in one place.

    Keyword arguments:
    all_urls -- the list of urls to be filtered

    Returns:
    List[str] -- the list of filtered urls
    """
    return filter_urls(all_urls, r'(yellow)_tripdata_.*?(202[0-4])-([0-1][0-9])')


def filter_urls(all_urls:List[str],re_pattern :str= '(fhvhv)_tripdata_.*?(202[0-4])-([0-1][0-9])') -> List[str]:
    """
    filter url with given pattern

    The pattern is matched against the start of the last path component
    (the file name) of each url.

    Keyword arguments:
    all_urls -- the list of urls to be filtered
    re_pattern -- the pattern to be matched

    Returns:
    List[str] -- the list of filtered urls, in their original order
    """
    pattern = re.compile(re_pattern)
    return [url for url in all_urls if pattern.match(url.split("/")[-1]) is not None]


Process Taxi Data

In [None]:
def remove_invalid_record_of_taxi_data(df:pd.DataFrame) -> pd.DataFrame:
    """
    keep only the taxi trips that pass every validity check:
    pickup and dropoff inside the NEW_YORK_BOX_COORDS bounding box,
    pickup and dropoff timestamps present,
    trip distance strictly greater than 0

    Keyword arguments:
    df -- the dataframe to be filtered

    Returns:
    pd.DataFrame -- the rows of df that are valid
    """
    (lat_min, lon_min), (lat_max, lon_max) = NEW_YORK_BOX_COORDS

    inside_box = (
        df['PULatitude'].between(lat_min, lat_max)
        & df['PULongitude'].between(lon_min, lon_max)
        & df['DOLatitude'].between(lat_min, lat_max)
        & df['DOLongitude'].between(lon_min, lon_max)
    )
    has_timestamps = df['tpep_pickup_datetime'].notna() & df['tpep_dropoff_datetime'].notna()
    positive_distance = df['trip_distance'] > 0

    return df[inside_box & has_timestamps & positive_distance]

def get_and_clean_month(url:str)->pd.DataFrame:
    """
    download and clean the data from the given url

    The parquet file is cached in PARQUET_DIR and reused on later calls.
    A random sample (size from calculate_sample_size) is drawn, coordinates
    are attached and invalid trips are removed.

    Keyword arguments:
    url -- the url to download the data

    Returns:
    pd.DataFrame -- the cleaned dataframe

    Raises:
    requests.HTTPError -- if the download returns an HTTP error status
    """
    source_url = url.strip()
    parquet_file = source_url.split('/')[-1]
    file_path = os.path.join(PARQUET_DIR, parquet_file)

    if not os.path.exists(file_path):
        # Download to a temporary name and rename only on success, so an
        # interrupted transfer never leaves a truncated file in the cache.
        partial_path = f"{file_path}.part"
        # The context manager releases the streamed connection; the timeout
        # prevents an unresponsive server from hanging the notebook forever.
        with requests.get(source_url, stream=True, timeout=60) as response:
            response.raise_for_status()  # Raise an exception for HTTP errors
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, file_path)

    df = pd.read_parquet(file_path)

    n = calculate_sample_size(df.shape[0])
    df = df.sample(n)
    df = convert_location_id_to_lat_lon(df)
    df = remove_invalid_record_of_taxi_data(df)

    return df


In [None]:
def get_and_clean_taxi_data(parquet_urls:List[str]) -> pd.DataFrame:
    """
    download and clean the yellow taxi data found among the given urls

    Keyword arguments:
    parquet_urls -- the list of urls to download the data

    Returns:
    pd.DataFrame -- the monthly cleaned dataframes concatenated into one
    """
    yellow_urls = filter_urls(parquet_urls,'(yellow)_tripdata_.*?(202[0-4])-([0-1][0-9])')

    # get_and_clean_month handles the on-disk caching of each monthly file.
    monthly_frames = [get_and_clean_month(month_url) for month_url in tqdm(yellow_urls)]

    return pd.concat(monthly_frames)

In [None]:
def get_all_urls_from_taxi_page(url:str) -> List[str]:
    """
    get all urls from the page of the given url

    Keyword arguments:
    url -- the url to get all urls from

    Returns:
    List[str] -- the href of every anchor tag on the page

    Raises:
    requests.HTTPError -- if the page request returns an HTTP error status
    """
    # Fail loudly on an error page instead of scraping it and returning no
    # links; the timeout keeps an unresponsive server from hanging the cell.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.content, 'html.parser')
    urls = [a['href'] for a in soup.find_all('a', href=True)]
    return urls


def find_all_parquet_urls(urls:List[str]) -> List[str]:
    """
    keep the urls that mention 'parquet' anywhere in the string

    Keyword arguments:
    urls -- the list of urls to be filtered

    Returns:
    List[str] -- the list of filtered urls, in their original order
    """
    matching = []
    for candidate in urls:
        if 'parquet' in candidate:
            matching.append(candidate)
    return matching

In [None]:
def get_taxi_data() ->pd.DataFrame:
    """
    get the taxi data from the TLC website

    Creates PARQUET_DIR if needed, scrapes TLC_URL for parquet links and
    downloads/cleans the yellow taxi files among them.

    Returns:
    pd.DataFrame -- the taxi data
    """
    # exist_ok avoids the check-then-create race and is a no-op on re-runs.
    os.makedirs(PARQUET_DIR, exist_ok=True)
    all_urls = get_all_urls_from_taxi_page(TLC_URL)
    all_parquet_urls = find_all_parquet_urls(all_urls)
    taxi_data = get_and_clean_taxi_data(all_parquet_urls)
    return taxi_data

In [None]:
# Download (or reuse cached) yellow-taxi parquet files, then sample and filter each month.
taxi_data_uncleaned = get_taxi_data()

Normalize Columns

In [None]:
# Normalise the taxi frame to the column set stored in the taxi_trips table.
# Built as a single chain on the uncleaned frame so no column is assigned on
# a slice (the old code triggered pandas' SettingWithCopyWarning) and the
# cell gives the same result every time it is re-run.
taxi_data_cleaned = (
    taxi_data_uncleaned
    .assign(
        base_fare=lambda trips: trips['fare_amount'],
        # NOTE(review): 'extra' is folded into tax here; confirm that is intended.
        tax=lambda trips: trips['mta_tax'] + trips['extra'],
        tolls=lambda trips: trips['tolls_amount'],
        surcharge=lambda trips: trips['improvement_surcharge'],
    )
    .rename(columns={'tpep_pickup_datetime':'trip_pickup_datetime',
                     'tpep_dropoff_datetime':'trip_dropoff_datetime',
                     'trip_distance':'trip_miles',
                     'PULatitude':'pickup_latitude',
                     'PULongitude':'pickup_longitude',
                     'DOLatitude':'dropoff_latitude',
                     'DOLongitude':'dropoff_longitude'})
    .loc[:, ['trip_pickup_datetime','trip_dropoff_datetime','trip_miles','base_fare','tax','tolls','surcharge',
             'pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude']]
)
taxi_data = taxi_data_cleaned

In [None]:
taxi_data.head()

In [None]:
taxi_data.info()

In [None]:
taxi_data.describe()

### Processing Uber Data

In [None]:
def remove_invalid_records_of_uber_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Keep only the Uber trips that pass every validity check: pickup and
    dropoff inside the NEW_YORK_BOX_COORDS bounding box, pickup and dropoff
    timestamps present, and trip_miles strictly greater than 0.

    Keyword arguments:
    df -- the DataFrame to clean

    Returns:
    pd.DataFrame -- the rows of df that are valid
    """
    (lat_min, lon_min), (lat_max, lon_max) = NEW_YORK_BOX_COORDS

    inside_box = (
        df['PULatitude'].between(lat_min, lat_max)
        & df['PULongitude'].between(lon_min, lon_max)
        & df['DOLatitude'].between(lat_min, lat_max)
        & df['DOLongitude'].between(lon_min, lon_max)
    )
    has_timestamps = df['pickup_datetime'].notna() & df['dropoff_datetime'].notna()
    positive_distance = df['trip_miles'] > 0

    return df[inside_box & has_timestamps & positive_distance]


def get_and_clean_uber_month(url:str) -> pd.DataFrame:
    """
    download and clean the data from the given url of uber

    The parquet file is cached in PARQUET_DIR and reused on later calls.
    Only rows with hvfhs_license_num == 'HV0003' are kept; a random sample
    of those rows is drawn, coordinates are attached and invalid trips are
    removed.

    Keyword arguments:
    url -- the url to download the data

    Returns:
    pd.DataFrame -- the cleaned dataframe

    Raises:
    requests.HTTPError -- if the download returns an HTTP error status
    """
    source_url = url.strip()
    parquet_file = source_url.split('/')[-1]
    file_path = os.path.join(PARQUET_DIR, parquet_file)

    if not os.path.exists(file_path):
        # Download to a temporary name and rename only on success, so an
        # interrupted transfer never leaves a truncated file in the cache.
        partial_path = f"{file_path}.part"
        # The context manager releases the streamed connection; the timeout
        # prevents an unresponsive server from hanging the notebook forever.
        with requests.get(source_url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, file_path)

    df = pd.read_parquet(file_path)

    # Filter first: the sample size must be derived from the population that
    # is actually sampled, and sample(n) fails when n exceeds the row count.
    df = df[df['hvfhs_license_num']=='HV0003']
    n = calculate_sample_size(df.shape[0]) if df.shape[0] > 0 else 0
    df = df.sample(n)
    df = convert_location_id_to_lat_lon(df)
    df = remove_invalid_records_of_uber_data(df)

    return df


In [None]:
def get_and_clean_uber_data(parquet_urls:List[str]) -> pd.DataFrame:
    """
    download and clean the high-volume FHV (fhvhv) data found among the given urls

    Keyword arguments:
    parquet_urls -- the list of urls to download the data

    Returns:
    pd.DataFrame -- the monthly cleaned dataframes concatenated into one
    """
    fhvhv_urls = filter_urls(parquet_urls, re_pattern='(fhvhv)_tripdata_.*?(202[0-4])-([0-1][0-9])')

    # get_and_clean_uber_month handles the on-disk caching of each monthly file.
    monthly_frames = [get_and_clean_uber_month(month_url) for month_url in tqdm(fhvhv_urls)]

    return pd.concat(monthly_frames)

In [None]:
def get_uber_data() -> pd.DataFrame :
    """
    get the uber data from the TLC website

    Creates PARQUET_DIR if needed, scrapes TLC_URL for parquet links and
    downloads/cleans the fhvhv files among them.

    Returns:
    pd.DataFrame -- the uber data
    """
    # exist_ok avoids the check-then-create race and is a no-op on re-runs.
    os.makedirs(PARQUET_DIR, exist_ok=True)
    all_urls = get_all_urls_from_taxi_page(TLC_URL)
    all_parquet_urls = find_all_parquet_urls(all_urls)
    uber_data = get_and_clean_uber_data(all_parquet_urls)
    return uber_data

In [None]:
# Download (or reuse cached) fhvhv parquet files, then keep, sample and filter the HV0003 trips of each month.
uber_data_uncleaned = get_uber_data()

### Normalize Columns

In [None]:
# Normalise the Uber frame to the column set stored in the uber_trips table.
# Built as a single chain on the uncleaned frame so no column is assigned on
# a slice (the old code triggered pandas' SettingWithCopyWarning) and the
# cell gives the same result every time it is re-run. 'tolls' already has
# its final name in the source data, so it is selected as-is.
uber_data_cleaned = (
    uber_data_uncleaned
    .assign(
        base_fare=lambda trips: trips['base_passenger_fare'],
        surcharge=lambda trips: trips['congestion_surcharge'],
        # TODO: check if this is correct (bcf is counted as tax, not as fare)
        tax=lambda trips: trips['sales_tax'] + trips['bcf'],
    )
    .rename(columns={
        'trip_miles':'trip_distance',
        'pickup_datetime':'trip_pickup_datetime',
        'dropoff_datetime':'trip_dropoff_datetime',
        'PULatitude':'pickup_latitude',
        'PULongitude':'pickup_longitude',
        'DOLatitude':'dropoff_latitude',
        'DOLongitude':'dropoff_longitude'
    })
    .loc[:, ['trip_pickup_datetime','trip_dropoff_datetime','trip_distance','base_fare','tax','tolls','surcharge',
             'pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude']]
)

uber_data = uber_data_cleaned

In [None]:
uber_data.head()

In [None]:
uber_data.info()

In [None]:
uber_data.describe()

### Processing Weather Data

In [None]:
def get_all_weather_csvs(directory:str) -> List[str]:
    """
    list all the csv files in the given directory

    Entries that do not end in '.csv' (case-insensitive), such as hidden
    files or checkpoint folders, are skipped so they are never handed to
    the csv reader. The result is sorted so the load order is deterministic.

    Keyword arguments:
    directory -- the directory to list the csv files

    Returns:
    List[str] -- the list of csv files, each as "<directory>/<file name>"
    """
    return [
        f"{directory}/{file}"
        for file in sorted(os.listdir(directory))
        if file.lower().endswith('.csv')
    ]

In [None]:
def clean_month_weather_data_hourly(csv_file:str) -> pd.DataFrame:
    """
    clean and collect the weather data hourly

    Keeps the first record of every clock hour and returns the columns
    DATE_hour ('%Y-%m-%d-%H'), HourlyPrecipitation and HourlyWindSpeed.

    Keyword arguments:
    csv_file -- the csv file to be cleaned

    Returns:
    pd.DataFrame -- the cleaned dataframe

    Raises:
    Exception -- any error raised while reading or cleaning the file is
    re-raised after the file name has been printed
    """

    try:
        # parse the DATE and fetch the DATE_hour column and select the required columns
        df = pd.read_csv(csv_file)
        df['DATE'] =  pd.to_datetime(df['DATE'])
        df['DATE_hour'] = df['DATE'].dt.strftime('%Y-%m-%d-%H')
        df = df.drop_duplicates(subset='DATE_hour', keep='first')
        # .copy() so the assignment below writes to an independent frame
        df = df[['DATE_hour','HourlyPrecipitation','HourlyWindSpeed']].copy()
        # replace the 'T' marker with a tiny positive amount (0.00001)
        df['HourlyPrecipitation'] = df['HourlyPrecipitation'].replace('T', 0.00001)
        return df
    except Exception:
        # Name the offending file, then propagate: returning None here would
        # let pd.concat silently drop the whole file from the hourly table.
        print(csv_file)
        raise

In [None]:
def clean_month_weather_data_daily(csv_file:str) -> pd.DataFrame:
    """
    clean and collect the weather data daily

    Keeps the last record of every calendar day and returns the columns
    DATE_day ('%Y-%m-%d'), DailyPrecipitation, DailySnowfall and
    DailyAverageWindSpeed as floats. Missing values are filled with the
    column mean, or with 0 when the column has no values at all.

    Keyword arguments:
    csv_file -- the csv file to be cleaned

    Returns:
    pd.DataFrame -- the cleaned dataframe

    Raises:
    Exception -- any error raised while reading or cleaning the file is
    re-raised after the file name has been printed
    """
    try:
        # using the last record of the day to represent the weather of the day
        df = pd.read_csv(csv_file)
        df['DATE'] =  pd.to_datetime(df['DATE'])
        df['DATE_day'] = df['DATE'].dt.strftime('%Y-%m-%d')
        # replace the 'T' marker with a tiny positive amount (0.00001)
        df['DailyPrecipitation'] = df['DailyPrecipitation'].replace('T', 0.00001).astype(float)
        df['DailySnowfall'] = df['DailySnowfall'].replace('T', 0.00001).astype(float)
        df[ 'DailyAverageWindSpeed'] =  df[ 'DailyAverageWindSpeed'].astype(float)

        # Fill missing values with the column mean. Series.mean() returns NaN
        # (never None) for an all-missing column, so test with pd.isna to
        # make the fallback to 0 actually take effect.
        for column in ['DailyPrecipitation', 'DailySnowfall', 'DailyAverageWindSpeed']:
            column_mean = df[column].mean()
            df[column] = df[column].fillna(0 if pd.isna(column_mean) else column_mean)

        df = df.drop_duplicates(subset='DATE_day', keep='last')
        df = df[['DATE_day','DailyPrecipitation','DailySnowfall','DailyAverageWindSpeed']]

        return df
    except Exception:
        print(csv_file)
        raise


In [None]:
def load_and_clean_weather_data() -> Tuple[pd.DataFrame,pd.DataFrame]:
    """
    clean every weather file in WEATHER_CSV_DIR and combine the results

    Returns:
    Tuple[pd.DataFrame,pd.DataFrame] -- the cleaned hourly and daily weather data
    """
    # One (hourly, daily) pair per file; the hourly cleaner runs before the
    # daily cleaner for each file.
    frames_per_file = [
        (clean_month_weather_data_hourly(path), clean_month_weather_data_daily(path))
        for path in get_all_weather_csvs(WEATHER_CSV_DIR)
    ]
    hourly_frames = [pair[0] for pair in frames_per_file]
    daily_frames = [pair[1] for pair in frames_per_file]

    return pd.concat(hourly_frames), pd.concat(daily_frames)

In [None]:
# List the columns available in one weather file. The path is built from
# WEATHER_CSV_DIR so it follows the configuration cell, and nrows=0 parses
# only the header row instead of the whole file.
print(list(pd.read_csv(os.path.join(WEATHER_CSV_DIR, '2020_weather.csv'), nrows=0).columns))

In [None]:
# Build the hourly and daily weather frames from every file in WEATHER_CSV_DIR.
hourly_weather_data, daily_weather_data = load_and_clean_weather_data()

In [None]:
hourly_weather_data.head()

In [None]:
hourly_weather_data.info()

In [None]:
hourly_weather_data.describe()

In [None]:
daily_weather_data.head()

In [None]:
daily_weather_data.info()

In [None]:
daily_weather_data.describe()

Part 2: Storing Cleaned Data

In [None]:
# Rebinds `engine` (first created in the configuration cell) to a fresh engine for DATABASE_URL.
engine = db.create_engine(DATABASE_URL)

In [None]:
# SQL scripts that (re)create the four project tables. Each script drops the
# table if it exists and then creates it, so running them resets that table.

# One row per clock hour, keyed by the 'YYYY-MM-DD-HH' string in DATE_hour.
HOURLY_WEATHER_SCHEMA = """

DROP TABLE IF EXISTS hourly_weather;
CREATE TABLE hourly_weather  (
    DATE_hour TEXT PRIMARY KEY,
    HourlyPrecipitation REAL,
    HourlyWindSpeed REAL
);

"""

# One row per calendar day, keyed by the 'YYYY-MM-DD' string in DATE_day.
DAILY_WEATHER_SCHEMA = """

DROP TABLE IF EXISTS daily_weather;
CREATE TABLE daily_weather (
    DATE_day TEXT PRIMARY KEY,
    DailyPrecipitation REAL,
    DailySnowfall REAL,
    DailyAverageWindSpeed REAL
);

"""

# Yellow taxi trips; note the distance column is named trip_miles here.
TAXI_TRIPS_SCHEMA = """


DROP TABLE IF EXISTS taxi_trips;
CREATE TABLE taxi_trips (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    trip_pickup_datetime TEXT,
    trip_dropoff_datetime TEXT,
    trip_miles REAL,
    base_fare REAL,
    tax REAL,
    tolls REAL,
    surcharge REAL,
    pickup_latitude REAL,
    pickup_longitude REAL,
    dropoff_latitude REAL,
    dropoff_longitude REAL
);

"""

# Uber trips; note the distance column is named trip_distance here.
UBER_TRIPS_SCHEMA = """

DROP TABLE IF EXISTS uber_trips;
CREATE TABLE uber_trips (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    trip_pickup_datetime TEXT,
    trip_dropoff_datetime TEXT,
    trip_distance REAL,
    base_fare REAL,
    tax REAL,
    tolls REAL,
    surcharge REAL,
    pickup_latitude REAL,
    pickup_longitude REAL,
    dropoff_latitude REAL,
    dropoff_longitude REAL
);

"""

In [None]:
# Write the four table scripts, in order, to the schema.sql deliverable.
with open(DATABASE_SCHEMA_FILE, "w") as f:
    f.writelines((
        HOURLY_WEATHER_SCHEMA,
        DAILY_WEATHER_SCHEMA,
        TAXI_TRIPS_SCHEMA,
        UBER_TRIPS_SCHEMA,
    ))

In [None]:
# Create the tables by executing schema.sql one statement at a time
# (the script is split on ';' because each execute() takes a single statement).
with open(DATABASE_SCHEMA_FILE, "r") as f:
    sql_script = f.read()

# engine.begin() commits when the block succeeds and rolls back on error, so
# the DROP/CREATE statements do not depend on driver-level autocommit.
with engine.begin() as connection:
    for statement in sql_script.split(";"):
        statement = statement.strip()
        if statement:  # Skip empty statements
            connection.execute(text(statement))

Add Data to Database

In [None]:
def write_dataframes_to_table(table_to_df_dict:Dict[str,pd.DataFrame]):
    """
    write the dataframes to the tables in the database

    Rows are appended to the existing tables so the column types, primary
    keys and AUTOINCREMENT ids declared in schema.sql are kept; the previous
    if_exists='replace' dropped those tables and let pandas re-create them
    without any of that. Existing rows are deleted first, so re-running the
    cell still leaves exactly one copy of the data. A table that does not
    exist yet is created by pandas.

    Keyword arguments:
    table_to_df_dict -- the dictionary of table name to dataframe mapping

    Raises:
    sqlalchemy.exc.IntegrityError -- if a dataframe violates a constraint of
    the existing table (for example a duplicated primary key)
    """
    inspector = db.inspect(engine)
    for table_name, dataframe in table_to_df_dict.items():
        if inspector.has_table(table_name):
            # Table names come from the notebook's own mapping, not from
            # user input, so interpolating them into the statement is safe.
            with engine.begin() as connection:
                connection.execute(text(f'DELETE FROM "{table_name}"'))
        dataframe.to_sql(table_name, con=engine, if_exists='append', index=False)

In [None]:
# Target table name -> DataFrame to store in it (keys match the tables defined in schema.sql).
map_table_name_to_dataframe = {
    "taxi_trips": taxi_data_cleaned,
    "uber_trips": uber_data_cleaned,
    "hourly_weather": hourly_weather_data,
    "daily_weather": daily_weather_data,
}

In [None]:
# Store every DataFrame of the mapping in its database table.
write_dataframes_to_table(map_table_name_to_dataframe)

Part 3: Understanding the Data

In [None]:
def write_query_to_file(query:str, outfile:str):
    """
    save a query as a text file inside QUERY_DIRECTORY

    Keyword arguments:
    query -- the SQL text to save
    outfile -- the file name to create (or overwrite) inside QUERY_DIRECTORY
    """
    target_path = f"{QUERY_DIRECTORY}/{outfile}"
    with open(target_path, "w") as query_file:
        query_file.write(query)

Query 1

In [None]:
QUERY_1_FILENAME = "query1.sql"

# Number of taxi trips per pickup hour ('00'-'23'), busiest hour first.
QUERY_1 = """
SELECT 
    strftime('%H', trip_pickup_datetime) AS pickup_hour, 
    COUNT(*) AS trip_count
FROM 
    taxi_trips
GROUP BY 
    pickup_hour
ORDER BY 
    trip_count DESC

"""

In [None]:
# Run the query through SQLAlchemy (raw rows in `results`) and through pandas
# (DataFrame), sharing one connection. The text() wrapper is passed to
# read_sql as well so the statement is handled the same way in both calls.
with engine.connect() as con:
    results = con.execute(db.text(QUERY_1)).fetchall()
    results1_df = pd.read_sql(db.text(QUERY_1), con=con)

# Only the last expression of a cell is displayed, so show the DataFrame.
results1_df

In [None]:
# Save the query text as query1.sql inside QUERY_DIRECTORY.
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

Query 2

In [None]:
QUERY_2_FILENAME = "query2.sql"

# Number of Uber trips per day of the week, busiest day first.
# strftime('%w') yields '0' (Sunday) through '6' (Saturday); the CASE maps it to a name.
QUERY_2 = """
SELECT 
    strftime('%w', trip_pickup_datetime) AS day_of_week, 
    CASE strftime('%w', trip_pickup_datetime)
        WHEN '0' THEN 'Sunday'
        WHEN '1' THEN 'Monday'
        WHEN '2' THEN 'Tuesday'
        WHEN '3' THEN 'Wednesday'
        WHEN '4' THEN 'Thursday'
        WHEN '5' THEN 'Friday'
        WHEN '6' THEN 'Saturday'
    END AS day_of_week_name,
    COUNT(*) AS trip_count
FROM 
    uber_trips
GROUP BY 
    day_of_week_name
ORDER BY 
    trip_count DESC

"""

In [None]:
# Run the query through SQLAlchemy (raw rows in `results`) and through pandas
# (DataFrame), sharing one connection. The text() wrapper is passed to
# read_sql as well so the statement is handled the same way in both calls.
with engine.connect() as con:
    results = con.execute(db.text(QUERY_2)).fetchall()
    results2_df = pd.read_sql(db.text(QUERY_2), con=con)

# Only the last expression of a cell is displayed, so show the DataFrame.
results2_df

In [None]:
# Save the query text as query2.sql inside QUERY_DIRECTORY.
write_query_to_file(QUERY_2, QUERY_2_FILENAME)

Query 3

In [None]:
QUERY_3_FILENAME = "query3.sql"

# 95th percentile of trip distance over taxi and Uber trips picked up in
# January 2024. The two tables are stacked (taxi trip_miles is aliased to
# trip_distance), rows are ranked by distance, and the row whose rank equals
# the integer part of 0.95 * total row count is returned.
QUERY_3 = """
WITH combined_trips AS (
    SELECT 
        trip_miles AS trip_distance,
        trip_pickup_datetime
    FROM 
        taxi_trips
    WHERE 
        strftime('%Y-%m', trip_pickup_datetime) = '2024-01'
    
    UNION ALL
    
    SELECT 
        trip_distance AS trip_distance,
        trip_pickup_datetime
    FROM 
        uber_trips
    WHERE 
        strftime('%Y-%m', trip_pickup_datetime) = '2024-01'
),
sorted_trips AS (
    SELECT 
        trip_distance,
        ROW_NUMBER() OVER (ORDER BY trip_distance) AS row_num,
        COUNT(*) OVER () AS total_rows
    FROM 
        combined_trips
),
percentile_row AS (
    SELECT 
        trip_distance
    FROM 
        sorted_trips
    WHERE 
        row_num = CAST(0.95 * total_rows AS INTEGER)
)
SELECT 
    trip_distance AS percentile_95
FROM 
    percentile_row;
"""

In [None]:
# Run the query through SQLAlchemy (raw rows in `results`) and through pandas
# (DataFrame), sharing one connection. The text() wrapper is passed to
# read_sql as well so the statement is handled the same way in both calls.
with engine.connect() as con:
    results = con.execute(db.text(QUERY_3)).fetchall()
    results3_df = pd.read_sql(db.text(QUERY_3), con=con)

# Only the last expression of a cell is displayed, so show the DataFrame.
results3_df

In [None]:
# Save the query text as query3.sql inside QUERY_DIRECTORY.
write_query_to_file(QUERY_3, QUERY_3_FILENAME)