In [5]:
from datetime import datetime
from dateutil.relativedelta import relativedelta

import pandas as pd
pd.set_option('display.max_columns', 30)

import requests

In [6]:
"""
1. get the data from S3
2. weather data transformations
3. taxi_trips transformations
4. update payment_type master
5. update company master
6. update taxi_trips with payment_type and company ids (replace the string values with the ids from the latest master tables)
7. upload weather data to S3
8. upload taxi data to S3
9. upload the newest payment_type and company master tables to S3
"""

'\n1. get the data from S3\n2. weather data transformations\n3. taxi_trips transformations\n4. update payment_type master\n5. update company master\n6. update taxi_trips with payment_type and company ids (replace the string values with the ids from the latest master tables)\n7. upload weather data to S3\n8. upload taxi data to S3\n9. upload the newest payment_type and company master tables to S3\n'

#### taxi_trips transformation code

In [7]:
# Target day: the calendar day two months before "now".
current_datetime = datetime.now() - relativedelta(months=2)

formatted_date = current_datetime.strftime("%Y-%m-%d")

url = "https://data.cityofchicago.org/resource/ajtu-isnz.json"

# Passing the query through `params` lets requests URL-encode the spaces and
# quotes in the $where clause instead of embedding them raw in the URL.
# NOTE(review): $limit caps the response at 30000 rows; a day with more trips
# would be truncated without any error - confirm this is acceptable.
params = {
    "$where": (
        f"trip_start_timestamp >= '{formatted_date}T00:00:00' "
        f"AND trip_start_timestamp <= '{formatted_date}T23:59:59'"
    ),
    "$limit": 30000,
}

# A timeout keeps the cell from hanging forever; raise_for_status() stops an
# HTTP error payload from being parsed as if it were trip data.
response = requests.get(url, params=params, timeout=60)
response.raise_for_status()

data = response.json()

In [8]:
# Build the taxi_trips DataFrame from the decoded JSON response.
taxi_trips = pd.DataFrame(data)

In [9]:
# Non-mutating chain so the cell is safe to re-run: errors="ignore" skips
# columns that are already gone (or were never present), and every step
# returns a new frame instead of editing the previous one in place.
taxi_trips = (
    taxi_trips
    .drop(columns=["pickup_census_tract", "dropoff_census_tract",
                   "pickup_centroid_location", "dropoff_centroid_location"],
          errors="ignore")
    # Keep only rows that are complete in every remaining column.
    .dropna()
    .rename(columns={"pickup_community_area": "pickup_community_area_id",
                     "dropoff_community_area": "dropoff_community_area_id"})
    # Trip start truncated to the hour.
    .assign(datetime_for_weather=lambda trips: pd.to_datetime(trips["trip_start_timestamp"]).dt.floor("h"))
)

In [10]:
# Preview the first rows of the transformed trips.
taxi_trips.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather
0,ebb30c51b01d782a4f16b9ad15ba49cf2046760d,4ea76937237d234148f31343797e28d4616f50891cb565...,2025-01-25T23:45:00.000,2025-01-25T23:45:00.000,11,0.0,32,32,3.25,3.15,0,12,18.9,Credit Card,Sun Taxi,41.878865584,-87.625192142,41.878865584,-87.625192142,2025-01-25 23:00:00
1,6ae65f0e73d02d40283e2f4c2fc975f619bc38b0,2f4e29a2c83f6b9c2b8ce735865fdddf49627f8d60bbff...,2025-01-25T23:45:00.000,2025-01-26T00:00:00.000,866,11.7,32,12,29.75,0.0,0,0,29.75,Prcard,5 Star Taxi,41.878865584,-87.625192142,41.993930128,-87.758353588,2025-01-25 23:00:00
2,6eb83ae862ec1bf32565e2787ad9ef97bf7017f6,199f852b9b7f49df5426c931ac14b4e746e6f946021af3...,2025-01-25T23:45:00.000,2025-01-25T23:45:00.000,345,0.66,8,8,5.5,0.0,0,1,6.5,Cash,Flash Cab,41.899602111,-87.633308037,41.899602111,-87.633308037,2025-01-25 23:00:00
3,7080222229c3199a87d473b971dda4091aad0d39,41235584d1969c7a37382dbbac8e063edfc824f5bdddde...,2025-01-25T23:45:00.000,2025-01-26T00:00:00.000,849,2.5,28,8,9.84,2.0,0,0,12.34,Mobile,5 Star Taxi,41.88528132,-87.6572332,41.909495669,-87.630963601,2025-01-25 23:00:00
4,7c01245278e74a51cb82509ca365b69ce9ba5c53,008dda45db57cb6daa679a86ce8c8149ddc05446d545b1...,2025-01-25T23:45:00.000,2025-01-26T00:00:00.000,771,2.25,28,24,9.5,2.74,0,0,12.74,Mobile,Taxicab Insurance Agency Llc,41.874005383,-87.66351755,41.901206994,-87.676355989,2025-01-25 23:00:00


#### taxi_trips transformation function

In [11]:
def taxi_trips_transformations(taxi_trips: pd.DataFrame) -> pd.DataFrame:
    """
    Perform transformations on the taxi_trips DataFrame

    The input frame is left untouched; a new DataFrame is returned. The steps are:
    drop the census-tract and centroid-location columns (when present), drop every
    row that has a missing value, rename the community-area columns to ``*_id`` and
    add ``datetime_for_weather`` (``trip_start_timestamp`` floored to the hour).

    Parameters
    ----------
    taxi_trips: pd.DataFrame
        The DataFrame containing the taxi trips data. Must contain a
        ``trip_start_timestamp`` column.

    Returns
    -------
    pd.DataFrame
        The transformed DataFrame

    Raises
    ------
    TypeError
        If taxi_trips is not a DataFrame
    """

    if not isinstance(taxi_trips, pd.DataFrame):
        raise TypeError("taxi_trips should be a DataFrame")

    unused_columns = ["pickup_census_tract", "dropoff_census_tract",
                      "pickup_centroid_location", "dropoff_centroid_location"]

    return (
        taxi_trips
        # errors="ignore": a frame that lacks some of these columns (for example
        # one that was already transformed) must not raise a KeyError.
        .drop(columns=unused_columns, errors="ignore")
        .dropna()
        .rename(columns={"pickup_community_area": "pickup_community_area_id",
                         "dropoff_community_area": "dropoff_community_area_id"})
        # assign() on the chained result avoids writing into the caller's frame.
        .assign(datetime_for_weather=lambda trips: pd.to_datetime(trips["trip_start_timestamp"]).dt.floor("h"))
    )

#### company update code

In [12]:
# Company master: one row per distinct company name found in the trips,
# in order of first appearance, with ids counting up from 1.
company_master = (
    taxi_trips["company"]
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame()
)
company_master.insert(0, "company_id", range(1, len(company_master) + 1))
company_master.tail()

Unnamed: 0,company_id,company
30,31,Tac - Yellow Non Color
31,32,Metro Jet Taxi A.
32,33,3556 - 36214 RC Andrews Cab
33,34,Petani Cab Corp
34,35,4053 - 40193 Adwar H. Nikola


In [13]:
# Hand-written sample of incoming company names, used below to try out the
# master-update logic.
new_company_data = [
    {"company": company_name}
    for company_name in ("6574 - Babylon Express Inc.", "Y", "X")
]

new_company_mapping = pd.DataFrame(new_company_data)
new_company_mapping

Unnamed: 0,company
0,6574 - Babylon Express Inc.
1,Y
2,X


In [14]:
# Highest id currently in the master; new companies are numbered from the next integer.
company_max_id = company_master["company_id"].max()
company_max_id

np.int64(35)

In [15]:
# Names from the sample that the master does not contain yet.
# drop_duplicates() keeps one entry per name (first-seen order) so a repeated
# name cannot receive several ids; isin() replaces the per-element array scan.
candidate_companies = new_company_mapping["company"].drop_duplicates()
new_companies_list = candidate_companies[~candidate_companies.isin(company_master["company"])].tolist()

new_companies_list

['Y', 'X']

In [16]:
# Give each new company the next free id, continuing after company_max_id.
new_companies_df = pd.DataFrame({
    "company_id": range(company_max_id + 1, company_max_id + 1 + len(new_companies_list)),
    "company": new_companies_list
})
new_companies_df

Unnamed: 0,company_id,company
0,36,Y
1,37,X


In [17]:
# Append the new rows to the existing master; ignore_index=True renumbers the row index.
updated_company_master = pd.concat([company_master, new_companies_df], ignore_index=True)
updated_company_master.tail()

Unnamed: 0,company_id,company
32,33,3556 - 36214 RC Andrews Cab
33,34,Petani Cab Corp
34,35,4053 - 40193 Adwar H. Nikola
35,36,Y
36,37,X


In [18]:
def update_company_master(taxi_trips: pd.DataFrame, company_master: pd.DataFrame) -> pd.DataFrame:
    """
    Update the company_master DataFrame with new companies from the taxi_trips DataFrame

    Each company name that occurs in taxi_trips but not in company_master is appended
    exactly once, in order of first appearance, with ids continuing after the current
    maximum id. Missing (null) company values are ignored. Neither input is modified.

    Parameters
    ----------
    taxi_trips: pd.DataFrame
        The DataFrame containing the taxi trips data (needs a "company" column)
    company_master: pd.DataFrame
        The DataFrame containing the company master data
        (needs "company_id" and "company" columns)

    Returns
    -------
    pd.DataFrame
        The updated company_master DataFrame
    """
    # One entry per distinct name: without this, a new company that appears in
    # k trips would be added k times with k different ids.
    trip_companies = taxi_trips["company"].dropna().drop_duplicates()
    new_companies_list = trip_companies[~trip_companies.isin(company_master["company"])].tolist()

    if not new_companies_list:
        # Nothing to add; still hand back a fresh frame with a clean 0..n-1 index.
        return company_master.reset_index(drop=True)

    # max() of an empty id column is NaN, so start numbering at 1 in that case.
    company_max_id = company_master["company_id"].max()
    first_new_id = 1 if pd.isna(company_max_id) else int(company_max_id) + 1

    new_companies_df = pd.DataFrame({
        "company_id": range(first_new_id, first_new_id + len(new_companies_list)),
        "company": new_companies_list
    })

    updated_company_master = pd.concat([company_master, new_companies_df], ignore_index=True)

    return updated_company_master

In [19]:
# Run the function on the real trips. company_master was built from this same
# taxi_trips frame, so no new rows are expected here.
updated_company_master = update_company_master(taxi_trips, company_master)
updated_company_master.tail()

Unnamed: 0,company_id,company
30,31,Tac - Yellow Non Color
31,32,Metro Jet Taxi A.
32,33,3556 - 36214 RC Andrews Cab
33,34,Petani Cab Corp
34,35,4053 - 40193 Adwar H. Nikola


In [20]:
# Small hand-made frame with a "company" column, standing in for taxi_trips when
# trying out the master-update functions.
taxi_trips_company_only = pd.DataFrame({
    "company_id": [1,2,3],
    "company": ["6574 - Babylon Express Inc.", "Y", "X"]
})
taxi_trips_company_only

Unnamed: 0,company_id,company
0,1,6574 - Babylon Express Inc.
1,2,Y
2,3,X


### payment_type_master code

In [33]:
# Payment-type master: one row per distinct payment type found in the trips,
# in order of first appearance, with ids counting up from 1.
payment_type_master = (
    taxi_trips["payment_type"]
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame()
)
payment_type_master.insert(0, "payment_type_id", range(1, len(payment_type_master) + 1))

# Small hand-made frame standing in for taxi_trips when trying out the update function.
taxi_trips_payment_type_only = pd.DataFrame(
    {
        "payment_type_id": [1, 2, 3],
        "payment_type": ["Credit Card", "Y", "X"],
    }
)
taxi_trips_payment_type_only

Unnamed: 0,payment_type_id,payment_type
0,1,Credit Card
1,2,Y
2,3,X


In [23]:
def update_payment_type_master(taxi_trips: pd.DataFrame, payment_type_master: pd.DataFrame) -> pd.DataFrame:
    """
    Update the payment_type_master DataFrame with new payment types from the taxi_trips DataFrame

    Each payment type that occurs in taxi_trips but not in payment_type_master is
    appended exactly once, in order of first appearance, with ids continuing after
    the current maximum id. Missing (null) payment types are ignored. Neither input
    is modified.

    Parameters
    ----------
    taxi_trips: pd.DataFrame
        The DataFrame containing the taxi trips data (needs a "payment_type" column)
    payment_type_master: pd.DataFrame
        The DataFrame containing the payment_type master data
        (needs "payment_type_id" and "payment_type" columns)

    Returns
    -------
    pd.DataFrame
        The updated payment_type_master DataFrame
    """
    # One entry per distinct value: without this, a new payment type that appears
    # in k trips would be added k times with k different ids.
    trip_payment_types = taxi_trips["payment_type"].dropna().drop_duplicates()
    new_payment_types_list = trip_payment_types[
        ~trip_payment_types.isin(payment_type_master["payment_type"])
    ].tolist()

    if not new_payment_types_list:
        # Nothing to add; still hand back a fresh frame with a clean 0..n-1 index.
        return payment_type_master.reset_index(drop=True)

    # max() of an empty id column is NaN, so start numbering at 1 in that case.
    payment_type_max_id = payment_type_master["payment_type_id"].max()
    first_new_id = 1 if pd.isna(payment_type_max_id) else int(payment_type_max_id) + 1

    new_payment_types_df = pd.DataFrame({
        "payment_type_id": range(first_new_id, first_new_id + len(new_payment_types_list)),
        "payment_type": new_payment_types_list
    })

    updated_payment_type_master = pd.concat([payment_type_master, new_payment_types_df], ignore_index=True)

    return updated_payment_type_master

In [28]:
# Try the function with the hand-made frame; the values it holds that are not
# in payment_type_master yet get appended with new ids.
updated_payment_type_master = update_payment_type_master(taxi_trips_payment_type_only, payment_type_master)
updated_payment_type_master.tail()

Unnamed: 0,payment_type_id,payment_type
5,6,No Charge
6,7,Dispute
7,8,Credit card
8,9,Y
9,10,X


### Create a generic update master function

In [None]:
def update_master(taxi_trips: pd.DataFrame, master: pd.DataFrame, id_column: str, value_column: str) -> pd.DataFrame:
    """
    Update a master DataFrame with new values from the taxi_trips DataFrame

    Each value of ``value_column`` that occurs in taxi_trips but not in master is
    appended exactly once, in order of first appearance, with ids continuing after
    the current maximum of ``id_column``. Missing (null) values are ignored.
    Neither input is modified.

    Parameters
    ----------
    taxi_trips: pd.DataFrame
        The DataFrame containing the taxi trips data
    master: pd.DataFrame
        The DataFrame containing the master data
    id_column: str
        The name of the id column of the master DataFrame.
    value_column: str
        The name of the column, present in both taxi_trips and master,
        containing the values.

    Returns
    -------
    pd.DataFrame
        The updated master DataFrame
    """
    # First-seen order (instead of a set difference) makes the id handed to each
    # new value the same on every run; set iteration order for strings varies
    # between interpreter sessions.
    trip_values = taxi_trips[value_column].dropna().drop_duplicates()
    new_values_list = trip_values[~trip_values.isin(master[value_column])].tolist()

    if not new_values_list:
        # Nothing to add; still hand back a fresh frame with a clean 0..n-1 index.
        return master.reset_index(drop=True)

    # max() of an empty id column is NaN, so start numbering at 1 in that case.
    max_id = master[id_column].max()
    first_new_id = 1 if pd.isna(max_id) else int(max_id) + 1

    new_values_df = pd.DataFrame({
        id_column: range(first_new_id, first_new_id + len(new_values_list)),
        value_column: new_values_list
    })

    updated_master = pd.concat([master, new_values_df], ignore_index=True)

    return updated_master

In [None]:
# test_payment_type_master = update_master(taxi_trips_payment_type_only, payment_type_master, "payment_type_id", "payment_type")
# test_company_master = update_master(taxi_trips_company_only, company_master, "company_id", "company")

#### Updating taxi_trips with the most recent company_master and payment_type_master ids

In [37]:
# Preview the trips before the string columns are replaced by ids.
taxi_trips.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather
0,ebb30c51b01d782a4f16b9ad15ba49cf2046760d,4ea76937237d234148f31343797e28d4616f50891cb565...,2025-01-25T23:45:00.000,2025-01-25T23:45:00.000,11,0.0,32,32,3.25,3.15,0,12,18.9,Credit Card,Sun Taxi,41.878865584,-87.625192142,41.878865584,-87.625192142,2025-01-25 23:00:00
1,6ae65f0e73d02d40283e2f4c2fc975f619bc38b0,2f4e29a2c83f6b9c2b8ce735865fdddf49627f8d60bbff...,2025-01-25T23:45:00.000,2025-01-26T00:00:00.000,866,11.7,32,12,29.75,0.0,0,0,29.75,Prcard,5 Star Taxi,41.878865584,-87.625192142,41.993930128,-87.758353588,2025-01-25 23:00:00
2,6eb83ae862ec1bf32565e2787ad9ef97bf7017f6,199f852b9b7f49df5426c931ac14b4e746e6f946021af3...,2025-01-25T23:45:00.000,2025-01-25T23:45:00.000,345,0.66,8,8,5.5,0.0,0,1,6.5,Cash,Flash Cab,41.899602111,-87.633308037,41.899602111,-87.633308037,2025-01-25 23:00:00
3,7080222229c3199a87d473b971dda4091aad0d39,41235584d1969c7a37382dbbac8e063edfc824f5bdddde...,2025-01-25T23:45:00.000,2025-01-26T00:00:00.000,849,2.5,28,8,9.84,2.0,0,0,12.34,Mobile,5 Star Taxi,41.88528132,-87.6572332,41.909495669,-87.630963601,2025-01-25 23:00:00
4,7c01245278e74a51cb82509ca365b69ce9ba5c53,008dda45db57cb6daa679a86ce8c8149ddc05446d545b1...,2025-01-25T23:45:00.000,2025-01-26T00:00:00.000,771,2.25,28,24,9.5,2.74,0,0,12.74,Mobile,Taxicab Insurance Agency Llc,41.874005383,-87.66351755,41.901206994,-87.676355989,2025-01-25 23:00:00


In [46]:
def update_taxi_trips_with_master_data(taxi_trips: pd.DataFrame, payment_type_master: pd.DataFrame, company_master: pd.DataFrame) -> pd.DataFrame:
    '''
    Update the taxi_trips DataFrame with the payment_type_master and company_master ids

    The master tables are joined on "payment_type" and "company" respectively, and the
    two string columns are then removed. Every trip is kept: a value that has no match
    in its master table raises instead of silently dropping the trip, and a master
    table with duplicated values raises instead of silently duplicating trips.
    The input frames are not modified.

    Parameters
    ----------
    taxi_trips: pd.DataFrame
        The DataFrame containing the taxi trips data
    payment_type_master: pd.DataFrame
        The DataFrame containing the payment_type master table
    company_master: pd.DataFrame
        The DataFrame containing the company master table

    Returns
    -------
    pd.DataFrame
        The updated taxi_trips DataFrame with the payment_type and company ids, without the string values

    Raises
    ------
    ValueError
        If taxi_trips contains a payment_type or company that is missing from the
        corresponding master table
    pd.errors.MergeError
        If a master table lists the same payment_type or company more than once
    '''
    taxi_trips_id = taxi_trips

    lookups = (
        ("payment_type", payment_type_master),
        ("company", company_master),
    )
    for value_column, master in lookups:
        # With the default inner join an unmatched trip would just disappear;
        # surface it so the master table can be updated first.
        is_known = taxi_trips_id[value_column].isin(master[value_column])
        if not is_known.all():
            unknown_values = taxi_trips_id.loc[~is_known, value_column].unique().tolist()
            raise ValueError(
                f"{value_column} values missing from the master table: {unknown_values}"
            )

        # validate="many_to_one" guards against duplicated master keys, which
        # would otherwise multiply the matching trips.
        taxi_trips_id = taxi_trips_id.merge(master, on=value_column, how="left", validate="many_to_one")

    return taxi_trips_id.drop(columns=["payment_type", "company"])

In [48]:
# Replace the payment_type / company strings with ids from the updated masters.
taxi_trips_id = update_taxi_trips_with_master_data(taxi_trips, updated_payment_type_master, updated_company_master)
# Fixed seed keeps the preview identical on every run; min() avoids the
# ValueError sample() raises when asked for more rows than exist.
taxi_trips_id.sample(min(5, len(taxi_trips_id)), random_state=42)

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather,payment_type_id,company_id
2828,d19fbf70d5e5d19168fccf5758c2075bc6a0de15,de9289a1fce135051f6e9206044f8211c86a1a6acd68d4...,2025-01-25T17:45:00.000,2025-01-25T18:30:00.000,3404,15.71,76,7,42.0,9.6,0,5.5,57.6,41.97907082,-87.903039661,41.921778356,-87.641459759,2025-01-25 17:00:00,1,3
8261,17739bc0ee8276a6e5ab19b47a5c785af726ad27,3497d2b935bf09123c257fd1ff03b4ca266e067fac8e97...,2025-01-25T07:00:00.000,2025-01-25T07:15:00.000,1200,16.9,76,28,41.5,5.0,0,4.0,50.5,41.980264315,-87.913624596,41.874005383,-87.66351755,2025-01-25 07:00:00,1,8
1767,ee2d2329cbe8710d359349e2b7e68f36e7676cc0,d6e1a9e103336c396201abe9ceb00795fcd41e14ccbf54...,2025-01-25T19:30:00.000,2025-01-25T19:45:00.000,727,0.0,7,28,9.44,2.0,0,0.0,11.94,41.922686284,-87.649488729,41.874005383,-87.66351755,2025-01-25 19:00:00,4,3
6818,e06b4c1aca6a700a1d342721b31d88a5873f036f,358f71d4ff3bc75e603103b7fc0b5ddcf0c0ca606d8452...,2025-01-25T11:00:00.000,2025-01-25T11:15:00.000,182,0.61,8,8,4.5,0.0,0,0.0,4.5,41.899602111,-87.633308037,41.899602111,-87.633308037,2025-01-25 11:00:00,3,4
2979,f6002fb8afb6997b54239972897229a7a3155ed0,112a65e84475c4cf57643414f05b68860c47f719e07e5d...,2025-01-25T17:30:00.000,2025-01-25T17:30:00.000,720,1.8,8,32,9.0,0.0,0,0.0,9.0,41.899155613,-87.626210532,41.880994471,-87.632746489,2025-01-25 17:00:00,3,8


### weather transformation function

In [None]:
def transform_weather_data(weather_data: dict) -> pd.DataFrame:
    """
    Perform transformations on the weather data

    Picks the hourly series out of the API response, renames them to short column
    names and converts the time values to pandas datetimes.

    Parameters
    ----------
    weather_data: dict
        The decoded JSON response from the Open Meteo API. It must contain an
        "hourly" mapping with the keys "time", "temperature_2m", "wind_speed_10m",
        "rain" and "precipitation".

    Returns
    -------
    pd.DataFrame
        The transformed weather data DataFrame with the columns
        datetime, temperature, wind_speed, rain and precipitation

    Raises
    ------
    KeyError
        If "hourly" or one of the expected series is missing from weather_data
    """
    hourly = weather_data['hourly']

    # output column name -> key of the series in the API response
    column_sources = {
        "datetime": 'time',
        "temperature": 'temperature_2m',
        "wind_speed": 'wind_speed_10m',
        "rain": 'rain',
        "precipitation": 'precipitation',
    }

    weather_df = pd.DataFrame({column: hourly[source] for column, source in column_sources.items()})

    weather_df['datetime'] = pd.to_datetime(weather_df['datetime'])

    return weather_df

In [54]:
# Get weather data and create a dataframe out of it
current_datetime = datetime.now() - relativedelta(months=2)
formatted_date = current_datetime.strftime("%Y-%m-%d")

url = 'https://archive-api.open-meteo.com/v1/era5'

params = {
    'start_date': formatted_date,
    'end_date': formatted_date,
    'latitude': 41.85,
    'longitude': -87.65,
    "hourly": "temperature_2m,wind_speed_10m,rain,precipitation",
}

response = requests.get(url, params=params)
weather_data = response.json()

transform_weather_data(weather_data).tail()

Unnamed: 0,datetime,temperature,wind_speed,rain,precipitation
19,2025-01-25 19:00:00,1.9,16.4,0.0,0.0
20,2025-01-25 20:00:00,2.7,19.5,0.0,0.0
21,2025-01-25 21:00:00,2.9,21.8,0.0,0.0
22,2025-01-25 22:00:00,2.4,20.8,0.0,0.0
23,2025-01-25 23:00:00,1.1,20.9,0.0,0.0
