In [1]:
from datetime import datetime
from dateutil.relativedelta import relativedelta

import pandas as pd
pd.set_option('display.max_columns', 30)

import requests

In [2]:
"""
1. get the data from S3
2. weather data transformations
3. taxi_trips transformations
4. update payment_type master
5. update company master
6. update taxi_trips with payment_type and company ids (replace the string values with the ids from the latest master tables)
7. upload weather data to S3
8. upload taxi data to S3
9. upload the newest payment_type and company master tables to S3
"""

'\n1. get the data from S3\n2. weather data transformations\n3. taxi_trips transformations\n4. update payment_type master\n5. update company master\n6. update taxi_trips with payment_type and company ids (replace the string values with the ids from the latest master tables)\n7. upload weather data to S3\n8. upload taxi data to S3\n9. upload the newest payment_type and company master tables to S3\n'

#### taxi_trips transformation code

In [3]:
# Target day for the query: today's date shifted back by two months.
current_datetime = datetime.now() - relativedelta(months=2)

formatted_date = current_datetime.strftime("%Y-%m-%d")

# Ask for every trip that started on `formatted_date` (00:00:00 through 23:59:59).
# The request is capped at 30000 rows, so a busier day would come back truncated.
url = f"https://data.cityofchicago.org/resource/ajtu-isnz.json?$where=trip_start_timestamp >= '{formatted_date}T00:00:00' AND trip_start_timestamp <= '{formatted_date}T23:59:59'&$limit=30000"

# Without a timeout a stalled connection would block this cell indefinitely.
response = requests.get(url, timeout=60)

# Stop here on a 4xx/5xx answer instead of parsing an error payload as trip data.
response.raise_for_status()

data = response.json()

In [6]:
# Load the parsed JSON payload of the API response into a DataFrame.
taxi_trips = pd.DataFrame(data)

In [None]:
# Build the cleaned frame as one chained expression instead of mutating taxi_trips
# with inplace=True, so this cell can be re-run without raising a KeyError:
#   1. drop the census-tract and centroid-location columns
#      (errors="ignore" tolerates a frame where they are already gone),
#   2. drop every row that still has a missing value,
#   3. rename the community-area columns to *_id,
#   4. add the trip start time floored to the hour as `datetime_for_weather`.
taxi_trips = (
    taxi_trips
    .drop(
        columns=[
            "pickup_census_tract", "dropoff_census_tract",
            "pickup_centroid_location", "dropoff_centroid_location",
        ],
        errors="ignore",
    )
    .dropna()
    .rename(columns={"pickup_community_area": "pickup_community_area_id",
                     "dropoff_community_area": "dropoff_community_area_id"})
    .assign(
        datetime_for_weather=lambda trips: pd.to_datetime(trips["trip_start_timestamp"]).dt.floor("h")
    )
)

In [7]:
# Preview the first rows of taxi_trips.
# NOTE(review): the saved output still lists the census-tract and centroid-location
# columns, so it appears to predate the transformation cell above — re-run to confirm.
taxi_trips.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location,pickup_census_tract,dropoff_census_tract
0,8ff9b24f78b2aec6f368a3c0121f4f4fcc1c1c6b,ea9c7f865233f880e5f00abb728092901eeaf52c85a8c1...,2025-01-24T23:45:00.000,2025-01-25T00:15:00.000,1380,7.3,8,21.0,21.5,0.0,0,0,21.5,Cash,Choice Taxi Association,41.899602111,-87.633308037,"{'type': 'Point', 'coordinates': [-87.63330803...",41.938666196,-87.711210593,"{'type': 'Point', 'coordinates': [-87.71121059...",,
1,fbf29434ac0fc77238d9a781842aa93b977373df,66242a333a4029bc3a42b01401e7a2b39f3940777b67a6...,2025-01-24T23:45:00.000,2025-01-24T23:45:00.000,282,2.11,76,,8.0,2.7,0,5,16.2,Credit Card,Sun Taxi,41.980264315,-87.913624596,"{'type': 'Point', 'coordinates': [-87.91362459...",,,,,
2,f7e849bcd4aa305b52de38a867370fa275198885,c53dba6f2d4174b792e113023de74f54e5af52ba87cf1a...,2025-01-24T23:45:00.000,2025-01-24T23:45:00.000,66,0.14,70,65.0,3.75,0.0,0,0,3.75,Cash,Flash Cab,41.745757713,-87.708365704,"{'type': 'Point', 'coordinates': [-87.70836570...",41.769778059,-87.726929842,"{'type': 'Point', 'coordinates': [-87.72692984...",,
3,f1b72dbc170f08b63e752b03d68fa0092ea7fea0,3f959f9e67fb058ff7179003776c6db313d0faee042479...,2025-01-24T23:45:00.000,2025-01-25T00:00:00.000,644,1.93,28,8.0,9.0,2.0,0,0,11.5,Credit Card,Sun Taxi,41.879255084,-87.642648998,"{'type': 'Point', 'coordinates': [-87.64264899...",41.899155613,-87.626210532,"{'type': 'Point', 'coordinates': [-87.62621053...",17031281900.0,17031081201.0
4,f06751da1ac5530fed50ebe2a6e080db547d8e88,f1e22a147bdb15246c72c3eca77f684452129a6f3203e2...,2025-01-24T23:45:00.000,2025-01-24T23:45:00.000,378,1.34,8,32.0,7.39,0.0,0,0,7.89,Mobile,City Service,41.899602111,-87.633308037,"{'type': 'Point', 'coordinates': [-87.63330803...",41.878865584,-87.625192142,"{'type': 'Point', 'coordinates': [-87.62519214...",,


#### taxi_trips transformation function

In [None]:
def taxi_trips_transformations(taxi_trips: pd.DataFrame) -> pd.DataFrame:
    """
    Perform transformations on the taxi_trips DataFrame

    The input DataFrame is not modified; a new DataFrame is returned. The steps are:

    1. drop the census-tract and centroid-location columns (columns that are
       already absent are skipped, so the function can be applied again to its
       own output),
    2. drop every row that still contains a missing value,
    3. rename the community-area columns to ``*_id``,
    4. add ``datetime_for_weather``: the trip start timestamp floored to the hour.

    Parameters
    ----------
    taxi_trips: pd.DataFrame
        The DataFrame containing the taxi trips data

    Returns
    -------
    pd.DataFrame
        The transformed DataFrame

    Raises
    ------
    TypeError
        If taxi_trips is not a DataFrame
    """

    if not isinstance(taxi_trips, pd.DataFrame):
        raise TypeError("taxi_trips should be a DataFrame")

    unused_columns = ["pickup_census_tract", "dropoff_census_tract",
                      "pickup_centroid_location", "dropoff_centroid_location"]

    renamed_columns = {"pickup_community_area": "pickup_community_area_id",
                       "dropoff_community_area": "dropoff_community_area_id"}

    # Every step returns a new frame, so the caller's DataFrame stays as it was.
    return (
        taxi_trips
        .drop(columns=unused_columns, errors="ignore")
        .dropna()
        .rename(columns=renamed_columns)
        .assign(
            datetime_for_weather=lambda trips: pd.to_datetime(trips["trip_start_timestamp"]).dt.floor("h")
        )
    )

#### company update code

In [13]:
# Distinct company names, kept in order of first appearance and re-indexed from 0.
distinct_companies = taxi_trips["company"].drop_duplicates().reset_index(drop=True)

# Give each distinct company a sequential id starting at 1.
company_ids = range(1, len(distinct_companies) + 1)
company_master = pd.DataFrame({"company_id": company_ids, "company": distinct_companies})

company_master.tail()

Unnamed: 0,company_id,company
32,33,5167 - 71969 5167 Taxi Inc
33,34,6574 - Babylon Express Inc.
34,35,U Taxicab
35,36,Tac - Blue Diamond Dispatch
36,37,Metro Jet Taxi A.


In [14]:
# Hand-written sample of incoming company names used to try out the update logic.
new_company_data = [
    {"company": company_name}
    for company_name in ("6574 - Babylon Express Inc.", "Y", "X")
]

new_company_mapping = pd.DataFrame(data=new_company_data)
new_company_mapping

Unnamed: 0,company
0,6574 - Babylon Express Inc.
1,Y
2,X


In [15]:
# Highest company_id currently in the master table; ids for new companies continue after it.
company_max_id = company_master["company_id"].max()
company_max_id

np.int64(37)

In [16]:
# Names already present in the master table.
known_companies = company_master["company"].values

# Keep only the incoming names that the master table does not contain yet.
new_companies_list = []
for candidate in new_company_mapping["company"].values:
    if candidate not in known_companies:
        new_companies_list.append(candidate)

new_companies_list

['Y', 'X']

In [17]:
# Ids for the new companies continue directly after the current maximum id.
first_new_id = company_max_id + 1
new_company_ids = range(first_new_id, first_new_id + len(new_companies_list))

new_companies_df = pd.DataFrame({"company_id": new_company_ids, "company": new_companies_list})
new_companies_df

Unnamed: 0,company_id,company
0,38,Y
1,39,X


In [18]:
# Append the new companies below the existing master rows; ignore_index=True renumbers the row index.
updated_company_master = pd.concat([company_master, new_companies_df], ignore_index=True)
updated_company_master.tail()

Unnamed: 0,company_id,company
34,35,U Taxicab
35,36,Tac - Blue Diamond Dispatch
36,37,Metro Jet Taxi A.
37,38,Y
38,39,X


In [21]:
def update_company_master(taxi_trips: pd.DataFrame, company_master: pd.DataFrame) -> pd.DataFrame:
    """
    Update the company_master DataFrame with new companies from the taxi_trips DataFrame

    Each company name that occurs in taxi_trips but not in company_master is
    appended exactly once, no matter how many trips mention it. New rows get
    consecutive ids that continue after the highest existing company_id
    (starting at 1 when company_master is empty). Missing company values in
    taxi_trips are ignored. Neither input DataFrame is modified.

    Parameters
    ----------
    taxi_trips: pd.DataFrame
        The DataFrame containing the taxi trips data
    company_master: pd.DataFrame
        The DataFrame containing the company master data

    Returns
    -------
    pd.DataFrame
        The updated company_master DataFrame
    """
    # One entry per distinct company, in order of first appearance. Without the
    # de-duplication a new company seen in N trips would get N master rows.
    trip_companies = taxi_trips["company"].dropna().drop_duplicates()

    is_known = trip_companies.isin(company_master["company"])
    new_companies_list = trip_companies[~is_known].tolist()

    if not new_companies_list:
        # Nothing to add: still hand back a fresh, re-indexed frame.
        return company_master.reset_index(drop=True)

    # max() of an empty column is NaN, which range() cannot use, so start at 1.
    if company_master.empty:
        next_company_id = 1
    else:
        next_company_id = int(company_master["company_id"].max()) + 1

    new_companies_df = pd.DataFrame({
        "company_id": range(next_company_id, next_company_id + len(new_companies_list)),
        "company": new_companies_list
    })

    updated_company_master = pd.concat([company_master, new_companies_df], ignore_index=True)

    return updated_company_master

In [None]:
# Run the update function and inspect the last rows of the result.
# NOTE(review): company_master is built from taxi_trips["company"] earlier in this notebook,
# so this call should find no new companies; the saved output showing "Y" and "X" looks like
# it came from a run against new_company_mapping — confirm by re-running.
updated_company_master = update_company_master(taxi_trips, company_master)
updated_company_master.tail()

Unnamed: 0,company_id,company
34,35,U Taxicab
35,36,Tac - Blue Diamond Dispatch
36,37,Metro Jet Taxi A.
37,38,Y
38,39,X


In [25]:
# Small hand-built frame pairing three company ids with three company names.
sample_company_ids = [1, 2, 3]
sample_company_names = ["6574 - Babylon Express Inc.", "Y", "X"]

taxi_trips_company_only = pd.DataFrame(
    {"company_id": sample_company_ids, "company": sample_company_names}
)
taxi_trips_company_only

Unnamed: 0,company_id,company
0,1,6574 - Babylon Express Inc.
1,2,Y
2,3,X
