In [103]:
from io import StringIO
from dotenv import load_dotenv

import os
import boto3
import pandas as pd

# Show up to 50 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 50)


In [104]:
# Load the .env file before reading the credentials. On a fresh kernel the
# variables are otherwise not in the environment yet and both lookups would
# return None (unless they were already exported in the shell).
load_dotenv()

aws_access_key_id = os.getenv("AWS_ACCESS_KEY")
aws_secret_key = os.getenv("AWS_SECRET_KEY")

In [105]:
# Read key/value pairs from a .env file into os.environ. The call returns
# True when the file yielded at least one variable.
load_dotenv()

True

In [106]:
def read_csv_from_s3(bucket: str, path: str, filename: str, s3_client=None) -> pd.DataFrame:
    """Download a CSV file from an S3 bucket and parse it into a DataFrame.

    Parameters
    ----------
    bucket : str
        Name of the bucket that holds the file.
    path : str
        Key prefix ("folders") leading to the file. A missing trailing
        slash is added automatically; an empty string means no prefix.
    filename : str
        Name of the file.
    s3_client : optional
        An existing boto3 S3 client to reuse. When omitted, a new client is
        created from the module-level ``aws_access_key_id`` and
        ``aws_secret_key`` values.

    Returns
    -------
    pd.DataFrame
        The parsed contents of the downloaded file.

    """
    if s3_client is None:
        s3_client = boto3.client(
            "s3",
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_key,
        )

    # Ensure the prefix ends with a separator so "folder" + "file.csv" does
    # not silently become the key "folderfile.csv".
    if path and not path.endswith("/"):
        path = f"{path}/"
    key = f"{path}{filename}"

    response = s3_client.get_object(Bucket=bucket, Key=key)
    # The body is read fully into memory and decoded as UTF-8 text.
    csv_text = response["Body"].read().decode("utf-8")

    return pd.read_csv(StringIO(csv_text))

In [107]:
# Bucket name and the key prefixes of the individual tables inside it.
bucket = "cubix-chicago-taxi-vti"

community_areas_path = "transformed_data/community_areas/"
company_path = "transformed_data/company/"
date_path = "transformed_data/date/"
payment_type_path = "transformed_data/payment_type/"
taxi_trips_path = "transformed_data/taxi_trips/"
weather_path = "transformed_data/weather/"

# Notebook-level S3 client, used by the cells that list objects under a prefix.
s3 = boto3.client(
    "s3",
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_key,
)



In [108]:
# Each lookup table lives in a single CSV file under its own prefix.
community_areas = read_csv_from_s3(
    bucket=bucket,
    path=community_areas_path,
    filename="community_areas_master.csv",
)
company = read_csv_from_s3(
    bucket=bucket,
    path=company_path,
    filename="company_master.csv",
)
date = read_csv_from_s3(
    bucket=bucket,
    path=date_path,
    filename="date_dimension.csv",
)
payment_type = read_csv_from_s3(
    bucket=bucket,
    path=payment_type_path,
    filename="payment_type_master.csv",
)

In [109]:
# Accumulators for the per-file DataFrames; each one is concatenated into a
# single table once its files have been read.
trips_list, weather_list = [], []

In [110]:

# Start from an empty list so re-running this cell does not append the same
# files a second time.
trips_list = []

# A single list call returns at most 1,000 keys, so page through the listing.
taxi_pages = s3.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=taxi_trips_path)

for page in taxi_pages:
    # .get() tolerates a page without "Contents" (prefix matches no objects).
    for file in page.get("Contents", []):
        taxi_trip_key = file["Key"]

        # Skip the folder placeholder and anything that is not a CSV.
        # endswith() also copes with keys that contain no dot or several dots.
        if not taxi_trip_key.endswith(".csv"):
            continue

        # Key relative to the prefix; for a flat layout this is the file name.
        filename = taxi_trip_key[len(taxi_trips_path):]
        trip = read_csv_from_s3(bucket, taxi_trips_path, filename)

        trips_list.append(trip)
        print(f"{filename} has been added.")

          
                        

taxi_2024-06-22.csv has been added.
taxi_2024-07-01.csv has been added.


In [111]:
# Stack the per-file frames into one table and renumber the rows from 0.
trips = pd.concat(objs=trips_list, ignore_index=True)

In [112]:
# (rows, columns) of the combined trips table.
trips.shape

(17074, 20)

In [113]:

# Start from an empty list so re-running this cell does not append the same
# files a second time.
weather_list = []

# A single list call returns at most 1,000 keys, so page through the listing.
weather_pages = s3.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=weather_path)

for page in weather_pages:
    # .get() tolerates a page without "Contents" (prefix matches no objects).
    for file in page.get("Contents", []):
        weather_key = file["Key"]

        # Skip the folder placeholder and anything that is not a CSV.
        # endswith() also copes with keys that contain no dot or several dots.
        if not weather_key.endswith(".csv"):
            continue

        # Key relative to the prefix; for a flat layout this is the file name.
        filename = weather_key[len(weather_path):]
        weather_daily = read_csv_from_s3(bucket, weather_path, filename)

        weather_list.append(weather_daily)
        print(f"{filename} has been added.")



weather_2024-06-06.csv has been added.
weather_2024-06-07.csv has been added.
weather_2024-06-08.csv has been added.
weather_2024-06-09.csv has been added.
weather_2024-06-10.csv has been added.
weather_2024-06-11.csv has been added.
weather_2024-06-12.csv has been added.
weather_2024-06-13.csv has been added.
weather_2024-06-14.csv has been added.
weather_2024-06-15.csv has been added.
weather_2024-06-16.csv has been added.
weather_2024-06-17.csv has been added.
weather_2024-06-18.csv has been added.
weather_2024-06-19.csv has been added.
weather_2024-06-20.csv has been added.
weather_2024-06-21.csv has been added.
weather_2024-06-22.csv has been added.


In [114]:
# Stack the per-file frames into one table and renumber the rows from 0.
weather = pd.concat(objs=weather_list, ignore_index=True)

In [115]:
# Column dtypes and non-null counts of the combined weather table.
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 408 entries, 0 to 407
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   datetime       408 non-null    object 
 1   temperature    408 non-null    float64
 2   wind_speed     408 non-null    float64
 3   rain           408 non-null    float64
 4   precipitation  408 non-null    float64
dtypes: float64(4), object(1)
memory usage: 16.1+ KB


#### Join the trips with the weather and dimension tables

In [169]:
# Attach the weather readings to each trip via the trip's weather timestamp,
# then drop the duplicated join key coming from the weather side.
# NOTE(review): the inner join discards trips that have no matching weather
# row - confirm that this is intended.
trips_full = (
    trips
    .merge(weather, left_on="datetime_for_weather", right_on="datetime", how="inner")
    .drop(columns=["datetime"])
)

In [170]:
# Bring in the company columns by id; the id itself is dropped after the join.
trips_full = (
    trips_full
    .merge(company, on="company_id", how="inner")
    .drop(columns=["company_id"])
)

In [171]:
# Bring in the payment type columns by id; the id itself is dropped after the join.
trips_full = (
    trips_full
    .merge(payment_type, on="payment_type_id", how="inner")
    .drop(columns=["payment_type_id"])
)

In [172]:
# Look up the pickup community area by its code, drop both key columns and
# give the looked-up name a pickup-specific label. The rename is chained
# rather than done with inplace=True so the cell is one expression that
# produces a new frame.
trips_full = (
    trips_full
    .merge(community_areas, left_on="pickup_community_area_id", right_on="area_code", how="inner")
    .drop(columns=["pickup_community_area_id", "area_code"])
    .rename(columns={"community_name": "pickup_community_area_name"})
)

In [173]:
# Look up the dropoff community area by its code, drop both key columns and
# give the looked-up name a dropoff-specific label. The rename is chained
# rather than done with inplace=True so the cell is one expression that
# produces a new frame.
trips_full = (
    trips_full
    .merge(community_areas, left_on="dropoff_community_area_id", right_on="area_code", how="inner")
    .drop(columns=["dropoff_community_area_id", "area_code"])
    .rename(columns={"community_name": "dropoff_community_area_name"})
)

In [174]:
# Parse both sides of the upcoming date join as datetimes so the keys compare equal.
date["date"] = pd.to_datetime(date["date"])
trips_full["trip_start_timestamp"] = pd.to_datetime(trips_full["trip_start_timestamp"])

# Start timestamp truncated to midnight. normalize() stays in datetime64
# instead of round-tripping through Python date objects and back.
trips_full["trip_start_date"] = trips_full["trip_start_timestamp"].dt.normalize()

In [175]:
# Add the date dimension columns; the duplicated key from the date side is
# dropped while trip_start_date is kept.
trips_full = (
    trips_full
    .merge(date, left_on="trip_start_date", right_on="date", how="inner")
    .drop(columns=["date"])
)

In [176]:
# Preview the first rows of the fully joined table.
trips_full.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather,temperature,wind_speed,rain,precipitation,company,payment_type,pickup_community_area_name,dropoff_community_area_name,trip_start_date,year,month,day,day_of_week,is_weekend
0,5b95d44e63e4595bfa5d2100330ed1ec5832da64,a030e6189cee598b516978caa72ad33839a0bccece8353...,2024-06-22 23:45:00,2024-06-23T00:15:00.000,1724,17.62,43.75,12.31,0.0,5.0,61.56,41.979071,-87.90304,41.884987,-87.620993,2024-06-22 23:00:00,34.1,33.2,0.0,0.0,Sun Taxi,Credit Card,O'Hare,Loop,2024-06-22,2024,6,22,6,True
1,5d535d3567645217e25b00b97a0d9de78af03a15,bb4e75d3065311c33024a434640731c43fd2cf9e4482eb...,2024-06-22 23:45:00,2024-06-22T23:45:00.000,3,0.0,13.75,0.0,0.0,0.0,13.75,41.899602,-87.633308,41.899602,-87.633308,2024-06-22 23:00:00,34.1,33.2,0.0,0.0,Tac - Yellow Cab Association,Cash,Near North Side,Near North Side,2024-06-22,2024,6,22,6,True
2,5e564af11d417385a6f99c47f8251fac1779575d,7b50110eaa441d38f0dba8a06ee13d5cb2854661aafb85...,2024-06-22 23:45:00,2024-06-22T23:45:00.000,713,2.77,10.85,2.97,0.0,0.0,13.82,41.922686,-87.649489,41.944227,-87.655998,2024-06-22 23:00:00,34.1,33.2,0.0,0.0,Medallion Leasin,Mobile,Lincoln Park,Lake View,2024-06-22,2024,6,22,6,True
3,5ebc6ad427853d7b1e482dbaa2956a5057b2a0b3,0300862a19483bcacac6176544abf331b8a3d41ec6efda...,2024-06-22 23:45:00,2024-06-22T23:45:00.000,349,0.91,10.0,3.59,0.0,0.0,13.59,41.899602,-87.633308,41.878866,-87.625192,2024-06-22 23:00:00,34.1,33.2,0.0,0.0,Sun Taxi,Mobile,Near North Side,Loop,2024-06-22,2024,6,22,6,True
4,5f38f8c167b8d0914ced4b81fb0ad21e3c1a5e6e,11bb28dc5075f790bd4529d80a571002aeb69fd4145015...,2024-06-22 23:45:00,2024-06-23T00:00:00.000,331,0.8,25.0,5.1,0.0,0.0,30.6,41.880994,-87.632746,41.892042,-87.631864,2024-06-22 23:00:00,34.1,33.2,0.0,0.0,City Service,Credit Card,Loop,Near North Side,2024-06-22,2024,6,22,6,True
