In [1]:
from io import StringIO
import os

import boto3
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

pd.set_option("display.max_columns", 50)

In [2]:
# Read the AWS credentials from environment variables; os.getenv returns None
# when a variable is not set.
# NOTE: `aws_acces_key_id` is misspelled ("acces"), but the name is referenced
# by later cells, so it is kept as-is here.
aws_acces_key_id = os.getenv("AWS_ACCESS_KEY")
aws_secret_key = os.getenv("AWS_SECRET_KEY")

In [3]:
def read_csv_from_s3(bucket: str, path: str, filename: str, s3_client=None) -> pd.DataFrame:
    """Download a csv file from an S3 bucket and parse it into a DataFrame.

    Parameters
    ----------
    bucket : str
        Name of the bucket that holds the file.

    path : str
        Key prefix (the "folders") of the file. It is concatenated with
        ``filename`` as-is, so it must already include its trailing ``/``.

    filename : str
        Name of the file.

    s3_client : optional
        An existing S3 client to reuse. When omitted, a new client is created
        from the module-level credentials on every call.

    Returns
    -------
    pd.DataFrame
        A DataFrame of the downloaded file.

    """
    if s3_client is None:
        s3_client = boto3.client(
            "s3",
            aws_access_key_id=aws_acces_key_id,
            aws_secret_access_key=aws_secret_key,
        )

    full_path = f"{path}{filename}"

    # Named `response` / `csv_text` instead of `object` so the builtin is not shadowed.
    response = s3_client.get_object(Bucket=bucket, Key=full_path)
    csv_text = response["Body"].read().decode("utf-8")

    return pd.read_csv(StringIO(csv_text))

#### Prepare the S3 client and folder paths

In [None]:
# Module-level S3 client; the loading loops below use it to list the bucket contents.
s3 = boto3.client("s3", aws_access_key_id=aws_acces_key_id, aws_secret_access_key=aws_secret_key) 
# Placeholder: replace with the name of the bucket that holds the data.
bucket = "<your-bucket-name>"

# Key prefix of each dataset inside the bucket. Each one ends with "/" because
# read_csv_from_s3 concatenates the prefix and the file name directly.
community_areas_path = "transformed_data/community_areas/"
company_path = "transformed_data/company/"
date_path = "transformed_data/date/"
payment_type_path = "transformed_data/payment_type/"
taxi_trips_path = "transformed_data/taxi_trips/"
weather_path = "transformed_data/weather/"

#### Read files

In [5]:
# Load the four dimension tables, each from its own folder in the bucket.
community_areas = read_csv_from_s3(
    bucket=bucket,
    path=community_areas_path,
    filename="community_areas_master.csv",
)
company = read_csv_from_s3(
    bucket=bucket,
    path=company_path,
    filename="company_master.csv",
)
date = read_csv_from_s3(
    bucket=bucket,
    path=date_path,
    filename="date_dimension.csv",
)
payment_type = read_csv_from_s3(
    bucket=bucket,
    path=payment_type_path,
    filename="payment_type_master.csv",
)

In [6]:
# Accumulators for the per-file DataFrames collected by the loading loops.
trips_list, weather_list = [], []

In [13]:
# Rebuild the list from scratch so that re-running this cell does not append
# the same files a second time.
trips_list = []

# A single list call returns at most 1000 keys, so page through the listing.
taxi_paginator = s3.get_paginator("list_objects_v2")

for page in taxi_paginator.paginate(Bucket=bucket, Prefix=taxi_trips_path):
    # "Contents" is missing from the response when no key matches the prefix.
    for file in page.get("Contents", []):
        taxi_trip_key = file["Key"]
        filename = taxi_trip_key.split("/")[-1]

        # Skips the folder placeholder key (empty file name) and non-csv objects.
        if not filename.endswith(".csv"):
            continue

        # The file names carry a YYYY-MM-DD date, so plain string comparison
        # selects every file dated after the cutoff.
        if filename > "taxi_2024-04-20.csv":
            trip = read_csv_from_s3(bucket, taxi_trips_path, filename)

            trips_list.append(trip)
            print(f"{filename} has been added.")

taxi_2024-04-21.csv has been added.
taxi_2024-04-22.csv has been added.
taxi_2024-04-23.csv has been added.
taxi_2024-04-24.csv has been added.
taxi_2024-04-25.csv has been added.
taxi_2024-04-26.csv has been added.
taxi_2024-04-27.csv has been added.
taxi_2024-04-28.csv has been added.
taxi_2024-04-29.csv has been added.
taxi_2024-04-30.csv has been added.
taxi_2024-05-01.csv has been added.
taxi_2024-05-02.csv has been added.
taxi_2024-05-03.csv has been added.
taxi_2024-05-04.csv has been added.
taxi_2024-05-05.csv has been added.
taxi_2024-05-06.csv has been added.
taxi_2024-05-07.csv has been added.
taxi_2024-05-08.csv has been added.
taxi_2024-05-09.csv has been added.
taxi_2024-05-10.csv has been added.
taxi_2024-05-11.csv has been added.
taxi_2024-05-12.csv has been added.
taxi_2024-05-13.csv has been added.
taxi_2024-05-14.csv has been added.
taxi_2024-05-15.csv has been added.
taxi_2024-05-16.csv has been added.
taxi_2024-05-17.csv has been added.
taxi_2024-05-18.csv has been

In [14]:
# Stack the per-file frames into a single DataFrame with a fresh 0..n-1 index.
trips = pd.concat(trips_list, ignore_index=True)

In [15]:
# Sanity check: (rows, columns) of the combined trips frame.
trips.shape

(1184868, 20)

In [16]:
# Optional local snapshot of the combined trips; left disabled.
# trips.to_csv("taxi_trips.csv")

In [17]:
# Rebuild the list from scratch so that re-running this cell does not append
# the same files a second time.
weather_list = []

# A single list call returns at most 1000 keys, so page through the listing.
weather_paginator = s3.get_paginator("list_objects_v2")

for page in weather_paginator.paginate(Bucket=bucket, Prefix=weather_path):
    # "Contents" is missing from the response when no key matches the prefix.
    for file in page.get("Contents", []):
        weather_key = file["Key"]
        filename = weather_key.split("/")[-1]

        # Skips the folder placeholder key (empty file name) and non-csv objects.
        if not filename.endswith(".csv"):
            continue

        # The file names carry a YYYY-MM-DD date, so plain string comparison
        # selects every file dated after the cutoff.
        if filename > "weather_2024-04-20.csv":
            weather_daily = read_csv_from_s3(bucket, weather_path, filename)

            weather_list.append(weather_daily)
            print(f"{filename} has been added.")

weather_2024-04-21.csv has been added.
weather_2024-04-22.csv has been added.
weather_2024-04-23.csv has been added.
weather_2024-04-24.csv has been added.
weather_2024-04-25.csv has been added.
weather_2024-04-26.csv has been added.
weather_2024-04-27.csv has been added.
weather_2024-04-28.csv has been added.
weather_2024-04-29.csv has been added.
weather_2024-04-30.csv has been added.
weather_2024-05-01.csv has been added.
weather_2024-05-02.csv has been added.
weather_2024-05-03.csv has been added.
weather_2024-05-04.csv has been added.
weather_2024-05-05.csv has been added.
weather_2024-05-06.csv has been added.
weather_2024-05-07.csv has been added.
weather_2024-05-08.csv has been added.
weather_2024-05-09.csv has been added.
weather_2024-05-10.csv has been added.
weather_2024-05-11.csv has been added.
weather_2024-05-12.csv has been added.
weather_2024-05-13.csv has been added.
weather_2024-05-14.csv has been added.
weather_2024-05-15.csv has been added.
weather_2024-05-16.csv ha

In [18]:
# Stack the per-file weather frames into a single DataFrame with a fresh 0..n-1 index.
weather = pd.concat(weather_list, ignore_index=True)

In [19]:
# Sanity check: (rows, columns) of the combined weather frame.
weather.shape

(1584, 5)

In [20]:
# Optional local snapshot of the combined weather data; left disabled.
# weather.to_csv("weather.csv")

#### Join the trips with weather and the dimension tables

In [None]:
# Attach the weather readings to every trip. The inner join keeps only trips
# with a matching weather row; the weather-side join key is dropped afterwards.
trips_full = trips.merge(
    weather,
    how="inner",
    left_on="datetime_for_weather",
    right_on="datetime",
).drop(columns=["datetime"])

In [None]:
# Bring in the company attributes, then drop the id column used for the join.
trips_full = (
    trips_full
    .merge(company, on="company_id", how="inner")
    .drop(columns=["company_id"])
)

In [None]:
# Bring in the payment type attributes, then drop the id column used for the join.
trips_full = (
    trips_full
    .merge(payment_type, on="payment_type_id", how="inner")
    .drop(columns=["payment_type_id"])
)

In [None]:
# Resolve the pickup area id to its community name, drop both join keys and
# label the name column as the pickup side.
trips_full = (
    trips_full
    .merge(
        community_areas,
        how="inner",
        left_on="pickup_community_area_id",
        right_on="area_code",
    )
    .drop(columns=["pickup_community_area_id", "area_code"])
    .rename(columns={"community_name": "pickup_community_area_name"})
)

In [None]:
# Resolve the dropoff area id to its community name, drop both join keys and
# label the name column as the dropoff side.
trips_full = (
    trips_full
    .merge(
        community_areas,
        how="inner",
        left_on="dropoff_community_area_id",
        right_on="area_code",
    )
    .drop(columns=["dropoff_community_area_id", "area_code"])
    .rename(columns={"community_name": "dropoff_community_area_name"})
)

In [None]:
# Parse both sides of the upcoming date join as datetimes.
date["date"] = pd.to_datetime(date["date"])

trip_start = pd.to_datetime(trips_full["trip_start_timestamp"])
trips_full["trip_start_timestamp"] = trip_start

# Calendar day of the trip start, converted back to a datetime column so it
# can be matched against date["date"].
trips_full["trip_start_date"] = pd.to_datetime(trip_start.dt.date)

In [None]:
# Attach the date dimension by the trip's start day, then drop the
# dimension-side join key.
trips_full = (
    trips_full
    .merge(date, how="inner", left_on="trip_start_date", right_on="date")
    .drop(columns=["date"])
)

In [None]:
# Preview the first rows of the fully joined frame.
trips_full.head()

#### Visualisations

In [None]:
# seaborn is imported at the top of the notebook; if it is missing, install it first:
# pip install seaborn

In [None]:
# 1 - Histogram of Trip Durations

# Compute the 30 bins inside the displayed range only. Without `binrange` the
# bins are spread over the full range including the outliers, so only a few
# very wide bars remain visible once the axis is cropped to 0-5000.
sns.histplot(trips_full['trip_seconds'], bins=30, binrange=(0, 5000), kde=False)
plt.title('Histogram of Trip Durations')
plt.xlabel('Trip Duration (seconds)')
plt.ylabel('Count')
plt.xlim(0, 5000)  # Show the same range the bins were computed on; outliers are left out
plt.show()

In [None]:
# 2 - Scatter Plot of Trip Miles vs. Fare

fare_ax = sns.scatterplot(data=trips_full, x='trip_miles', y='fare')
fare_ax.set_title('Scatter Plot of Trip Miles vs. Fare')
fare_ax.set_xlabel('Trip Miles')
fare_ax.set_ylabel('Fare')
# Crop the x-axis so the bulk of the trips is readable; outliers fall outside the view.
fare_ax.set_xlim(0, 30)
plt.show()

In [None]:
# 3 - Average Fare by Day of the Week

# Calendar order for the bars; without an explicit order the days are not
# guaranteed to appear Monday to Sunday.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

trips_full['day_of_week'] = trips_full['trip_start_timestamp'].dt.day_name()
# `errorbar=None` is the replacement for the deprecated `ci=None` (seaborn >= 0.12);
# it draws the mean bars without error bars.
sns.barplot(x='day_of_week', y='fare', data=trips_full, order=weekday_order, errorbar=None)
plt.title('Average Fare by Day of the Week')
plt.xlabel('Day of the Week')
plt.ylabel('Average Fare')
plt.show()

In [None]:
# 4 - Count Plot of Payment Types

payment_count_ax = sns.countplot(data=trips_full, x='payment_type')
payment_count_ax.set_title('Count Plot of Payment Types')
payment_count_ax.set_xlabel('Payment Type')
payment_count_ax.set_ylabel('Count')
plt.show()


In [None]:
# 5 - Average Trip Miles by Payment Type

payment_miles_ax = sns.barplot(data=trips_full, x='payment_type', y='trip_miles')
payment_miles_ax.set_title('Average Trip Miles by Payment Type')
payment_miles_ax.set_xlabel('Payment Type')
payment_miles_ax.set_ylabel('Average Trip Miles')
plt.show()

In [None]:
# 6 - Box Plot of Trip Distances by Pickup Community Area

# Boxes are ordered by number of trips per pickup area, busiest first.
pickup_area_order = trips_full['pickup_community_area_name'].value_counts().index

plt.figure(figsize=(14, 8))
pickup_ax = sns.boxplot(
    data=trips_full,
    x='pickup_community_area_name',
    y='trip_miles',
    order=pickup_area_order,
)
pickup_ax.set_title('Box Plot of Trip Distances by Pickup Community Area')
pickup_ax.set_xlabel('Pickup Community Area')
pickup_ax.set_ylabel('Trip Miles')
plt.xticks(rotation=90)
plt.show()