In [1]:
import pandas as pd

# Read the source trip file (yellow_tripdata_2023-01) using the pyarrow engine.
df = pd.read_parquet(
    path='Data/yellow_tripdata_2023-01.parquet',
    engine='pyarrow',
)

### Change columns to datetime

In [2]:
# Pass the pickup timestamp column through pd.to_datetime and store it back.
df['tpep_pickup_datetime'] = df['tpep_pickup_datetime'].pipe(pd.to_datetime)
print("Tpep_pickup_datetime was successfully change to datetime")

Tpep_pickup_datetime was successfully change to datetime


In [3]:
# Pass the dropoff timestamp column through pd.to_datetime and store it back.
df['tpep_dropoff_datetime'] = df['tpep_dropoff_datetime'].pipe(pd.to_datetime)
print("Tpep_dropoff_datetime was successfully change to datetime")

Tpep_dropoff_datetime was successfully change to datetime


### Add 3 columns: trip_duration_minutes, pickup_hour, day_of_week

In [4]:
# Trip length in minutes: dropoff minus pickup, converted from seconds.
df['trip_duration_minutes'] = (
    df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
).dt.total_seconds() / 60
print("Trip duration in minutes was successfully added")

# Hour of the day (0-23) in which the trip started. `.dt.hour` is used rather
# than `.dt.time`, which would store the full HH:MM:SS time of day in a column
# that is named (and described) as an hour.
df['pickup_hour'] = df['tpep_pickup_datetime'].dt.hour

# Name of the weekday on which the trip started (e.g. "Monday").
df['day_of_week'] = df['tpep_pickup_datetime'].dt.day_name()

print('Successfully added three columns')


Trip duration in minutes was successfully added
Successfully added three columns


### Filter out trips that are shorter than 1 minute, have a distance <= 0.5 miles, or have fare_amount < 0


In [None]:
# Keep only rows that satisfy all three conditions: duration of at least one
# minute, trip_distance above 0.5, and a non-negative fare_amount.
df = df[
    df['trip_duration_minutes'].ge(1)
    & df['trip_distance'].gt(0.5)
    & df['fare_amount'].ge(0)
]
# Write the filtered rows to parquet without the index, then report.
df.to_parquet('Data/aggregated_trips.parquet', engine='pyarrow', index=False)
print("Aggregated data was saved to parquet")
print('New df with filtered routes was created ')

Aggregated data was saved to parquet
New df with filtered routes was created 


### Top 5 hours of the day with the highest average passenger count

In [44]:
# Bucket each pickup into the clock hour it falls in (e.g. 02:00:00).
# The lowercase 'h' alias is used because uppercase 'H' is deprecated in
# recent pandas releases.
df['full_hours'] = df['tpep_pickup_datetime'].dt.floor('h').dt.time

# Mean passenger count per hour bucket, highest first, top five only.
# The ranking is stored under its own name instead of rebinding `df`, so the
# trip-level frame (including `day_of_week`) stays available to later cells
# and the notebook survives Restart & Run All.
top_passenger_hours = (
    df.groupby(['full_hours'])
    .agg(avg_passenger=('passenger_count', 'mean'))
    .sort_values('avg_passenger', ascending=False)
    .head(5)
)

top_passenger_hours

Unnamed: 0_level_0,avg_passenger
full_hours,Unnamed: 1_level_1
02:00:00,1.443666
01:00:00,1.438017
00:00:00,1.426085
03:00:00,1.416356
22:00:00,1.411559


### Compute the average trip duration and average trip distance for each day of the week

In [None]:
# Per-weekday means of trip duration (minutes) and trip distance.
# The result gets its own name rather than overwriting `df`: rebinding `df`
# to the aggregate would discard the trip-level rows and make this cell fail
# when re-run, because the source columns would no longer exist.
weekday_averages = df.groupby(['day_of_week']).agg(
    avg_time=('trip_duration_minutes', 'mean'),
    avg_distance=('trip_distance', 'mean'),
)
print('Average time and distance for day of week was successfully created')
weekday_averages

Unnamed: 0_level_0,avg_time,avg_distance
day_of_week,Unnamed: 1_level_1,Unnamed: 2_level_1
Friday,16.611413,3.886419
Monday,16.006763,4.761626
Saturday,15.64733,3.90658
Sunday,15.818381,4.745719
Thursday,16.991849,3.73109
Tuesday,16.331239,3.936898
Wednesday,16.31654,3.481708


### Upload to AWS

In [7]:
import boto3

# S3 client; no credentials or region are passed explicitly here.
s3 = boto3.client('s3')

# Local file to send, and the bucket / object key it is stored under.
file_path = 'Data/aggregated_trips.parquet'
bucket_name = 'mypracawsbucketsc2'
s3_key = 'processed/aggregated_trips_task_6.parquet'

s3.upload_file(Filename=file_path, Bucket=bucket_name, Key=s3_key)
print("Upload completed.")

Upload completed.
