## Importing the necessary libraries

In [1]:
import os
from datetime import datetime

import pyarrow.dataset as ds

import pyarrow.compute as pc
import pyarrow.parquet as pq

## Configuration of AWS to get access to the dataset

In [13]:
### Installing the AWS command line interface
#pip install awscli

In [2]:
### indicating the ini file fot AWS configuration (The ini file has already been uploaded manually.)
!set AWS_SHARED_CREDENTIALS_FILE=awscli.ini
path = "awscli.ini"
os.environ['AWS_SHARED_CREDENTIALS_FILE'] = path

### Checking the configuration to make sure all good!
!aws configure list

      Name                    Value             Type    Location
      ----                    -----             ----    --------
   profile                <not set>             None    None
access_key     ****************ATZI shared-credentials-file    
secret_key     ****************e3kB shared-credentials-file    
    region                us-west-1      config-file    ~/.aws/config


## Loading the dataset

In [3]:
%%time

dataset = ds.dataset("s3://ursa-labs-taxi-data/", partitioning=["year", "month"])

CPU times: total: 375 ms
Wall time: 5.32 s


## Questions

### 3. Which time (hour) of the day has the highest tip?

In [4]:
%%time
tips_count = [0 for _ in range(24)]
tips_amount = [0 for _ in range(24)]

for fragment in dataset.get_fragments():
    table = fragment.to_table(columns=['dropoff_at', 'tip_amount'])

    table_row_size = table.to_batches()[0].num_rows
    sorted_indices = pc.sort_indices(table, sort_keys=[("dropoff_at", "ascending")])
    sorted_table = table.take(sorted_indices)

    pq.write_table(sorted_table, 'optimized_parquet_file.parquet', row_group_size=table_row_size)

    optimized_parquet_file = pq.ParquetFile('optimized_parquet_file.parquet')
    dropoff_at_col_idx = 0
    for i in range(optimized_parquet_file.num_row_groups):
        dropoff_at = optimized_parquet_file.read_row_group(i)['dropoff_at']
        tip_amount = optimized_parquet_file.read_row_group(i)['tip_amount']

        for j in range(len(dropoff_at)):

            hour = dropoff_at[j].as_py().hour
            tips_amount[hour] += tip_amount[j].as_py()
            tips_count[hour] += 1

average_hourly_tips = []
for i in range(24):
    average_hourly_tip = tips_amount[i] / tips_count[i]
    print(f'Hour {i}: {average_hourly_tip}')
    average_hourly_tips.append(average_hourly_tip)

print(f'The Hour with the highest tip is {average_hourly_tips.index(max(average_hourly_tips))}')

Hour 0: 1.432907006747444
Hour 1: 1.3652687854440366
Hour 2: 1.2802249481451455
Hour 3: 1.2390465738022327
Hour 4: 1.2722558943116409
Hour 5: 1.554707271017764
Hour 6: 1.347670535721485
Hour 7: 1.2530356272411751
Hour 8: 1.302820257153362
Hour 9: 1.3345102695842688
Hour 10: 1.260719438757078
Hour 11: 1.2291265633062165
Hour 12: 1.22455353190345
Hour 13: 1.230690982703977
Hour 14: 1.2538146552447396
Hour 15: 1.2798352445067427
Hour 16: 1.3657668892470693
Hour 17: 1.3445425991978914
Hour 18: 1.3402874965509548
Hour 19: 1.3648568126605471
Hour 20: 1.344923418332552
Hour 21: 1.3923077793321619
Hour 22: 1.4236391297986528
Hour 23: 1.4367369366755929
The Hour with the highest tip is 5
CPU times: total: 58min 1s
Wall time: 1h 30min 46s
