In [None]:

# A bike hire scheme consists of a number of bike hire stations from which bikes can be rented.
# A CSV report (no headers, no specified sort order) can be produced containing the history of bike
# movements over a specified period.

# FILEPATH = "bike.csv"

# File Format:
# Station ID: Integer, representing the bike hire station.

# Bike ID: Integer, representing the bike itself.

# Arrival Datetime: Datetime in format YYYYMMDDThh:mm:ss, representing the date/time the
# bike arrived at the station. It is empty if the bike was at this station at the start
# of the reporting period.

# Departure Datetime: Datetime in format YYYYMMDDThh:mm:ss, representing the date/time the
# bike departed from the station. It is empty if the bike was at this station at the
# end of the reporting period.

# Example Line 1:
# Bike 102 was docked (arrived) at station 22 at 2015-03-04 13:04 and was rented out again (departed)
# at 2015-03-04 13:25:32
        
#         22,102,20150304T13:04:00,20150304T13:25:32
                
# Example Line 2:
# Bike 34 was already at station 4 at the start of the reporting period, and was first rented out at
# 2015-03-01 05:15:08

#         4,34,,20150301T05:15:08




import pandas as pd
import numpy as np
import datetime

# Load the headerless movement report, naming its four columns explicitly.
COLUMN_NAMES = ['station_id', 'bike_id', 'arrival_time', 'departure_time']
df = pd.read_csv('bikes.csv', names=COLUMN_NAMES)

# Parse both timestamp columns using the report's compact datetime layout.
TIMESTAMP_FORMAT = '%Y%m%dT%H:%M:%S'
for time_column in ('arrival_time', 'departure_time'):
    df[time_column] = pd.to_datetime(df[time_column], format=TIMESTAMP_FORMAT)

# Bike ids in order of first appearance (collected before the rows are sorted).
distinct_bikes_id = df['bike_id'].unique()

# Order rows chronologically by arrival; rows without an arrival time go first.
df.sort_values(by='arrival_time', na_position='first', inplace=True)

def calculate_mean_journey_duration(df):
    """Compute, record and print the mean journey duration across all bikes.

    A journey is the interval between a bike's departure from one station and
    that same bike's next arrival at a station. For every row the duration of
    the journey that ended with that arrival is written, in seconds, to the
    'journey_duration[s]' column of ``df`` (NaN when the row has no preceding
    departure for that bike, e.g. the bike was already docked at the start of
    the reporting period).

    Args:
        df: DataFrame with columns 'bike_id', 'arrival_time' and
            'departure_time'; the two time columns must already be
            datetime64 (missing values as NaT). Rows may be in any order and
            the index may contain duplicate labels.

    Returns:
        The mean journey duration as a ``datetime.timedelta``, or ``None``
        when the data contains no completed journey.
    """
    # Work on a positionally indexed copy so the computation depends neither
    # on the caller's row order nor on the uniqueness of the caller's index.
    movements = df[['bike_id', 'arrival_time', 'departure_time']].reset_index(drop=True)

    # Chronological order by arrival; rows with no arrival (bike present at
    # the start of the period) come first. mergesort keeps ties deterministic.
    movements = movements.sort_values(
        'arrival_time', na_position='first', kind='mergesort'
    )

    # For each row, the departure time of the same bike's previous stop.
    previous_departure = (
        movements.groupby('bike_id', sort=False)['departure_time'].shift(1)
    )
    journey = movements['arrival_time'] - previous_departure

    # Back to the caller's row order, then assign positionally.
    seconds = journey.dt.total_seconds().sort_index()
    df['journey_duration[s]'] = seconds.to_numpy()

    # Without any journey the mean is NaN, which timedelta() cannot represent.
    if not seconds.notna().any():
        print('Mean journey duration cannot be computed: no completed journeys found')
        return None

    mean_duration = datetime.timedelta(seconds=float(seconds.mean()))
    print('Mean journey duration, across all bikes and stations is ', str(mean_duration))
    return mean_duration
    
# Benchmarking hook (IPython magic), left disabled:
# %timeit calculate_mean_journey_duration(df)

# Run the calculation on the loaded report: this adds the 'journey_duration[s]'
# column to df and prints the mean journey duration.
calculate_mean_journey_duration(df)