In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
dfs = pd.read_csv('stations.csv')
dft = pd.read_csv('trips.csv')

# Q1. What was the duration of the longest trip? What was the average trip duration? What was the smallest trip duration?

In [3]:
q1_max = dft['duration'].agg('max')
q1_min = dft['duration'].agg('min')
q1_avg = dft['duration'].agg('mean') 
print(f'Max = {q1_max}, Min = {q1_min}, Avg = {q1_avg}')

Max = 9999, Min = 0, Avg = 912.4096819046612


# Q2. How many trips were taken by 'Registered' users?

In [4]:
q2 = dft[dft['sub_type'] == 'Registered']['id'].agg('count')
q2

1105192

# Q3. How many trips were taken by male users in comparison to Female users? for Registered users only

In [5]:
q3 = dft[dft['sub_type'] == 'Registered'].groupby('gender')['id'].agg('count')
q3

gender
Female      271333
Female\n         1
Male        833858
Name: id, dtype: int64

# Q4. Do registered or casual users take longer trips?

In [6]:
q4 = dft.groupby('sub_type')['duration'].agg('mean')
q4

sub_type
Casual        1519.643897
Registered     657.026067
Name: duration, dtype: float64

# Q5. Which bike was used for the most trips?

In [7]:
q5 = dft.groupby('bike_number')['id'].agg('count').sort_values(ascending=False)
q5.head(1)

bike_number
B00490    2120
Name: id, dtype: int64

# Q6. What is the average duration of trips by users over the age of 30?

In [8]:
now = pd.to_datetime('now').year
q6_0 = dft.copy()
q6_0['age'] = now - q6_0['birth_date']
q6 = q6_0[ (q6_0['sub_type'] == 'Registered') &  (q6_0['age'] > 30)]['duration'].agg('mean')
q6

659.6764337335354

# Q7. Which stations are most frequently used for round trips?

In [9]:
q7 = dft.merge(dfs, left_on = 'start_station', right_on = 'id')
q7 = q7[ q7['start_station'] == q7['end_station']].groupby('station')['station'].agg('count').sort_values(ascending = False)
q7.head(5)

station
The Esplanade - Beacon St. at Arlington St.      3064
Charles Circle - Charles St. at Cambridge St.    2739
Boston Public Library - 700 Boylston St.         2548
Boylston St. at Arlington St.                    2163
Beacon St / Mass Ave                             2144
Name: station, dtype: int64

# Q8. How many trips start and end in different municipalities? Station Table has the Municipality Attribute

In [10]:
q8 = dft.merge(dfs, left_on = 'start_station', right_on = 'id')
q8 = q8.merge(dfs, left_on = 'end_station', right_on = 'id')
q8 = q8[ q8['municipality_x'] !=  q8['municipality_y']]['station_x'].agg('count')
q8

309748

# Q9. How many trips incurred additional fees (lasted longer than 30 minutes)? 

In [11]:
q9 = dft[ dft['duration']/60 > 30]['id'].agg('count')
q9

123155

# Q10. Which bike was used for the longest total time? Provide the answer in Hours

In [12]:
q10 = dft.groupby('bike_number')['duration'].agg('sum').sort_values(ascending = False)
#q10['duration'] = q10['duration']/ 3600
(q10/3600).head(1)

bike_number
B00490    571.845278
Name: duration, dtype: float64

# Q11. Did registered or casual users take more round trips? A Round Trip is considered as Same Start and End Destination

In [13]:
q11 = dft[ dft['start_station'] == dft['end_station'] ].groupby('sub_type')['id'].agg('count')
q11

sub_type
Casual        41427
Registered    31635
Name: id, dtype: int64

# Q12. Which municipality had the most frequent Station End?

In [14]:
q12 = dft.merge(dfs, left_on = 'end_station', right_on = 'id')
q12 = q12.groupby('municipality')['id_x'].agg('count')
q12

municipality
Boston        1212364
Brookline       20677
Cambridge      297011
Somerville      39921
Name: id_x, dtype: int64

# Q13. Which From and To Routes are Frequent with Regards to Municipalities?

In [15]:
q13 = dft.merge(dfs, left_on = 'start_station', right_on = 'id')
q13 = q13.merge(dfs, left_on = 'end_station', right_on = 'id')
q13 = q13.groupby(['municipality_x', 'municipality_y'])['id_x'].agg('count').sort_values(ascending = False)
q13.head(5)

municipality_x  municipality_y
Boston          Boston            1081805
Cambridge       Cambridge          162538
Boston          Cambridge          110968
Cambridge       Boston             110078
                Somerville          20998
Name: id_x, dtype: int64

# Q14. Which Age Band has the most Rides? Calculate Age Bands based on interval of 10 and Cut Off by 70 or Above

In [16]:
def age_band(x):
    if x < 20:
        return '0 - 19'
    elif (x >= 20) and (x < 30):
        return '20-29'
    elif (x >= 30)  and (x< 40):
        return '30-39'
    elif (x >= 40) and (x < 50):
        return '40-49'
    elif (x >= 50) and (x < 60):
        return '50-59'
    elif (x >= 60) and (x < 70):
        return '70-69'
    elif (x >= 70) and (x < 130):
        return '70+'
    else:
        return 'Age Not Mentioned'

In [17]:
q14_0 = dft.copy()
now = pd.to_datetime('now').year
q14_0['age'] = now - q14_0['birth_date']
q14_0['age_band'] = q14_0['age'].map(age_band)
q14 = q14_0.groupby('age_band')['id'].agg('count').sort_values(ascending = False)
q14

age_band
Age Not Mentioned    1219985
30-39                 127764
40-49                 107600
50-59                  60527
70-69                  40883
70+                    10338
20-29                   2904
Name: id, dtype: int64

# Q15. What Day of the Week is most Popular for Rides? What Time (24 Hrs Format) is Popular on that Particular Date?

In [18]:
q15 = dft.copy()
q15['day_of_week'] = pd.DatetimeIndex(pd.to_datetime(q15['start_date'])).dayofweek
q15 = q15.replace({'day_of_week': {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}})
q15_1 = q15.groupby('day_of_week')['id'].agg('count').sort_values(ascending = False)
q15_1

day_of_week
Wednesday    237957
Thursday     233993
Tuesday      230931
Friday       230205
Monday       229302
Saturday     213708
Sunday       193905
Name: id, dtype: int64

In [19]:
q15['hour'] = pd.DatetimeIndex(pd.to_datetime(q15['start_date'])).hour 
q15.head(5)
q15_2 = q15[q15['day_of_week'] == q15_1.head(1).index[0]].groupby('hour')['id'].agg('count').sort_values(ascending = False)
[q15_1.head(1).index[0], q15_2.head(1)]

['Wednesday',
 hour
 17    29582
 Name: id, dtype: int64]

In [20]:
#q15_3 = q15_1.head(1).index[0]
#q15_3