In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import collections

In [2]:
# Loading the dataset
df = pd.read_csv('202107-citibike-tripdata.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
df.drop(columns=['ride_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng'], inplace=True)

In [4]:
# mofify some categorical columns

df['member_casual'] = df['member_casual'].astype('category')     
df['member_casual'] = df['member_casual'].cat.codes             # 1 - member, 0 - casual

print('member type:', pd.Series(df['member_casual']).unique())

member type: [1 0]


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3084537 entries, 0 to 3084536
Data columns (total 8 columns):
 #   Column              Dtype  
---  ------              -----  
 0   rideable_type       object 
 1   started_at          object 
 2   ended_at            object 
 3   start_station_name  object 
 4   start_station_id    float64
 5   end_station_name    object 
 6   end_station_id      object 
 7   member_casual       int8   
dtypes: float64(1), int8(1), object(6)
memory usage: 167.7+ MB


### Warm Up Questions

#### 1. compute and plot the duration of each ride in minutes

In [6]:
df['ended_at'] = pd.to_datetime(df['ended_at'])
df['started_at'] = pd.to_datetime(df['started_at'])

df['Ride_dur'] = df['ended_at'] - df['started_at']

In [7]:
df['Ride_dur_min'] = df['Ride_dur'] / np.timedelta64(1, 'm')
df['Ride_dur_min'].describe().round(2)

count    3084537.00
mean          18.02
std           83.16
min          -10.17
25%            6.25
50%           11.20
75%           20.00
max        48872.78
Name: Ride_dur_min, dtype: float64

In [8]:
print("Variance of the ride duration is:", df['Ride_dur_min'].var())

Variance of the ride duration is: 6915.028501301067


In [9]:
#  # plotting histogram plot
# %matplotlib inline            
# import matplotlib.pyplot as plt
# plt.figure(figsize=(31,10))

# sns.countplot(data=df, x="Ride_dur_min") 

#### 2. ride duration analysis

In [10]:
gr20 = df['Ride_dur_min'][df['Ride_dur_min'] > 20].count() 
prob20 = gr20 / len(df)
print ("The probability of ride duration greater than 20 is:", prob20)

The probability of ride duration greater than 20 is: 0.24975125926516686


#### 3. P(greater than 20|member)

In [11]:
n_member = df['member_casual'][df['member_casual'] == 1].count()
gr20_member = df['member_casual'][df['Ride_dur_min'] > 20].count()

cond_prob20 =  gr20_member / n_member
print ("The conditional probability of ride duration greater than 20 for members is:", cond_prob20)

The conditional probability of ride duration greater than 20 for members is: 0.3811424373084373


#### 4. P(member|duration>25)

In [12]:
gr25 = df['Ride_dur_min'][df['Ride_dur_min'] > 25].count()
gr25_member = df['member_casual'][df['Ride_dur_min'] > 25][df['member_casual'] == 1].count()

cond_prob25 =  gr25_member / gr25
print ("The conditional probability of members given the ride duration greater than 25 is:", cond_prob25)

The conditional probability of members given the ride duration greater than 25 is: 0.4828191891922421


### Project

#### manipulate time

In [13]:
# identify the start hour and weekday

df['start hour'] = df['started_at'].dt.hour.astype('int')
df['start min'] = df['started_at'].dt.minute.astype('int')
df['weekday'] = df['started_at'].dt.weekday.astype('category')

In [14]:
# delete weekends

indexNames = df[df['weekday'].isin([5, 6])].index
df.drop(indexNames, inplace=True)

In [15]:
# use start hour to represent the block of the day
# morning -> 1 and afternoon -> 0

df['mor/aft'] = np.where(df['start hour'] < 12, 1, 0)

In [16]:
# split morning/afternoon into blocks based on 10 min

df['period'] = df.apply(lambda row: row['start hour'] % 12 * 6 + row['start min']//10 + 1, axis=1)

#### manipulate stations

In [18]:
# select three most popular stations based on the frequency of bike going out

counter = collections.Counter(df["start_station_name"])
most_common_stations = [each[0] for each in counter.most_common(3)]

most_common_ids = [df.loc[df['start_station_name'] == each, 'start_station_id'].iloc[0] for each in most_common_stations]
most_common_stations, most_common_ids

(['W 21 St & 6 Ave', 'E 17 St & Broadway', 'West St & Chambers St'],
 [6140.05, 5980.07, 5329.03])

In [19]:
for i, id in enumerate(most_common_ids):
    df.loc[df['start_station_id'] == id, 'start_station_id'] = i
    df.loc[df['end_station_id'] == id, 'end_station_id'] = i 

# 'W 21 St & 6 Ave' -> 0
# 'E 17 St & Broadway' -> 1
# '7 Ave & Central Park South' -> 2

In [20]:
# keep only the top 3 stations
df = df[ ((df['start_station_id'] == 0) | (df['end_station_id'] == 0))
                | ((df['start_station_id'] == 1) | (df['end_station_id'] == 1))
                | ((df['start_station_id'] == 2) | (df['end_station_id'] == 2)) ]


#### remove duration outliers

In [21]:
std, mean = df['Ride_dur_min'].describe()['std'], df['Ride_dur_min'].describe()['mean']

In [22]:
df = df[df['Ride_dur_min'] <= mean + 2 * std]

#### calculate freq

In [23]:
def LB(MA, id, numOfBikes, capacity):
    # MA: morning (1) or afternoon (0)
    # id: station id 0, 1, 2
    freq_matrix = np.zeros([capacity+1, capacity+1])
    
    pre_bikes = numOfBikes
    for i in range(72):
        ave_leaving = df[ (df['period'] == i) & (df['start_station_id'] == id) & (df['mor/aft'] == MA) ].shape[0] // 22
        ave_arriving = df[ (df['period' ] == i) & (df['end_station_id'] == id) & (df['mor/aft'] == MA) ].shape[0] // 22
        cur_bikes = pre_bikes + ave_arriving - ave_leaving
        
        if cur_bikes < 0:
            cur_bikes = 0
        elif cur_bikes > capacity:
            cur_bikes = capacity
        
        freq_matrix[pre_bikes][cur_bikes] += 1
        pre_bikes = cur_bikes
        # print("Period %d -> %d left, %d arrived, %d available at %d." % (i, ave_leaving, ave_arriving, cur_bikes, id))
   
    return freq_matrix


In [24]:
def describe(m):
    for i, row in enumerate(m):
        for j, col in enumerate(row):
            if col > 0:
                print('The freq from %d to %d is %d' % (i, j, col))
                

In [25]:
cap = [50, 66, 39]

In [26]:
fm00 = LB(0, 0, cap[0], cap[0])

In [27]:
fm10 = LB(1, 0, cap[0], cap[0])

In [28]:
fm01 = LB(0, 1, cap[1], cap[1])

In [29]:
fm11 = LB(1, 1, 0, cap[1])

In [30]:
fm02 = LB(0, 2, cap[2], cap[2])

In [31]:
fm12 = LB(1, 2, 0, cap[2])

In [32]:
fm = [fm00, fm10, fm01, fm11, fm02, fm12]

#### freq 2 prob

In [33]:
def freq2prob(freq_matrix):
    # convert frequencies in each matrix to transition probabilities
    re = np.zeros(freq_matrix.shape)
    for i, row in enumerate(freq_matrix):
        if sum(row) > 0:    
            for j, col in enumerate(row):
                re[i][j] = col / sum(row)

    return re

In [34]:
pm = []

In [35]:
for each in fm:
    pm.append(freq2prob(each))

 #### stationary probability

In [36]:
def staionary_prob(transition_prob, control):
    # control 0 -> start with empty station
    # control -1 -> start with full station
    s = transition_prob.shape[0]   
    initial_state = np.zeros([s,])
    initial_state[control] = 1

    pmn = np.linalg.matrix_power(transition_prob, 1000)
    pi = np.matmul(initial_state, pmn)
    
    return pi


In [37]:
sp = []
sp.append(staionary_prob(pm[0], -1))
sp.append(staionary_prob(pm[1], -1))
sp.append(staionary_prob(pm[2], -1))
sp.append(staionary_prob(pm[3], 0))
sp.append(staionary_prob(pm[4], -1))
sp.append(staionary_prob(pm[5], 0))

In [38]:
for i, each in enumerate(sp):
    # print("Stationary prob for %d is:" %i, each)
    print("The non-zero stationary probability for %d is:" %i)
    for j, p in enumerate(each):
        if round(p, 4) > 0:
            print("\tstate %d with prob %.5f" % (j, p))
    print("Sum of stationary prob: %.2f" % sum(each))
    print('\n')


The non-zero stationary probability for 0 is:
	state 0 with prob 1.00000
Sum of stationary prob: 1.00


The non-zero stationary probability for 1 is:
	state 0 with prob 1.00000
Sum of stationary prob: 1.00


The non-zero stationary probability for 2 is:
	state 12 with prob 1.00000
Sum of stationary prob: 1.00


The non-zero stationary probability for 3 is:
	state 42 with prob 0.57143
	state 43 with prob 0.14286
	state 44 with prob 0.28571
Sum of stationary prob: 1.00


The non-zero stationary probability for 4 is:
	state 0 with prob 0.93750
	state 1 with prob 0.06250
Sum of stationary prob: 1.00


The non-zero stationary probability for 5 is:
	state 0 with prob 0.72222
	state 1 with prob 0.04167
	state 2 with prob 0.01389
	state 3 with prob 0.04167
	state 4 with prob 0.02778
	state 5 with prob 0.08333
	state 6 with prob 0.06944
Sum of stationary prob: 1.00


