### This script contains:

#### 1. Import libraries & dataset
#### 2. Data Checks and Wrangling
#### 3. Flags to determine which Hires went over the free allowance
####  - Customer/Subscriber Subsets
####  - Exporting and re-importing the 2 datasets to get around the pandas "copy of a slice from a DataFrame" warning (SettingWithCopyWarning)
####  - Over Allowance Subsets
####  - Total Minutes over allowance

## 1. Import libraries & dataset

In [1]:
#Import libraries 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import os

In [2]:
#Root folder of the project, kept as a raw string so the backslashes are not treated as escapes.
#NOTE(review): the location is specific to one machine - change it before running elsewhere.
path = r'C:\Users\willm\Dropbox\1 Data Analytics Course\1 New York Citibike Hire'

In [3]:
#Load the merged hire dataset from the prepared-data folder.
#NOTE(review): pickles should only be read from trusted sources.
merged_pickle = os.path.join(path, '02 Data', 'Prepared Data', 'NYB2020merged.pkl')
NYB2020 = pd.read_pickle(merged_pickle)

In [4]:
#Remove the column limit so pandas displays every column of a DataFrame
pd.options.display.max_columns = None

In [5]:
#Show 10 decimal places so the Latitudes and Longitudes are displayed in full
pd.options.display.precision = 10

In [6]:
#IPython magic: draw matplotlib figures inline, directly below the cell that creates them
%matplotlib inline

## 2. Data Checks and Wrangling 

In [7]:
#Overview of the row count, the columns and their dtypes
NYB2020.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19506857 entries, 0 to 19506856
Data columns (total 25 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   tripduration             int64  
 1   starttime                object 
 2   stoptime                 object 
 3   start_station_id         int16  
 4   start_station_name       object 
 5   start_station_latitude   float64
 6   start_station_longitude  float64
 7   end_station_id           int16  
 8   end_station_name         object 
 9   end_station_latitude     float64
 10  end_station_longitude    float64
 11  bikeid                   int32  
 12  usertype                 object 
 13  birth_year               int16  
 14  gender                   int8   
 15  start_hour               int8   
 16  start_date2              object 
 17  temp                     int8   
 18  rain_snow                float16
 19  day_of_week              object 
 20  day_of_week_number       int8   
 21  month 

In [8]:
#Store TripMins as 32-bit integers
NYB2020['TripMins'] = NYB2020['TripMins'].astype(np.int32)

In [9]:
#Re-checking the dtypes after the TripMins conversion
NYB2020.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19506857 entries, 0 to 19506856
Data columns (total 25 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   tripduration             int64  
 1   starttime                object 
 2   stoptime                 object 
 3   start_station_id         int16  
 4   start_station_name       object 
 5   start_station_latitude   float64
 6   start_station_longitude  float64
 7   end_station_id           int16  
 8   end_station_name         object 
 9   end_station_latitude     float64
 10  end_station_longitude    float64
 11  bikeid                   int32  
 12  usertype                 object 
 13  birth_year               int16  
 14  gender                   int8   
 15  start_hour               int8   
 16  start_date2              object 
 17  temp                     int8   
 18  rain_snow                float16
 19  day_of_week              object 
 20  day_of_week_number       int8   
 21  month 

## 3. Flags to determine which Hires went over the free allowance
#### I am deliberately running Customers and Subscribers one after another, step by step.
#### this allows me to spot issues
##### - Customers get 30 mins free, per trip - Subscribers get 45 mins free, per trip.

### - Customer/Subscriber Subsets

In [11]:
#Keep only the hires made by Customers
NYB2020_Cust = NYB2020.loc[NYB2020['usertype'].eq('Customer')]

In [12]:
#Keep only the hires made by Subscribers
NYB2020_Sub = NYB2020.loc[NYB2020['usertype'].eq('Subscriber')]

In [13]:
#Counting each usertype in the original dataset, to confirm the sizes of the new subsets
NYB2020['usertype'].value_counts(ascending=True, dropna=False)

Customer       4551091
Subscriber    14955766
Name: usertype, dtype: int64

In [14]:
#The row count should equal the 'Customer' total from value_counts on the full dataset
NYB2020_Cust.shape

(4551091, 25)

In [15]:
#The row count should equal the 'Subscriber' total from value_counts on the full dataset
NYB2020_Sub.shape

(14955766, 25)

In [16]:
#Previewing the last 20 rows of just 2 columns, to check that the subset only holds Customers
preview_columns = ['usertype', 'start_date2']
NYB2020_Cust[preview_columns].tail(20)

Unnamed: 0,usertype,start_date2
19506772,Customer,2020-12-31
19506776,Customer,2020-12-31
19506778,Customer,2020-12-31
19506781,Customer,2020-12-31
19506783,Customer,2020-12-31
19506789,Customer,2020-12-31
19506790,Customer,2020-12-31
19506791,Customer,2020-12-31
19506794,Customer,2020-12-31
19506796,Customer,2020-12-31


### - Exporting and re-importing the 2 datasets to get around the pandas "copy of a slice from a DataFrame" warning (SettingWithCopyWarning).

In [19]:
#Writing the Customer subset to a pickle in the prepared-data folder
cust_pickle = os.path.join(path, '02 Data', 'Prepared Data', 'NYB2020_Cust.pkl')
NYB2020_Cust.to_pickle(cust_pickle)

In [20]:
#Reading the Customer pickle back in as a separate DataFrame
cust_pickle = os.path.join(path, '02 Data', 'Prepared Data', 'NYB2020_Cust.pkl')
NYB2020_Custs = pd.read_pickle(cust_pickle)

In [22]:
#Writing the Subscriber subset to a pickle in the prepared-data folder
sub_pickle = os.path.join(path, '02 Data', 'Prepared Data', 'NYB2020_Sub.pkl')
NYB2020_Sub.to_pickle(sub_pickle)

In [23]:
#Reading the Subscriber pickle back in as a separate DataFrame
sub_pickle = os.path.join(path, '02 Data', 'Prepared Data', 'NYB2020_Sub.pkl')
NYB2020_Subs = pd.read_pickle(sub_pickle)

### - Over Allowance Subsets

In [24]:
#Flagging every Customer hire: 'Yes' when TripMins is above the 30 minute allowance, otherwise 'No'
NYB2020_Custs['Over_Allowance'] = np.where(NYB2020_Custs['TripMins'] > 30, 'Yes', 'No')

In [25]:
#Flagging every Subscriber hire: 'Yes' when TripMins is above the 45 minute allowance, otherwise 'No'
NYB2020_Subs['Over_Allowance'] = np.where(NYB2020_Subs['TripMins'] > 45, 'Yes', 'No')

In [26]:
#Keep only the Customer hires flagged as over the allowance
NYB2020_Custs2 = NYB2020_Custs.loc[NYB2020_Custs['Over_Allowance'].eq('Yes')]

In [27]:
#Non-null count per column for the Customer hires that went over the allowance
NYB2020_Custs2.count()

tripduration               1140935
starttime                  1140935
stoptime                   1140935
start_station_id           1140935
start_station_name         1140935
start_station_latitude     1140935
start_station_longitude    1140935
end_station_id             1140935
end_station_name           1140935
end_station_latitude       1140935
end_station_longitude      1140935
bikeid                     1140935
usertype                   1140935
birth_year                 1140935
gender                     1140935
start_hour                 1140935
start_date2                1140935
temp                       1140935
rain_snow                  1140935
day_of_week                1140935
day_of_week_number         1140935
month                      1140935
month_number               1140935
TripMins                   1140935
age                        1140935
Over_Allowance             1140935
dtype: int64

In [28]:
#Keep only the Subscriber hires flagged as over the allowance
NYB2020_Subs2 = NYB2020_Subs.loc[NYB2020_Subs['Over_Allowance'].eq('Yes')]

In [29]:
#Non-null count per column for the Subscriber hires that went over the allowance
NYB2020_Subs2.count()

tripduration               389467
starttime                  389467
stoptime                   389467
start_station_id           389467
start_station_name         389467
start_station_latitude     389467
start_station_longitude    389467
end_station_id             389467
end_station_name           389467
end_station_latitude       389467
end_station_longitude      389467
bikeid                     389467
usertype                   389467
birth_year                 389467
gender                     389467
start_hour                 389467
start_date2                389467
temp                       389467
rain_snow                  389467
day_of_week                389467
day_of_week_number         389467
month                      389467
month_number               389467
TripMins                   389467
age                        389467
Over_Allowance             389467
dtype: int64

### - Total Minutes over allowance

In [30]:
#Sum of TripMins across the Customer hires flagged as over the allowance.
#NOTE(review): this is the full length of those hires, so it still includes the
#30 free minutes of each one - it is not yet the number of minutes over the allowance.
NYB2020_Custs2['TripMins'].sum()

120403993

In [31]:
#Sum of TripMins across the Subscriber hires flagged as over the allowance.
#NOTE(review): this is the full length of those hires, so it still includes the
#45 free minutes of each one - it is not yet the number of minutes over the allowance.
NYB2020_Subs2['TripMins'].sum()

47315601

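Adding these 2 numbers together gives the total minutes ridden on the hires that went over the allowance. These totals still include each hire's free minutes (30 for Customers, 45 for Subscribers), which have to be subtracted to get the number of minutes Users have actually used over their allowance.
Multiply the minutes over the allowance by 15 cents and we get the total amount of 'extra' money Citibike took in.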

In [None]:
#Estimating the 'extra' money taken in, using the sums and counts recorded in the outputs above.
#The recorded sums (120,403,993 and 47,315,601) are the full lengths of the over-allowance hires,
#so the free minutes of every hire are removed first: 30 minutes for each of the
#1,140,935 Customer hires and 45 minutes for each of the 389,467 Subscriber hires.
cust_minutes_over = 120_403_993 - 30 * 1_140_935    #86,175,943
subs_minutes_over = 47_315_601 - 45 * 389_467    #29,789,586
total_minutes_over = cust_minutes_over + subs_minutes_over    #115,965,529

#Each minute over the allowance is charged at 15 cents
extra_revenue = total_minutes_over * 0.15    #roughly 17,394,829.35
extra_revenue