### This script contains:

#### 1. Import libraries & dataset
#### 2. Data Checks
#### 3. Overviews by Genders
#### 4. Overviews by User Types

## 1. Import libraries & dataset

In [1]:
import quandl
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import statsmodels.api as sm # Using .api imports the public access version of statsmodels, which is a library that handles 
# statistical models.
import os
import warnings # This is a library that handles warnings.

warnings.filterwarnings("ignore") # Disable deprecation warnings that could indicate, for instance, a suspended library or 
# feature.

# Apply the 'fivethirtyeight' matplotlib style sheet to all figures created after this point.
plt.style.use('fivethirtyeight')

In [2]:
# IPython magic: render matplotlib figures inline in the notebook output, so graphs are displayed
# without the need to "call" them specifically.
%matplotlib inline

In [3]:
# Project root folder as a usable string.
# Defaults to the original author's local folder, but can be overridden by setting the NYB_PROJECT_DIR
# environment variable, so the notebook can run on another machine without editing this cell.
path = os.environ.get(
    'NYB_PROJECT_DIR',
    r'C:\Users\willm\Dropbox\1 Data Analytics Course\1 New York Citibike Hire',
)

In [4]:
# Remove the column-display limit so that wide frames are shown with every column.
pd.options.display.max_columns = None

In [5]:
# Load the hire dataset from the project's "Prepared Data" folder
# (presumably the output of an earlier preparation script, going by the file name).
# NOTE: unpickling can execute arbitrary code, so only load pickle files you created or trust.
NYB2020_AgeGender = pd.read_pickle(
    os.path.join(
        path,
        '02 Data',
        'Prepared Data',
        'NYB2020_1day_Script2.pkl',
    )
)

## 2. Data Checks

In [6]:
# Summarise the loaded frame: index range, number of entries, and the dtype of each column.
NYB2020_AgeGender.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19487603 entries, 0 to 19506856
Data columns (total 30 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   tripduration             int64  
 1   starttime                object 
 2   stoptime                 object 
 3   start_station_id         int16  
 4   start_station_name       object 
 5   start_station_latitude   float64
 6   start_station_longitude  float64
 7   end_station_id           int16  
 8   end_station_name         object 
 9   end_station_latitude     float64
 10  end_station_longitude    float64
 11  bikeid                   int32  
 12  usertype                 object 
 13  birth_year               int16  
 14  gender                   int8   
 15  start_hour               int8   
 16  start_date2              object 
 17  temp                     int8   
 18  rain_snow                float16
 19  day_of_week              object 
 20  day_of_week_number       int8   
 21  month 

## 3. Overviews by Genders

In [16]:
# Remove the row-display limit so the per-age overviews below are printed in full (one row per age)
# instead of being truncated. This must be run before those cells.
pd.options.display.max_rows = None

In [11]:
# Number of entries per age for gender code 0 (0 = None, i.e. no gender given).
# The boolean match is cast to int and summed within each age group, so ages without any
# matching entries still appear in the result with a count of 0.
# NOTE(review): age 51 dominates this output (1,966,178 entries), which looks like a default/placeholder
# birth year rather than real riders - confirm before interpreting.
(
    NYB2020_AgeGender['gender']
    .eq(0)
    .astype(int)
    .groupby(NYB2020_AgeGender['age'])
    .sum()
)

age
16        130
17        109
18        280
19       1096
20       3671
21       1905
22       2531
23       1773
24       6209
25       6530
26       7369
27       7767
28       9351
29       8914
30      15216
31      10772
32       6147
33       6553
34       6034
35       4334
36       6709
37       6515
38       6337
39       4394
40       7293
41       3074
42       2503
43       2143
44       3924
45       3982
46       3162
47       2268
48       2662
49       1272
50       1572
51    1966178
52       3321
53       1344
54       1933
55       1427
56       4681
57        313
58       1097
59        828
60       1092
61        364
62        104
63        807
64        420
65        391
66        400
67         37
68         38
69         30
70        639
71        220
72         39
73          0
74        165
75          0
76         18
77        235
78          0
79         14
80         23
81        244
82          0
83          0
84          0
85         65
86          0
87

In [12]:
# Number of entries per age for gender code 1 (1 = Male).
# The boolean match is cast to int and summed within each age group, so ages without any
# matching entries still appear in the result with a count of 0.
(
    NYB2020_AgeGender['gender']
    .eq(1)
    .astype(int)
    .groupby(NYB2020_AgeGender['age'])
    .sum()
)

age
16      5835
17     25788
18     57491
19     89472
20    131039
21    126892
22    140848
23    201547
24    293813
25    388395
26    424114
27    452824
28    501547
29    517433
30    588514
31    514989
32    492250
33    425720
34    403646
35    385112
36    346070
37    325849
38    311566
39    280147
40    268794
41    244959
42    209511
43    213757
44    189380
45    195492
46    180198
47    176511
48    174172
49    182027
50    190852
51    175129
52    154290
53    170080
54    148051
55    153742
56    158279
57    136581
58    122148
59    112056
60    118098
61     93467
62     81907
63     80841
64     73083
65     62391
66     54998
67     52637
68     30588
69     30712
70     22814
71     16444
72     16789
73     15029
74     12133
75      6568
76      4614
77      7252
78      6601
79      4645
80      3189
81       871
82      1082
83       851
84      1954
85       651
86       359
87       158
88       327
89        25
90       477
91        78
92      

In [13]:
# Number of entries per age for gender code 2 (2 = Female).
# The boolean match is cast to int and summed within each age group, so ages without any
# matching entries still appear in the result with a count of 0.
(
    NYB2020_AgeGender['gender']
    .eq(2)
    .astype(int)
    .groupby(NYB2020_AgeGender['age'])
    .sum()
)

age
16      2529
17      8056
18     20877
19     33856
20     50899
21     59015
22     77893
23    131080
24    191780
25    241178
26    259442
27    266709
28    292062
29    285435
30    319863
31    269461
32    245430
33    219108
34    178715
35    171330
36    145493
37    124798
38    126742
39    109385
40     98016
41     82168
42     77723
43     80047
44     76660
45     73417
46     78904
47     74119
48     70846
49     71193
50     81316
51     70719
52     72237
53     62855
54     63952
55     66385
56     53030
57     60481
58     58964
59     42758
60     45658
61     40321
62     31614
63     30975
64     24012
65     20237
66     21683
67     11465
68      7290
69      7904
70      9372
71      7583
72      5145
73      6184
74      3550
75      1871
76      3099
77      2191
78      1007
79      2234
80      2603
81       157
82      7653
83      1639
84       645
85       662
86       937
87       394
88       672
89       165
90       378
91       235
92      

## 4. Overviews by User Types

In [17]:
# Number of entries per age for user type 'Subscriber'.
# The boolean match is cast to int and summed within each age group, so ages without any
# matching entries still appear in the result with a count of 0.
(
    NYB2020_AgeGender['usertype']
    .eq('Subscriber')
    .astype(int)
    .groupby(NYB2020_AgeGender['age'])
    .sum()
)

age
16      4275
17     21137
18     43165
19     66365
20    103869
21    105979
22    127065
23    207845
24    330064
25    458786
26    520856
27    560273
28    644142
29    664868
30    712689
31    654685
32    624062
33    554973
34    507268
35    491797
36    439824
37    400858
38    403528
39    356336
40    339570
41    302884
42    266582
43    273457
44    250684
45    254653
46    245910
47    238211
48    233772
49    241422
50    261467
51    361365
52    220495
53    225469
54    204259
55    212271
56    208396
57    190341
58    176571
59    150234
60    160161
61    130097
62    110607
63    109946
64     95492
65     81106
66     75558
67     63044
68     36686
69     37489
70     31857
71     23718
72     21572
73     20642
74     15526
75      8259
76      7372
77      9322
78      7561
79      6833
80      5763
81      1264
82      8717
83      2465
84      2591
85      1367
86      1248
87       544
88       952
89       181
90       841
91       235
92      

In [18]:
# Number of entries per age for user type 'Customer'.
# The boolean match is cast to int and summed within each age group, so ages without any
# matching entries still appear in the result with a count of 0.
# NOTE(review): age 51 dominates this output (1,850,661 entries), which looks like a default/placeholder
# birth year rather than real riders - confirm before interpreting.
(
    NYB2020_AgeGender['usertype']
    .eq('Customer')
    .astype(int)
    .groupby(NYB2020_AgeGender['age'])
    .sum()
)

age
16       4219
17      12816
18      35483
19      58059
20      81740
21      81833
22      94207
23     126555
24     161738
25     177317
26     170069
27     167027
28     158818
29     146914
30     210904
31     140537
32     119765
33      96408
34      81127
35      68979
36      58448
37      56304
38      41117
39      37590
40      34533
41      27317
42      23155
43      22490
44      19280
45      18238
46      16354
47      14687
48      13908
49      13070
50      12273
51    1850661
52       9353
53       8810
54       9677
55       9283
56       7594
57       7034
58       5638
59       5408
60       4687
61       4055
62       3018
63       2677
64       2023
65       1913
66       1523
67       1095
68       1230
69       1157
70        968
71        529
72        401
73        571
74        322
75        180
76        359
77        356
78         47
79         60
80         52
81          8
82         18
83         25
84          8
85         11
86         48
87

In [14]:
# Restore pandas' default 'display.max_rows' setting now that the full per-age listings have been shown,
# so later cells do not print every row of large outputs.
pd.reset_option('display.max_rows')

<font color="blue"> <b>I copied the output of the 5 queries above into Excel and entered the corresponding Gender or User Type for each set of results in a new column.</b></font>

In [28]:
# Total entries per user type, smallest group first (missing values would be counted as well).
# Used as a cross-check that the right numbers are showing in Tableau.
NYB2020_AgeGender.usertype.value_counts(ascending=True, dropna=False)

Customer       4544260
Subscriber    14943343
Name: usertype, dtype: int64