<a id='sec0'></a>
# Gender Type Distribution in the dataset used for classification

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split#, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
#from imblearn.under_sampling import TomekLinks, RandomUnderSampler
from collections import Counter

import my_scoring_functions as msf

%matplotlib inline

# Setting up Data (this time, trying to label subscribers as 0)

In [2]:
# Load the per-year membership prediction tables (2013-2017), one DataFrame per year.
yearly_csv_paths = [
    '../data/for_predictions/2013_membership_pred.csv',
    '../data/for_predictions/2014_membership_pred.csv',
    '../data/for_predictions/2015_membership_pred.csv',
    '../data/for_predictions/2016_membership_pred.csv',
    '../data/for_predictions/2017_membership_pred.csv',
]
# Files are read in list order; unpacking binds each frame to its year-specific name.
d2013, d2014, d2015, d2016, d2017 = map(pd.read_csv, yearly_csv_paths)

In [3]:
# Collect the yearly frames in chronological order (2013 first) so they can be iterated over.
all_data = [d2013, d2014, d2015, d2016, d2017]

In [4]:
# Get the overall fraction of the non-subscribers ('Customer' rows) in the whole dataset,
# printing the per-year count along the way.
total = 0
total_nonsub = 0
# all_data is ordered chronologically starting with 2013, so enumerating from 2013
# pairs each frame with its year.
for year, d in enumerate(all_data, start=2013):
    # Summing the boolean mask counts the matching rows without materialising a
    # filtered copy of the (multi-million-row) frame just to take its length.
    nonsubs_num = int((d['usertype'] == 'Customer').sum())
    total += len(d)
    total_nonsub += nonsubs_num
    print('# Non-subscriber Entries in %d = %d' % (year, nonsubs_num))
print('Total # Non-subscriber Entries = %d (out of %d)' % (total_nonsub, total))
print('%% of Non-subscriber Entries = %f' % (100 * total_nonsub / total))

# Non-subscriber Entries in 2013 = 666940
# Non-subscriber Entries in 2014 = 793495
# Non-subscriber Entries in 2015 = 1311331
# Non-subscriber Entries in 2016 = 1508073
# Non-subscriber Entries in 2017 = 92711
Total # Non-subscriber Entries = 4372550 (out of 39096233)
% of Non-subscriber Entries = 11.184070


In [5]:
# Drop the list reference now that the per-year counts are done. The frames themselves
# are still bound to d2013..d2017, so this alone does not release their memory.
all_data = None

In [6]:
# Build the analysis dataset: stack the five yearly frames row-wise, then discard
# every row that has a missing value in any column.
# The 'gender' column is intentionally kept (it is the subject of this notebook).
data = pd.concat(
    objs=[d2013, d2014, d2015, d2016, d2017],
    axis='index',
).dropna()

In [7]:
# List the available columns; in this version of the data the birth-year column has been removed.
data.columns

Index(['tripduration', 'start station latitude', 'start station longitude',
       'end station latitude', 'end station longitude', 'start_year',
       'start_month', 'start_day', 'start_hour', 'start_dayofweek', 'gender',
       'usertype'],
      dtype='object')

In [8]:
# Preview the first rows to sanity-check the concatenated frame.
data.head()

Unnamed: 0,tripduration,start station latitude,start station longitude,end station latitude,end station longitude,start_year,start_month,start_day,start_hour,start_dayofweek,gender,usertype
0,634,40.753231,-73.970325,40.732219,-73.981656,2013,7,1,0,0,0,Customer
1,1547,40.749718,-74.00295,40.746745,-74.007756,2013,7,1,0,0,0,Customer
2,178,40.730287,-73.990765,40.730473,-73.986724,2013,7,1,0,0,2,Subscriber
3,1580,40.718939,-73.992663,40.769155,-73.981918,2013,7,1,0,0,0,Customer
4,757,40.734927,-73.992005,40.720664,-73.98518,2013,7,1,0,0,1,Subscriber


In [9]:
# Summary statistics for the numeric columns, rounded to whole numbers for readability.
# NOTE(review): the recorded output shows latitude minima / longitude maxima of 0 and a very
# large maximum tripduration, which look like invalid coordinates and outliers — confirm
# whether such rows should be filtered before modelling.
data.describe().round(0)

Unnamed: 0,tripduration,start station latitude,start station longitude,end station latitude,end station longitude,start_year,start_month,start_day,start_hour,start_dayofweek,gender
count,39096233.0,39096233.0,39096233.0,39096233.0,39096233.0,39096233.0,39096233.0,39096233.0,39096233.0,39096233.0,39096233.0
mean,920.0,41.0,-74.0,41.0,-74.0,2015.0,7.0,16.0,14.0,3.0,1.0
std,7616.0,0.0,0.0,0.0,0.0,1.0,3.0,9.0,5.0,2.0,1.0
min,60.0,0.0,-74.0,0.0,-74.0,2013.0,1.0,1.0,0.0,0.0,0.0
25%,385.0,41.0,-74.0,41.0,-74.0,2014.0,5.0,8.0,10.0,1.0,1.0
50%,623.0,41.0,-74.0,41.0,-74.0,2015.0,8.0,16.0,15.0,3.0,1.0
75%,1042.0,41.0,-74.0,41.0,-74.0,2016.0,10.0,23.0,18.0,4.0,1.0
max,8933552.0,41.0,0.0,41.0,0.0,2017.0,12.0,31.0,23.0,6.0,2.0


In [11]:
# Split the rides by rider type so the gender breakdown can be compared per group.
customer = data.loc[data['usertype'] == 'Customer']
subs = data.loc[data['usertype'] == 'Subscriber']

In [14]:
# Number of non-subscriber ('Customer') rows for each gender code.
customer.groupby('gender').size()

gender
0    4337379
1      21332
2      13839
dtype: int64

In [15]:
# Number of subscriber rows for each gender code.
subs.groupby('gender').size()

gender
0      186815
1    26412258
2     8124610
dtype: int64

<b>This confirms that most non-subscriber records carry no gender information (gender code 0); as a result, gender becomes the strongest predictor of user type.</b>

In [16]:
# Calculate and print the gender distribution for subscribers and non-subscribers.
# The counts are derived from the `subs` / `customer` frames instead of being typed in
# by hand from an earlier cell's output, so they cannot drift out of sync with the data.

# Gender codes in the order used for indexing below:
# index 0 is reported as "Unknown", 1 as "Males", 2 as "Females".
GENDER_CODES = [0, 1, 2]


def _gender_counts(frame):
    """Return the row counts for gender codes 0, 1 and 2 (in that order) as a list of ints.

    A code that does not occur in `frame` is reported as 0 rather than being dropped,
    so the returned list always has three entries.
    """
    return frame.groupby('gender').size().reindex(GENDER_CODES, fill_value=0).tolist()


def _print_gender_distribution(group_label, counts):
    """Print the male / female / unknown percentages for one rider group.

    `counts` is a 3-element sequence indexed by gender code (see GENDER_CODES).
    """
    group_total = np.sum(counts)
    print('==== Gender Distribution of %s ====' % group_label)
    print('Males: %.1f%%' % (100*counts[1]/group_total))
    print('Females: %.1f%%' % (100*counts[2]/group_total))
    print('Unknown: %.1f%%' % (100*counts[0]/group_total))


subs_gender = _gender_counts(subs)
cust_gender = _gender_counts(customer)

subs_total = np.sum(subs_gender)
cust_total = np.sum(cust_gender)

_print_gender_distribution('Subscribers', subs_gender)
print('\n')
_print_gender_distribution('Non-Subscribers', cust_gender)

==== Gender Distribution of Subscribers ====
Males: 76.1%
Females: 23.4%
Unknown: 0.5%


==== Gender Distribution of Non-Subscribers ====
Males: 0.5%
Females: 0.3%
Unknown: 99.2%


In [17]:
# Among rides whose gender code is 0 ("unknown"), what share comes from non-subscribers?
unknown_from_customers = cust_gender[0]
unknown_total = unknown_from_customers + subs_gender[0]
print(
    '%% of non-subscribers among riders whose gender are unknown: %.1f%%'
    % (100*unknown_from_customers/unknown_total)
)

% of non-subscribers among riders whose gender are unknown: 95.9%


<b>So if the rider's gender is unknown, there is a 96% chance that the rider is a non-subscriber (usertype <code>Customer</code>)!</b>