<a id='sec0'></a>
# TomekLinks-RandomUnderSampling (TL-RUS)
- Test Tomek Links combined with random undersampling to check that the class imbalance is removed

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import TomekLinks, RandomUnderSampler
from collections import Counter

import my_scoring_functions as msf

%matplotlib inline

<a id='sec2'></a>
# Import data (only 2016 data used)

In [2]:
# Load the 2016 dataset from its CSV file into a DataFrame.
csv_path = '../data/for_predictions/dataset_2016_membership_pred.csv'
data = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows to check the columns and the raw label values.
data.head(n=5)

Unnamed: 0,Trip Duration,Start Station Latitude,Start Station Longitude,End Station Latitude,End Station Longitude,start_month,start_day,start_hour,start_dayofweek,stop_hour,Gender,Birth Year,User Type
0,496,40.735324,-73.998004,40.728419,-73.98714,4,1,0,4,0,1,1984.0,Subscriber
1,104,40.717227,-73.988021,40.715595,-73.98703,4,1,0,4,0,1,1970.0,Subscriber
2,128,40.69682,-73.937569,40.693398,-73.939877,4,1,0,4,0,1,1986.0,Subscriber
3,1197,40.766638,-73.953483,40.751551,-73.993934,4,1,0,4,0,1,1981.0,Subscriber
4,419,40.741776,-74.001497,40.739323,-74.008119,4,1,0,4,0,2,1982.0,Subscriber


In [4]:
# Summary statistics of the numeric columns, rounded to whole numbers for readability.
data.describe().round(decimals=0)

Unnamed: 0,Trip Duration,Start Station Latitude,Start Station Longitude,End Station Latitude,End Station Longitude,start_month,start_day,start_hour,start_dayofweek,stop_hour,Gender,Birth Year
count,12499176.0,12499176.0,12499176.0,12499176.0,12499176.0,12499176.0,12499176.0,12499176.0,12499176.0,12499176.0,12499176.0,12499176.0
mean,806.0,41.0,-74.0,41.0,-74.0,7.0,16.0,14.0,3.0,14.0,1.0,1978.0
std,7540.0,0.0,0.0,0.0,0.0,3.0,9.0,5.0,2.0,5.0,0.0,12.0
min,61.0,0.0,-74.0,0.0,-74.0,1.0,1.0,0.0,0.0,0.0,0.0,1885.0
25%,360.0,41.0,-74.0,41.0,-74.0,5.0,8.0,9.0,1.0,9.0,1.0,1970.0
50%,580.0,41.0,-74.0,41.0,-74.0,7.0,16.0,15.0,3.0,15.0,1.0,1981.0
75%,955.0,41.0,-74.0,41.0,-74.0,10.0,23.0,18.0,4.0,18.0,1.0,1987.0
max,8933552.0,41.0,0.0,41.0,0.0,12.0,31.0,23.0,6.0,23.0,2.0,2001.0


# Setting up Data (label subscriber as class=0)

In [5]:
# Create X and y for sklearn / imblearn.
# X: every column except the last one ('User Type'), cast to float.
# The builtin float/int are used instead of np.float/np.int: those NumPy
# aliases were deprecated in NumPy 1.20 and removed in 1.24, and they were
# plain aliases of the builtins, so the resulting dtypes are unchanged.
X = data[data.columns[:-1]].astype(float)
# y: binary target, 0 = 'Subscriber', 1 = any other user type.
y = (data['User Type'] != 'Subscriber').values.astype(int)

<b>Class frequency in the original dataset</b>

In [6]:
# Report the class balance of the original labels (before any resampling).
# The banner names TomekLinks, the method this notebook applies; it previously
# carried an "EditedNearestNeighbours" label that did not match the notebook.
print('==== Before TomekLinks ====')
class_counts = Counter(y)  # tally the labels once and reuse for both classes
total_number_rides = len(y)
total_number_subscribers = class_counts[0]     # class 0 = subscribers
total_number_nonsubscribers = class_counts[1]  # class 1 = non-subscribers
subs_perc = 100*total_number_subscribers / total_number_rides
nonsubs_perc = 100*total_number_nonsubscribers / total_number_rides
print('Total number of Rides: %d' % total_number_rides)
print('Total number of Rides by Subscribers: %d (%.2f%% of total rides)' % (total_number_subscribers, subs_perc))
print('Total number of Rides by Non-subscribers: %d (%.2f%% of total rides)' % (total_number_nonsubscribers, nonsubs_perc))

==== Before EditedNearestNeighbours ====
Total number of Rides: 12499176
Total number of Rides by Subscribers: 12463924 (99.72% of total rides)
Total number of Rides by Non-subscribers: 35252 (0.28% of total rides)


<b>Class frequency after TomekLinks</b>

In [7]:
%%time
# Clean the majority class with Tomek links: with ratio='majority' only the
# majority-class (subscriber) rows are candidates for removal.
# NOTE(review): newer imbalanced-learn releases renamed `ratio` to
# `sampling_strategy` and `fit_sample` to `fit_resample` — update when upgrading.
tl = TomekLinks(
    ratio='majority',
    n_jobs=4,
)
X_res, y_res = tl.fit_sample(X, y)

CPU times: user 53min 30s, sys: 7.87 s, total: 53min 38s
Wall time: 18min 12s


In [8]:
# Report the class balance of the labels returned by TomekLinks.
# The banner names TomekLinks, the method actually applied; it previously
# carried an "EditedNearestNeighbours" label that did not match the notebook.
print('==== After TomekLinks ====')
class_counts = Counter(y_res)  # tally the labels once and reuse for both classes
total_number_rides = len(y_res)
total_number_subscribers = class_counts[0]     # class 0 = subscribers
total_number_nonsubscribers = class_counts[1]  # class 1 = non-subscribers
subs_perc = 100*total_number_subscribers / total_number_rides
nonsubs_perc = 100*total_number_nonsubscribers / total_number_rides
print('Total number of Rides: %d' % total_number_rides)
print('Total number of Rides by Subscribers: %d (%.2f%% of total rides)' % (total_number_subscribers, subs_perc))
print('Total number of Rides by Non-subscribers: %d (%.2f%% of total rides)' % (total_number_nonsubscribers, nonsubs_perc))

==== After EditedNearestNeighbours ====
Total number of Rides: 12481462
Total number of Rides by Subscribers: 12446210 (99.72% of total rides)
Total number of Rides by Non-subscribers: 35252 (0.28% of total rides)


<b>Further data reduction by random undersampling</b>

In [9]:
%%time
# Randomly undersample the majority class of the TomekLinks output.
# random_state pins the random draw so the same rows are retained on every
# run; without it the resampled set (and anything trained on it) changes
# between executions.
# NOTE(review): newer imbalanced-learn releases renamed `ratio` to
# `sampling_strategy` and `fit_sample` to `fit_resample` — update when upgrading.
rus = RandomUnderSampler(ratio='majority', random_state=42)
X_res2, y_res2 = rus.fit_sample(X_res, y_res)

CPU times: user 6.07 s, sys: 1.1 s, total: 7.16 s
Wall time: 7.21 s


In [10]:
# Report the class balance after TomekLinks followed by random undersampling.
# The banner names this stage explicitly; it previously reused the
# "After EditedNearestNeighbours" label of the preceding report, which made
# the two outputs indistinguishable and named a method the notebook never runs.
print('==== After TomekLinks + RandomUnderSampler ====')
class_counts = Counter(y_res2)  # tally the labels once and reuse for both classes
total_number_rides = len(y_res2)
total_number_subscribers = class_counts[0]     # class 0 = subscribers
total_number_nonsubscribers = class_counts[1]  # class 1 = non-subscribers
subs_perc = 100*total_number_subscribers / total_number_rides
nonsubs_perc = 100*total_number_nonsubscribers / total_number_rides
print('Total number of Rides: %d' % total_number_rides)
print('Total number of Rides by Subscribers: %d (%.2f%% of total rides)' % (total_number_subscribers, subs_perc))
print('Total number of Rides by Non-subscribers: %d (%.2f%% of total rides)' % (total_number_nonsubscribers, nonsubs_perc))

==== After EditedNearestNeighbours ====
Total number of Rides: 70504
Total number of Rides by Subscribers: 35252 (50.00% of total rides)
Total number of Rides by Non-subscribers: 35252 (50.00% of total rides)
