In [2]:
import pandas as pd
import numpy as np

# oversampling
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC
from imblearn.under_sampling import RandomUnderSampler

In [3]:
# Print the imblearn docstring for SMOTENC to check its constructor arguments
# (notably `categorical_features`) before the sampler is configured.
help(SMOTENC) # Synthetic Minority Over-sampling Technique for Nominal and Continuous

Help on class SMOTENC in module imblearn.over_sampling._smote:

class SMOTENC(SMOTE)
 |  SMOTENC(categorical_features, sampling_strategy='auto', random_state=None, k_neighbors=5, n_jobs=1)
 |  
 |  Synthetic Minority Over-sampling Technique for Nominal and Continuous
 |  (SMOTE-NC).
 |  
 |  Unlike :class:`SMOTE`, SMOTE-NC for dataset containing continuous and
 |  categorical features.
 |  
 |  Read more in the :ref:`User Guide <smote_adasyn>`.
 |  
 |  Parameters
 |  ----------
 |  categorical_features : ndarray, shape (n_cat_features,) or (n_features,)
 |      Specified which features are categorical. Can either be:
 |  
 |      - array of indices specifying the categorical features;
 |      - mask array of shape (n_features, ) and ``bool`` dtype for which
 |        ``True`` indicates the categorical features.
 |  
 |  sampling_strategy : float, str, dict or callable, (default='auto')
 |      Sampling information to resample the data set.
 |  
 |      - When ``float``, it corresponds

In [4]:
# Load the prepared feature (X) and label (y) tables for 2016 and 2017.
# Files are read in the listed order and unpacked into the four frames.
X_16, y_16, X_17, y_17 = map(
    pd.read_csv,
    [
        '../../../Data/model_inputs/gdf_2016_X.csv',
        '../../../Data/model_inputs/gdf_2016_y.csv',
        '../../../Data/model_inputs/gdf_2017_X.csv',
        '../../../Data/model_inputs/gdf_2017_y.csv',
    ],
)

In [5]:
# Preview the first five rows of the 2017 feature table.
X_17.head()

Unnamed: 0,id_trip,mode,duration,distance_m,magnitude,carddir,downtown_s,downtown_e,weekday,temp_clus,...,endx,endy,hour,morning,midday,afternoon,evening,midnight,land_use_s,land_use_e
0,150744,2,862,9935.922336,0.304086,0,True,True,1,4,...,7631864.0,1250415.0,10,True,False,False,False,False,0,0
1,35763,2,1873,6832.113937,0.00275,1,False,False,1,5,...,7641941.0,1236732.0,12,False,True,False,False,False,1,1
2,323826,2,1165,12233.968564,0.309908,2,True,False,1,5,...,7616671.0,1252917.0,12,False,True,False,False,False,0,1
3,375668,2,820,3612.405991,0.20946,3,True,True,1,1,...,7630941.0,1253445.0,13,False,True,False,False,False,2,2
4,212877,2,1316,26428.385191,0.229425,4,True,False,1,3,...,7646416.0,1264378.0,14,False,False,True,False,False,0,1


In [6]:
# Number of distinct trip identifiers in the 2017 features before resampling.
print('unique number of ids =', X_17['id_trip'].nunique())

unique number of ids = 22948


In [7]:
# Show a bounded preview of the 2017 features rather than asking for the repr
# of the whole frame; ten rows is enough to eyeball the column values.
X_17.head(10)

Unnamed: 0,id_trip,mode,duration,distance_m,magnitude,carddir,downtown_s,downtown_e,weekday,temp_clus,...,endx,endy,hour,morning,midday,afternoon,evening,midnight,land_use_s,land_use_e
0,150744,2,862,9935.922336,0.304086,0,True,True,1,4,...,7.631864e+06,1.250415e+06,10,True,False,False,False,False,0,0
1,35763,2,1873,6832.113937,0.002750,1,False,False,1,5,...,7.641941e+06,1.236732e+06,12,False,True,False,False,False,1,1
2,323826,2,1165,12233.968564,0.309908,2,True,False,1,5,...,7.616671e+06,1.252917e+06,12,False,True,False,False,False,0,1
3,375668,2,820,3612.405991,0.209460,3,True,True,1,1,...,7.630941e+06,1.253445e+06,13,False,True,False,False,False,2,2
4,212877,2,1316,26428.385191,0.229425,4,True,False,1,3,...,7.646416e+06,1.264378e+06,14,False,False,True,False,False,0,1
5,14699,0,1668,1275.056162,0.006315,0,True,True,1,5,...,7.631896e+06,1.244344e+06,16,False,False,True,False,False,2,2
6,443357,0,1127,650.601955,0.164362,5,True,True,1,2,...,7.631361e+06,1.245062e+06,17,False,False,False,True,False,2,2
7,256254,0,129,85.844452,0.055556,3,True,True,1,2,...,7.632808e+06,1.244086e+06,17,False,False,False,True,False,2,2
8,142829,0,217,160.435726,0.306655,6,True,True,1,2,...,7.631357e+06,1.244778e+06,17,False,False,False,True,False,2,2
9,276155,3,1743,7956.412449,0.257380,7,True,True,1,1,...,7.630498e+06,1.249978e+06,20,False,False,False,True,False,2,0


In [22]:
## for oversampling
# Alternative samplers tried earlier, kept for reference:
# ros = RandomOverSampler(random_state=0)
# ros = SMOTE(random_state=0)

# Boolean mask over the columns of X_17, in column order: True marks a column
# SMOTENC should treat as categorical, False a continuous one.
## NOTE: x and y of start and end of trip will be non-categorical
# NOTE(review): position 0 is flagged categorical; if that is the id_trip
# column, the trip identifier is being resampled as a feature -- confirm this
# is intended.
categorical_features = [
    True, True, False, False, False, True, True, True, True, True, False,
    False, False, False, False, False, True, True, True, True, True, True, True, True,
]

# The mask is positional, so it is only meaningful while it has exactly one
# entry per feature column. Fail loudly if the input schema has changed instead
# of letting columns be mislabelled.
if len(categorical_features) != X_17.shape[1]:
    raise ValueError(
        'categorical_features has %d entries but X_17 has %d columns'
        % (len(categorical_features), X_17.shape[1])
    )

# Fixed random_state so the synthetic samples are reproducible between runs.
ros = SMOTENC(categorical_features=categorical_features, random_state=0)

In [None]:
# List the column names selected by the boolean mask, i.e. the columns that
# are passed to SMOTENC as categorical. Useful as a visual check of the mask.
X_17.columns[categorical_features]

In [None]:
%%time
oversample_X_17, oversample_y_17 = ros.fit_resample(X_17, y_17)

  y = column_or_1d(y, warn=True)


In [None]:
# Per-class row counts of the original 2017 labels, for comparison with the
# counts after resampling.
print('Before: num of classes', y_17.purpose_f.value_counts())

In [None]:
# Per-class row counts after resampling.
# np.ravel flattens whatever fit_resample returned (ndarray, Series or
# one-column DataFrame) to 1-D, so this does not depend on a column labelled 0
# existing, which is only the case when the result is a bare array.
print('After: num of classes', pd.Series(np.ravel(oversample_y_17)).value_counts())

In [116]:
# Wrap the resampled features and labels in DataFrames, restoring the original
# feature column names and the 'purpose_f' label column name.
over_X_17, over_y_17 = (
    pd.DataFrame(oversample_X_17, columns=X_17.columns),
    pd.DataFrame(oversample_y_17, columns=['purpose_f']),
)

In [113]:
# How many rows carry each trip id after resampling (most frequent first).
over_X_17.id_trip.value_counts()

361380.0    192
428733.0    190
458074.0    182
462134.0    181
430813.0    180
           ... 
340996.0      1
85248.0       1
236030.0      1
170491.0      1
49152.0       1
Name: id_trip, Length: 61263, dtype: int64

In [114]:
# Inspect the first rows sharing trip id 361380, the id at the top of the
# value_counts listing, to see what the repeated rows look like.
over_X_17[over_X_17['id_trip'] == 361380].head()

Unnamed: 0,id_trip,mode_f,duration,distance_m,magnitude,carddir_f,start_down,end_downto,weekday,temporal_c,precip,temperatur,startrush,endrush,thrurush,startclust,endclust,land_use_s_f,land_use_e_f
59759,361380.0,0.0,2538.0,43533.798074,0.239627,12.0,1.0,0.0,1.0,3.0,4.2e-05,5.679489,0.0,0.0,0.0,6.0,11.0,0.0,1.0
103715,361380.0,0.0,3616.934135,42754.691887,0.145183,12.0,1.0,0.0,1.0,3.0,3e-06,27.903299,0.0,0.0,0.0,6.0,3.0,2.0,1.0
104031,361380.0,0.0,3111.420369,43101.859532,0.239089,12.0,1.0,0.0,1.0,3.0,0.000157,15.236827,0.0,0.0,0.0,6.0,11.0,0.0,1.0
104197,361380.0,0.0,3603.181,44351.347793,0.216515,0.0,1.0,0.0,1.0,4.0,1e-06,18.580277,0.0,0.0,0.0,6.0,3.0,0.0,1.0
104310,361380.0,0.0,2809.9316,43608.450915,0.231918,12.0,1.0,0.0,1.0,3.0,3.2e-05,8.414588,0.0,0.0,0.0,6.0,11.0,0.0,1.0


In [97]:
# Number of distinct trip identifiers after resampling.
# NOTE(review): an earlier comment here said "same amount" as before
# resampling, but the recorded outputs in this notebook differ (22948 before
# vs 56340 here) -- re-run from a fresh kernel to confirm which holds.
print('unique number of ids =', over_X_17['id_trip'].nunique())

unique number of ids = 56340


In [118]:
# Persist the oversampled 2017 features and labels next to the other model
# inputs; index=False keeps the row index out of the files.
over_X_17.to_csv(
    path_or_buf='../../../Data/model_inputs/oversampled_X_2017.csv',
    index=False,
)
over_y_17.to_csv(
    path_or_buf='../../../Data/model_inputs/oversampled_y_2017.csv',
    index=False,
)