In [76]:
import pandas as pd
import numpy as np

# oversampling
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC
from imblearn.under_sampling import RandomUnderSampler

In [79]:
help(SMOTENC) # Synthetic Minority Over-sampling Technique for Nominal and Continuous

Help on class SMOTENC in module imblearn.over_sampling._smote:

class SMOTENC(SMOTE)
 |  SMOTENC(categorical_features, sampling_strategy='auto', random_state=None, k_neighbors=5, n_jobs=1)
 |  
 |  Synthetic Minority Over-sampling Technique for Nominal and Continuous
 |  (SMOTE-NC).
 |  
 |  Unlike :class:`SMOTE`, SMOTE-NC for dataset containing continuous and
 |  categorical features.
 |  
 |  Read more in the :ref:`User Guide <smote_adasyn>`.
 |  
 |  Parameters
 |  ----------
 |  categorical_features : ndarray, shape (n_cat_features,) or (n_features,)
 |      Specified which features are categorical. Can either be:
 |  
 |      - array of indices specifying the categorical features;
 |      - mask array of shape (n_features, ) and ``bool`` dtype for which
 |        ``True`` indicates the categorical features.
 |  
 |  sampling_strategy : float, str, dict or callable, (default='auto')
 |      Sampling information to resample the data set.
 |  
 |      - When ``float``, it corresponds

In [5]:
X_16 = pd.read_csv('../../../Data/model_inputs/gdf_2016_X.csv')
y_16 = pd.read_csv('../../../Data/model_inputs/gdf_2016_y.csv')
X_17 = pd.read_csv('../../../Data/model_inputs/gdf_2017_X.csv')
y_17 = pd.read_csv('../../../Data/model_inputs/gdf_2017_y.csv')

In [104]:
X_17.head()

Unnamed: 0,id_trip,mode_f,duration,distance_m,magnitude,carddir_f,start_down,end_downto,weekday,temporal_c,precip,temperatur,startrush,endrush,thrurush,startclust,endclust,land_use_s_f,land_use_e_f
0,150744,0,862,9935.922336,0.304086,0,1,1,1,4,0.0,16.910884,1,1,1,1,6,0,0
1,173651,0,2856,15283.760539,0.232414,1,0,0,1,2,9.972328e-07,18.007062,0,0,0,3,3,1,1
2,149086,1,2972,7693.525202,0.048575,1,1,1,1,2,9.972328e-07,18.007062,0,0,0,2,2,2,2
3,358412,2,768,992.314303,0.347807,2,1,1,1,2,9.972328e-07,16.815241,0,0,0,2,2,3,2
4,350121,0,2370,33250.148027,0.338182,2,0,0,1,2,9.972328e-07,18.007062,0,0,0,0,0,1,1


In [105]:
print('unique number of ids =', X_17.id_trip.nunique())

unique number of ids = 61263


In [108]:
## for oversampling
# ros = RandomOverSampler(random_state=0)
# ros = SMOTE(random_state=0)
categorical_features = [True,True,False,False,False,True,True,True,True,True,False,\
                        False,True,True,True,True,True,True,True]
ros = SMOTENC(categorical_features, random_state=0)

In [109]:
%%time
oversample_X_17, oversample_y_17 = ros.fit_resample(X_17, y_17)

  y = column_or_1d(y, warn=True)


In [110]:
print('Before: num of classes', y_17['purpose_f'].value_counts())

Before: num of classes 3    22464
1    15851
5     8049
2     7723
7     2714
6     2175
0     1379
4      908
Name: purpose_f, dtype: int64


In [111]:
print('After: num of classes', pd.DataFrame(oversample_y_17)[0].value_counts())

After: num of classes 7    22464
6    22464
5    22464
4    22464
3    22464
2    22464
1    22464
0    22464
Name: 0, dtype: int64


In [116]:
over_X_17 = pd.DataFrame(oversample_X_17, columns=X_17.columns)
over_y_17 = pd.DataFrame(oversample_y_17, columns=['purpose_f'])

In [113]:
over_X_17['id_trip'].value_counts()

361380.0    192
428733.0    190
458074.0    182
462134.0    181
430813.0    180
           ... 
340996.0      1
85248.0       1
236030.0      1
170491.0      1
49152.0       1
Name: id_trip, Length: 61263, dtype: int64

In [114]:
over_X_17.loc[over_X_17['id_trip'] == 361380].head()

Unnamed: 0,id_trip,mode_f,duration,distance_m,magnitude,carddir_f,start_down,end_downto,weekday,temporal_c,precip,temperatur,startrush,endrush,thrurush,startclust,endclust,land_use_s_f,land_use_e_f
59759,361380.0,0.0,2538.0,43533.798074,0.239627,12.0,1.0,0.0,1.0,3.0,4.2e-05,5.679489,0.0,0.0,0.0,6.0,11.0,0.0,1.0
103715,361380.0,0.0,3616.934135,42754.691887,0.145183,12.0,1.0,0.0,1.0,3.0,3e-06,27.903299,0.0,0.0,0.0,6.0,3.0,2.0,1.0
104031,361380.0,0.0,3111.420369,43101.859532,0.239089,12.0,1.0,0.0,1.0,3.0,0.000157,15.236827,0.0,0.0,0.0,6.0,11.0,0.0,1.0
104197,361380.0,0.0,3603.181,44351.347793,0.216515,0.0,1.0,0.0,1.0,4.0,1e-06,18.580277,0.0,0.0,0.0,6.0,3.0,0.0,1.0
104310,361380.0,0.0,2809.9316,43608.450915,0.231918,12.0,1.0,0.0,1.0,3.0,3.2e-05,8.414588,0.0,0.0,0.0,6.0,11.0,0.0,1.0


In [97]:
print('unique number of ids =',over_X_17.id_trip.nunique())
## same amount

unique number of ids = 56340


In [118]:
over_X_17.to_csv('../../../Data/model_inputs/oversampled_X_2017.csv',index=False)
over_y_17.to_csv('../../../Data/model_inputs/oversampled_y_2017.csv',index=False)