In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

In [2]:
# Per-timestep features, stored in an HDF5 file under the 'features' key.
features = pd.read_hdf(
    path_or_buf='../input/grabpreprocessed/preprocessed_features.h5',
    key='features',
)
# Per-booking summary statistics; read as a flat CSV with a bookingID column.
features_stats = pd.read_csv(
    filepath_or_buffer='../input/grab-feature-stats/feature_stats.csv',
)
# Label file; it is keyed by bookingID in the cells that follow.
labels = pd.read_csv(
    filepath_or_buffer='../input/grab-safety/safety/safety/labels/part-00000-e9445087-aa0a-433b-a7f6-7f4c19d78ad6-c000.csv',
)

In [3]:
# keep=False removes *every* row of a bookingID that occurs more than once,
# so only unambiguously labelled bookings remain; the row index is then renumbered.
labels = (
    labels
    .drop_duplicates(subset='bookingID', keep=False)
    .reset_index(drop=True)
)

In [4]:
# Align labels and per-booking statistics on bookingID, side by side;
# the inner join keeps only bookings present in both tables.
features_stats_labels = pd.concat(
    objs=[
        labels.set_index('bookingID'),
        features_stats.set_index('bookingID'),
    ],
    axis='columns',
    join='inner',
)

In [5]:
SEED = 123      # seed for the fold shuffling, so the split is reproducible
N_FOLDS = 5     # number of stratified cross-validation folds

# `random_state` only has an effect when `shuffle=True`. With shuffle left at its
# default (False) older scikit-learn releases silently ignore the seed and recent
# ones raise ValueError. Shuffling is enabled so the seed is actually honoured.
# NOTE: fold membership therefore differs from an unshuffled split.
kfolds = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

In [6]:
def get_booking_details(dataframe):
    """Map every booking to the positional row range it occupies in ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose first index level holds the booking identifier. All rows
        of one booking must be stored contiguously; the bookings themselves
        may appear in any order.

    Returns
    -------
    tuple
        ``(details, n_bookings)`` where ``details`` maps each booking id to a
        half-open ``(start, end)`` pair of positional row offsets (usable with
        ``dataframe.iloc[start:end]``) and ``n_bookings`` is the number of
        distinct bookings.

    Raises
    ------
    ValueError
        If the rows of some booking are not contiguous, because a single
        ``(start, end)`` range could not describe that booking.
    """
    booking_ids = dataframe.index.get_level_values(0)
    if len(booking_ids) == 0:
        return {}, 0

    # Integer code per row, with ids listed in order of first appearance, so ids
    # and row ranges are paired correctly even when the frame is not sorted by id.
    codes, unique_ids = pd.factorize(booking_ids)

    # A new block starts wherever the id changes from one row to the next.
    # Offsets come from row positions, so NaN values in data columns cannot skew them.
    boundaries = np.flatnonzero(np.diff(codes)) + 1
    if len(boundaries) + 1 != len(unique_ids):
        raise ValueError('Rows of each booking must be contiguous in the dataframe')

    starts = np.concatenate(([0], boundaries))
    ends = np.concatenate((boundaries, [len(codes)]))

    bking_index_details = {
        booking_id: (int(start), int(end))
        for booking_id, start, end in zip(unique_ids, starts, ends)
    }
    return bking_index_details, len(unique_ids)

# Lookup from bookingID to its (start, end) row offsets inside `features`,
# together with the number of distinct bookings.
(bking_index_details, num_bkings) = get_booking_details(dataframe=features)

In [7]:
# Turn the bookingID index back into an ordinary column.
# Guarded so the cell can be re-run safely: an unguarded second reset_index
# would insert a stray 'index' column that then leaks into the stats matrix.
if 'bookingID' not in features_stats_labels.columns:
    features_stats_labels.reset_index(inplace=True)

In [8]:
# Preview the first five rows of the merged label/statistics table.
features_stats_labels.head(n=5)

Unnamed: 0,bookingID,label,Accuracy_median,acceleration_z_median,Speed_median,Acc_derived_median,acceleration_z_mvg_mean_10_median,Speed_mvg_mean_10_median,Acc_derived_mvg_mean_10_median,acceleration_z_mvg_max_10_median,Speed_mvg_max_10_median,Acc_derived_mvg_max_10_median,acc_xy_mag_mvg_max_10_median,Turning_aggression_mvg_mean_3_median,Accuracy_max,acceleration_z_max,Speed_max,Acc_derived_max,acceleration_z_mvg_mean_10_max,Speed_mvg_mean_10_max,Acc_derived_mvg_mean_10_max,acceleration_z_mvg_max_10_max,Speed_mvg_max_10_max,Acc_derived_mvg_max_10_max,acc_xy_mag_mvg_max_10_max,Turning_aggression_mvg_mean_3_max,Accuracy_mean,acceleration_z_mean,Speed_mean,Acc_derived_mean,acceleration_z_mvg_mean_10_mean,Speed_mvg_mean_10_mean,Acc_derived_mvg_mean_10_mean,acceleration_z_mvg_max_10_mean,Speed_mvg_max_10_mean,Acc_derived_mvg_max_10_mean,acc_xy_mag_mvg_max_10_mean,Turning_aggression_mvg_mean_3_mean,Accuracy_skew,acceleration_z_skew,Speed_skew,Acc_derived_skew,acceleration_z_mvg_mean_10_skew,Speed_mvg_mean_10_skew,Acc_derived_mvg_mean_10_skew,acceleration_z_mvg_max_10_skew,Speed_mvg_max_10_skew,Acc_derived_mvg_max_10_skew,acc_xy_mag_mvg_max_10_skew,Turning_aggression_mvg_mean_3_skew,Accuracy_kurt,acceleration_z_kurt,Speed_kurt,Acc_derived_kurt,acceleration_z_mvg_mean_10_kurt,Speed_mvg_mean_10_kurt,Acc_derived_mvg_mean_10_kurt,acceleration_z_mvg_max_10_kurt,Speed_mvg_max_10_kurt,Acc_derived_mvg_max_10_kurt,acc_xy_mag_mvg_max_10_kurt,Turning_aggression_mvg_mean_3_kurt
0,111669149733,0,3.9,9.720809,3.44848,0.0,9.717719,3.744537,0.0,10.002797,6.772398,0.39537,1.352431,0.040525,16.0,14.6043,19.630571,2.610508,10.309621,18.808216,1.248701,14.6043,19.630571,2.610508,3.038237,2.102505,4.113564,9.727105,5.221818,-0.004127,9.598554,5.194059,-0.005737,10.112717,6.801256,0.64075,1.300735,0.190103,2.815843,1.939339,0.637604,-0.163406,-8.404946,0.612383,-0.340644,-4.836066,0.24022,0.743736,0.168737,2.999215,13.854312,32.502215,-0.997459,2.219105,69.902297,-0.993329,0.95601,39.569285,-1.421546,-0.578764,-1.048804,9.906853
1,335007449205,1,3.9,9.355314,3.767982,0.0,9.360127,3.940652,0.0,9.948598,6.785564,0.467118,2.878437,0.086272,16.0,12.83673,19.367985,5.968924,10.158982,18.669111,1.463046,12.83673,19.367985,5.968924,9.930135,2.661065,4.107847,9.368752,6.029687,-0.002071,9.279038,6.025978,-0.000395,10.054258,7.532874,0.701202,3.006058,0.247322,3.313718,0.137904,0.505493,-0.476933,-9.465434,0.490839,-0.0506,-5.61888,0.20148,2.497805,1.218145,2.235854,18.141237,4.488076,-1.234717,11.794872,91.957747,-1.230107,2.280527,48.255113,-1.470979,9.981299,1.862738,7.11999
2,171798691856,0,165.044586,9.505242,13.054117,0.051764,9.504178,13.166324,0.051765,9.686891,13.830588,0.051765,0.948963,0.052909,1299.999023,12.335393,26.5,2.25,10.042629,24.75,0.5,12.335393,26.5,2.25,3.389709,1.938626,433.128296,9.495623,11.800266,0.001064,9.394928,11.806117,0.001064,9.825399,12.28484,0.268617,0.94189,0.159939,0.669287,-0.634685,-0.193155,-2.587775,-9.106169,-0.191479,-2.335082,-5.610053,-0.201268,2.525698,1.124778,2.841356,-1.16119,11.315533,-1.389839,31.086429,83.847591,-1.397634,9.853027,46.35057,-1.354413,7.160204,2.490172,11.329055
3,1520418422900,0,3.9,10.035115,17.030001,0.0,10.030879,16.794,0.001,10.670824,18.299999,0.615,1.176869,0.231685,8.576,12.256758,24.85,12.62,10.511911,23.706,2.482,12.256758,24.85,12.62,10.095981,3.021116,3.801566,9.976212,13.418656,0.018212,9.870102,13.333312,0.019129,10.57268,14.977608,1.542397,1.607386,0.363637,3.629934,-3.292779,-0.487352,-4.370074,-8.166342,-0.511745,-0.493689,-7.96101,-0.665717,2.884707,3.570715,2.945959,22.403572,22.15587,-1.338734,58.573857,70.694626,-1.320704,7.673153,70.458648,-1.064673,6.909385,14.759711,10.85579
4,798863917116,0,3.9,9.574132,10.074869,0.0,9.584945,9.781563,0.0,10.279267,12.915311,0.544618,3.411597,0.077093,8.0,14.782172,21.993572,4.337577,10.439303,21.452322,1.462146,14.782172,21.993572,4.337577,10.736982,1.176122,3.891488,9.57098,8.729576,0.01125,9.401142,8.618611,0.012896,10.481358,10.56118,0.755028,3.33009,0.142272,1.513316,0.610613,-0.014001,0.366457,-6.911241,-0.00506,-0.041885,-3.346408,-0.370846,1.48886,0.766029,2.473168,2.578956,7.309486,-1.307983,2.83675,47.600826,-1.28781,0.340279,20.336187,-1.189888,2.516804,1.031176,6.986181


In [9]:
# Model inputs: every column except the target (bookingID stays in so folds can be mapped back to bookings).
X = features_stats_labels.drop('label', axis=1)
# Target column, used to stratify the folds.
y = features_stats_labels.loc[:, 'label']

In [10]:
# Every booking is zero-padded to this many timesteps in the saved sequence arrays.
SEQ_LEN = 1800
# 1-based fold numbers that are NOT exported; with five folds only fold 5 is written.
FOLDS_TO_SKIP = (1, 2, 3, 4)


def _stack_booking_rows(booking_ids):
    """Return the raw timestep rows of ``booking_ids`` from ``features``, stacked in the given order."""
    # Concatenated numpy ranges avoid building a Python list with one int per row.
    positions = np.concatenate(
        [np.arange(*bking_index_details[booking_id], dtype=np.int64) for booking_id in booking_ids]
    )
    return features.iloc[positions, :].values


def _pad_to_fixed_length(booking_ids, stacked_rows):
    """Split ``stacked_rows`` back into bookings and zero-pad each to ``SEQ_LEN`` timesteps.

    ``stacked_rows`` must hold the rows of ``booking_ids`` back to back in the
    same order (as produced by ``_stack_booking_rows``). Returns a float32
    array of shape ``(len(booking_ids), SEQ_LEN, n_features)``.

    Raises ValueError if a booking has more than ``SEQ_LEN`` rows.
    """
    padded = np.zeros((len(booking_ids), SEQ_LEN, stacked_rows.shape[1]), dtype=np.float32)
    offset = 0
    for slot, booking_id in enumerate(booking_ids):
        start, end = bking_index_details[booking_id]
        length = end - start
        if length > SEQ_LEN:
            raise ValueError(f'Booking {booking_id} has {length} rows, more than SEQ_LEN={SEQ_LEN}')
        padded[slot, :length] = stacked_rows[offset:offset + length]
        offset += length
    return padded


# Loop-invariant views of the per-booking table, computed once.
booking_id_column = X['bookingID'].values
stats_matrix = X.drop(columns=['bookingID']).values
label_column = y.values

for n, (train_index, val_index) in enumerate(kfolds.split(X, y), 1):

    if n in FOLDS_TO_SKIP:
        continue

    # ---- training split: both scalers are fitted on training data only ----
    # ids, labels and stats are all selected by the same positions, so row i of
    # every saved array refers to the same booking.
    train_ids = booking_id_column[train_index]
    train_label = label_column[train_index]

    feature_scaler = StandardScaler()
    train_features = feature_scaler.fit_transform(_stack_booking_rows(train_ids))
    # Padding happens after scaling, so padded timesteps are exactly 0.
    train_data = _pad_to_fixed_length(train_ids, train_features)

    feature_stats_scaler = StandardScaler()
    train_features_stats = feature_stats_scaler.fit_transform(stats_matrix[train_index])

    np.save(f'Fold{n}_train_data.npy', train_data)
    np.save(f'Fold{n}_train_label.npy', train_label)
    np.save(f'Fold{n}_train_data_stats.npy', train_features_stats)
    # Scaler parameters are stored so the same transform can be re-applied later.
    np.save(f'Fold{n}_train_data_mean.npy',feature_scaler.mean_)
    np.save(f'Fold{n}_train_data_scale.npy',feature_scaler.scale_)
    np.save(f'Fold{n}_train_data_stats_mean.npy',feature_stats_scaler.mean_)
    np.save(f'Fold{n}_train_data_stats_scale.npy',feature_stats_scaler.scale_)

    print(f'Successfully saved training data and labels for Fold {n}')

    # ---- validation split: reuse the scalers fitted on the training split ----
    val_ids = booking_id_column[val_index]
    val_label = label_column[val_index]

    val_features = feature_scaler.transform(_stack_booking_rows(val_ids))
    val_data = _pad_to_fixed_length(val_ids, val_features)

    val_features_stats = feature_stats_scaler.transform(stats_matrix[val_index])

    np.save(f'Fold{n}_val_data.npy', val_data)
    np.save(f'Fold{n}_val_label.npy', val_label)
    np.save(f'Fold{n}_val_data_stats.npy', val_features_stats)
    print(f'Successfully saved validation data and labels for Fold {n}')

Successfully saved training data and labels for Fold 5
Successfully saved validation data and labels for Fold 5


In [11]:
# Column position -> name for the per-timestep feature matrix.
print('Feature Column, Feature Name')
for position, name in enumerate(features.columns):
    print(f'{position}  :  {name}')

# Column position -> name for the per-booking statistics.
# NOTE(review): positions are those inside features_stats_labels (bookingID and
# label occupy 0 and 1), whereas the saved *_data_stats.npy arrays exclude those
# two columns and so start at 0 -- confirm which numbering is intended.
print('\n\nFeatures Stats Column, Feature Stats Name')
for position, name in enumerate(features_stats_labels.columns):
    if name in ('bookingID', 'label'):
        continue
    print(f'{position}  :  {name}')

Feature Column, Feature Name
0  :  Accuracy
1  :  acceleration_z
2  :  Speed
3  :  Acc_derived
4  :  acceleration_z_mvg_mean_10
5  :  Speed_mvg_mean_10
6  :  Acc_derived_mvg_mean_10
7  :  acceleration_z_mvg_max_10
8  :  Speed_mvg_max_10
9  :  Acc_derived_mvg_max_10
10  :  acc_xy_mag_mvg_max_10
11  :  Turning_aggression_mvg_mean_3


Features Stats Column, Feature Stats Name
2  :  Accuracy_median
3  :  acceleration_z_median
4  :  Speed_median
5  :  Acc_derived_median
6  :  acceleration_z_mvg_mean_10_median
7  :  Speed_mvg_mean_10_median
8  :  Acc_derived_mvg_mean_10_median
9  :  acceleration_z_mvg_max_10_median
10  :  Speed_mvg_max_10_median
11  :  Acc_derived_mvg_max_10_median
12  :  acc_xy_mag_mvg_max_10_median
13  :  Turning_aggression_mvg_mean_3_median
14  :  Accuracy_max
15  :  acceleration_z_max
16  :  Speed_max
17  :  Acc_derived_max
18  :  acceleration_z_mvg_mean_10_max
19  :  Speed_mvg_mean_10_max
20  :  Acc_derived_mvg_mean_10_max
21  :  acceleration_z_mvg_max_10_max
22  :  Spe