# Feature Engineering

This notebook contains the feature engineering, the training/validation split and the chunking of the training data into 10 subsets. Engineering many features requires a lot of RAM; therefore a large number of features is created and saved here, and feature selection is done later on.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

sns.set()

import gc

In [2]:
# Directory that holds the competition archives; the loading cells below append
# the file names (`train.csv.zip`, `test.csv.zip`, `test_supplement.csv.zip`) to it.
path = '~/.kaggle/competitions/talkingdata-adtracking-fraud-detection/'

# Load and downcast data types

The datasets of this competition are very large, so downcasting the data types reduces the RAM requirements considerably.

In [None]:
def downcast_dtypes(df):
    '''
        Downcasts column types of the dataframe (in place) to reduce RAM usage:

                `float64` type to `float32`
                `int64`   type to `int32`
                `device`, `os`, `channel` to `int16` (only if the column exists)
                `is_attributed`, `day`    to `int8`  (only if the column exists)

        The dataframe is modified in place and also returned, so both
        `downcast_dtypes(df)` and `df = downcast_dtypes(df)` work.
    '''
    # Generic downcast. Columns are replaced with `df[c] = ...` on purpose:
    # `df.loc[:, c] = ...` writes into the existing column and, on recent pandas
    # versions, keeps the old (wide) dtype, which would defeat the downcast.
    for c in list(df.columns):
        if df[c].dtype == "float64":
            df[c] = df[c].astype(np.float32)
        elif df[c].dtype == "int64":
            df[c] = df[c].astype(np.int32)

    # Columns that are cast to an even narrower type than the generic rule above.
    narrow_dtypes = {
        'device': np.int16,
        'os': np.int16,
        'channel': np.int16,
        'is_attributed': np.int8,
        'day': np.int8,
    }
    for c, dtype in narrow_dtypes.items():
        # Missing columns are skipped so the function also works on partial frames.
        if c in df.columns and df[c].dtype != dtype:
            df[c] = df[c].astype(dtype)

    return df

In [5]:
cols_to_read = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']

df_train = pd.read_csv(path+'train.csv.zip', usecols=cols_to_read)#, nrows=6000000)
# Replace the column instead of writing through `.loc[:, ...]`: an in-place write
# keeps the original object dtype on recent pandas versions, which breaks the
# `.dt` accessor used by the time features later on.
df_train['click_time'] = pd.to_datetime(df_train['click_time'])
df_train = downcast_dtypes(df_train)
print('shape df_train: ' + str(df_train.shape))
df_train.head()

shape df_train: (184903890, 7)


Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed
0,83230,3,1,13,379,2017-11-06 14:32:21,0
1,17357,3,1,19,379,2017-11-06 14:33:34,0
2,35810,3,1,13,379,2017-11-06 14:34:12,0
3,45745,14,1,13,478,2017-11-06 14:34:52,0
4,161007,3,1,13,379,2017-11-06 14:35:08,0


In [6]:
# Same columns as for the training data, minus the target (not read for the test files).
cols_to_read = ['ip', 'app', 'device', 'os', 'channel', 'click_time']

df_test = pd.read_csv(path+'test.csv.zip', usecols=cols_to_read)#, nrows=1000000)
# Replace the column instead of writing through `.loc[:, ...]`: an in-place write
# keeps the original object dtype on recent pandas versions, which breaks the
# `.dt` accessor used by the time features later on.
df_test['click_time'] = pd.to_datetime(df_test['click_time'])
df_test = downcast_dtypes(df_test)
print('shape df_test: ' + str(df_test.shape))
df_test.head()

shape df_test: (18790469, 6)


Unnamed: 0,ip,app,device,os,channel,click_time
0,5744,9,1,3,107,2017-11-10 04:00:00
1,119901,9,1,3,466,2017-11-10 04:00:00
2,72287,21,1,19,128,2017-11-10 04:00:00
3,78477,15,1,13,111,2017-11-10 04:00:00
4,123080,12,1,13,328,2017-11-10 04:00:00


In [7]:
# Reuses `cols_to_read` from the previous cell (test files are read without the target).
df_test_supplement = pd.read_csv(path+'test_supplement.csv.zip', usecols=cols_to_read)#, nrows=50000)
# Replace the column instead of writing through `.loc[:, ...]`: an in-place write
# keeps the original object dtype on recent pandas versions, which breaks the
# `.dt` accessor used by the time features later on.
df_test_supplement['click_time'] = pd.to_datetime(df_test_supplement['click_time'])
df_test_supplement = downcast_dtypes(df_test_supplement)
print('shape df_test_supplement: ' + str(df_test_supplement.shape))
df_test_supplement.head()

shape df_test_supplement: (57537505, 6)


Unnamed: 0,ip,app,device,os,channel,click_time
0,43570,3,1,18,379,2017-11-09 14:23:39
1,80528,3,1,13,379,2017-11-09 14:23:51
2,32323,3,1,13,379,2017-11-09 14:25:57
3,42887,3,1,17,379,2017-11-09 14:26:03
4,119289,58,1,30,120,2017-11-09 14:26:41


In [8]:
def _add_hour_and_day(frame):
    '''
        Adds the `hour` and `day` of `click_time` as int8 columns and
        returns the result of `downcast_dtypes` for the frame.
    '''
    clicks = frame.click_time.dt
    frame['hour'] = clicks.hour.astype(np.int8)
    frame['day'] = clicks.day.astype(np.int8)
    return downcast_dtypes(frame)

# Same treatment for all three datasets.
df_train = _add_hour_and_day(df_train)
df_test = _add_hour_and_day(df_test)
df_test_supplement = _add_hour_and_day(df_test_supplement)
    
def add_time_features(df):
    '''
        Adds time-of-day features derived from `click_time` to `df` (in place):

                `minute_of_day`  minutes since midnight (0..1439), uint16
                `timeframe_11`   index of the 11 minute window of the day, uint8
                `timeframe_17`   index of the 17 minute window of the day, uint8

        Nothing is returned; the passed dataframe itself is modified.
    '''
    clicks = df['click_time'].dt
    minute_of_day = clicks.hour * 60 + clicks.minute

    # Assign the already-downcast series directly. Casting afterwards through
    # `df.loc[:, col] = ...` keeps the wide dtype on recent pandas versions.
    df['minute_of_day'] = minute_of_day.astype(np.uint16)
    # 1439 // 11 == 130 and 1439 // 17 == 84, so both window indices fit into uint8.
    df['timeframe_11'] = (minute_of_day // 11).astype(np.uint8)
    df['timeframe_17'] = (minute_of_day // 17).astype(np.uint8)

# The time features are added in place to each of the three datasets.
for _frame in (df_train, df_test, df_test_supplement):
    add_time_features(_frame)
# Drop the loop variable so it does not keep the last dataframe referenced.
del _frame

# Feature Engineering

Given the nature of the dataset (a small number of nominal categorical columns), creating features by grouping and applying different aggregation functions proved to be most useful. Useful aggregation functions were `count`, `nunique`, `var`, `mean` and `cumcount`. In addition, the typically used feature `hour` is a somewhat arbitrary timeframe: any other timeframe makes as much sense as `hour` does, so other timeframes are worth trying. In this case the timeframes tried were 11 and 17 minutes.

In competitions the problem statement is slightly different from real-world projects. Instead of finding the model that scores best on unseen data, one has to find the model that scores best on the private subset of the test set. This allows using the test set for feature engineering, which was absolutely necessary in this competition for obtaining a good score. Furthermore, one should use `test_supplement.csv` instead of `test.csv` for as many features as possible, because `test.csv` is actually an incomplete subset of it.

In [None]:
# Number of training rows: the first `len_train` rows of the combined frame are the training data.
len_train = len(df_train)

# Training rows first, then the supplementary test rows.
df_concat = pd.concat((df_train, df_test_supplement))

# The separate frames are no longer needed under their own names.
del df_train
del df_test_supplement

In [None]:
# Names of the engineered feature columns; the feature functions below append to this list.
predictors = []

## GroupBy Aggregate features

In [None]:
def _smallest_signed_int_dtype(max_value):
    '''
        Returns the narrowest signed numpy integer dtype that can hold `max_value`
        (a signed type is needed because missing values are filled with -1).
    '''
    for dtype in (np.int8, np.int16, np.int32):
        if max_value <= np.iinfo(dtype).max:
            return dtype
    return np.int64


def add_features(df, df_test, predictors, gby_cols=None, save_dir='Saves'):
    '''
        Adds group-by aggregate features to `df` and `df_test`.

        For every spec the column `agg` is aggregated with `kind` over the groups
        given by `gby`. The aggregate is always computed on `df` and then merged
        (left join on the group columns) into both `df` and `df_test`. Each
        aggregate is stored as `<save_dir>/<feature name>.csv` and read from there
        if the file already exists.

        Parameters:
                `df`          dataframe the aggregates are computed on
                `df_test`     second dataframe that receives the same features
                `predictors`  list the new feature names are appended to
                `gby_cols`    list of dicts with the keys `gby` (list of columns),
                              `agg` (column) and `kind` (`count`, `mean`, `var` or
                              `nunique`); defaults to the feature set of this notebook
                `save_dir`    directory of the csv files (created if missing)

        Returns the tuple `(df, df_test, predictors)`. The dataframes are new
        objects (result of `merge`), `predictors` is the passed list.
    '''
    if gby_cols is None:
        gby_cols = [
                    {'gby': ['ip'],                     'agg': 'channel', 'kind': 'nunique'},
                    {'gby': ['ip', 'day'],              'agg': 'hour',    'kind': 'nunique'},
                    {'gby': ['ip'],                     'agg': 'app',     'kind': 'nunique'},
                    {'gby': ['ip', 'app'],              'agg': 'os',      'kind': 'nunique'},
                    {'gby': ['ip'],                     'agg': 'device',  'kind': 'nunique'},
                    {'gby': ['app'],                    'agg': 'channel', 'kind': 'nunique'},
                    {'gby': ['ip', 'device', 'os'],     'agg': 'app',     'kind': 'nunique'},
                    {'gby': ['ip', 'day', 'hour'],      'agg': 'channel', 'kind': 'count'},
                    {'gby': ['ip', 'app'],              'agg': 'channel', 'kind': 'count'},
                    {'gby': ['ip', 'app', 'os'],        'agg': 'channel', 'kind': 'count'},
                    {'gby': ['ip', 'day', 'channel'],   'agg': 'hour',    'kind': 'var'},
                    {'gby': ['ip', 'app', 'os'],        'agg': 'hour',    'kind': 'var'},
                    {'gby': ['ip', 'app', 'channel'],   'agg': 'day',     'kind': 'var'},
                    {'gby': ['ip', 'app', 'channel'],   'agg': 'hour',    'kind': 'mean'},

                    {'gby': ['ip', 'day', 'timeframe_11'],   'agg': 'channel',  'kind': 'count'},
                    {'gby': ['ip', 'day', 'timeframe_17'],   'agg': 'channel',  'kind': 'count'},

                    {'gby': ['ip', 'day', 'channel'],   'agg': 'timeframe_11',    'kind': 'var'},
                    {'gby': ['ip', 'day', 'channel'],   'agg': 'timeframe_17',    'kind': 'var'},

                    {'gby': ['ip', 'app', 'os'],        'agg': 'timeframe_11',    'kind': 'var'},
                    {'gby': ['ip', 'app', 'os'],        'agg': 'timeframe_17',    'kind': 'var'},

                    {'gby': ['ip', 'app', 'channel'],   'agg': 'timeframe_11',    'kind': 'mean'},
                    {'gby': ['ip', 'app', 'channel'],   'agg': 'timeframe_17',    'kind': 'mean'},

                    {'gby': ['ip'],   'agg': 'os',    'kind': 'count'}
                ]

    # Create the save directory up front: otherwise a missing directory is only
    # noticed when the first (expensive) aggregate is written.
    os.makedirs(save_dir, exist_ok=True)

    for gby_col in gby_cols:

        keys, agg, kind = gby_col['gby'], gby_col['agg'], gby_col['kind']
        if kind not in ('count', 'mean', 'var', 'nunique'):
            raise ValueError('Unsupported aggregation kind: ' + str(kind))

        colname = agg + '_' + kind + '_gby_' + '_'.join(keys)
        predictors.append(colname)

        print('Processing ' + colname)
        filename = os.path.join(save_dir, '{}.csv'.format(colname))

        if os.path.exists(filename):
            gp = pd.read_csv(filename)
        else:
            # Only the needed columns are selected before grouping to keep the copy small.
            grouped = df[keys + [agg]].groupby(by=keys)[agg]
            gp = getattr(grouped, kind)().reset_index().rename(columns={agg: colname})
            gp.to_csv(filename, index=False)

        df = df.merge(gp, on=keys, how='left')
        df_test = df_test.merge(gp, on=keys, how='left')

        if kind in ('count', 'nunique'):
            # downcast data types; groups unknown to `gp` get -1 instead of NaN.
            # The columns are replaced by name because writing through
            # `.iloc[:, -1:] = ...` keeps the wide dtype on recent pandas versions.
            dtype = _smallest_signed_int_dtype(df[colname].max())
            df[colname] = df[colname].fillna(-1).astype(dtype)
            df_test[colname] = df_test[colname].fillna(-1).astype(dtype)

        del gp
        gc.collect()

    return df, df_test, predictors

# Aggregates are computed on train + test_supplement (`df_concat`) and merged into `df_test` as well.
df_concat, df_test, predictors = add_features(df_concat, df_test, predictors)

In [None]:
# The first `len_train` rows of the combined frame are the training rows.
df_train = df_concat.iloc[:len_train]

# The supplementary test rows are not needed any more, so the combined frame is dropped.
del df_concat
gc.collect()

In [None]:
# Row counts of both parts; `len_train` is used later to split the combined frame again.
len_train, len_test = len(df_train), len(df_test)

# Training rows first, then the test rows, with a fresh 0..n-1 index.
df_concat = pd.concat((df_train, df_test)).reset_index(drop=True)
del df_test
del df_train
gc.collect()

df_concat.shape

In [None]:
def add_cumcount_features(df, predictors, gby_cols=None, save_dir='Saves'):
    '''
        Adds cumulative-count features to `df`: for every spec each row gets its
        running number (starting at 0) within the group given by `gby`.

        Every feature is stored as a single headerless column in
        `<save_dir>/<feature name>.csv` and read from there if the file exists and
        has as many rows as `df`; otherwise it is (re)computed and saved.

        Parameters:
                `df`          dataframe the features are added to (in place)
                `predictors`  list the new feature names are appended to
                `gby_cols`    list of dicts with the keys `gby`, `agg` and `kind`;
                              defaults to the feature set of this notebook
                `save_dir`    directory of the csv files (created if missing)

        Returns the tuple `(df, predictors)`.
    '''
    if gby_cols is None:
        gby_cols = [
                    {'gby': ['ip', 'device', 'os'],     'agg': 'app',     'kind': 'cumcount'},
                    {'gby': ['ip'],                     'agg': 'os',      'kind': 'cumcount'}
                   ]

    os.makedirs(save_dir, exist_ok=True)

    for gby_col in gby_cols:

        keys, agg = gby_col['gby'], gby_col['agg']

        colname = agg + '_' + gby_col['kind'] + '_gby_' + '_'.join(keys)
        predictors.append(colname)

        print('Processing ' + colname)
        filename = os.path.join(save_dir, '{}.csv'.format(colname))

        counts = None
        if os.path.exists(filename):
            cached = pd.read_csv(filename, header=None)
            if len(cached) == len(df):
                # Taken positionally, so the index of `df` does not matter.
                counts = cached.iloc[:, 0].values
            else:
                print('\t save file does not match the data, recomputing')
            del cached

        if counts is None:
            counts = df[keys + [agg]].groupby(by=keys)[agg].cumcount().values
            # Written in the headerless single-column layout the loading branch expects.
            pd.Series(counts).to_csv(filename, header=False, index=False)

        # Same dtype no matter whether the values were loaded or computed.
        df[colname] = counts.astype(np.int32)

        del counts
        gc.collect()

    return df, predictors

# Adds the cumulative-count columns to the combined frame and records their names in `predictors`.
df_concat, predictors = add_cumcount_features(df_concat, predictors)

## Next click features

Next-click features have been shown by various kernels to be among the most important features for this dataset. Therefore they are added here as well.

In [None]:
def add_next_click_features(df, predictors, save_dir='Saves'):
    '''
        Adds next-click features for the combination of `ip`, `app`, `device` and `os`.

        New columns (added to `df` in place):

                `epochtime`         `click_time` as whole seconds since 1970-01-01
                `category`          hash of (`ip`, `app`, `device`, `os`) modulo 2**26
                `next_click`        seconds until the following row with the same
                                    `category`; rows without a following click get
                                    3000000000 - `epochtime`, int32
                `next_click_shift`  `next_click` of the previous row, float32

        `next_click` and `category` are stored in `<save_dir>/next_click.csv` and
        loaded from there if the file exists and has as many rows as `df`.

        Returns the tuple `(df, predictors)`; both new feature names are appended
        to `predictors`.
    '''
    print('Processing next_click')

    new_feature = 'next_click'
    filename = os.path.join(save_dir, 'next_click.csv')
    key_cols = ['ip', 'app', 'device', 'os']
    D = 2**26
    # Stand-in "time of the next click" for the last click of a category.
    no_next_click = 3000000000

    # Created in both branches: later cells drop this column, so it has to exist
    # even when the feature itself comes from the save file. The Timedelta division
    # does not depend on the resolution of the datetime column.
    df['epochtime'] = ((df['click_time'] - pd.Timestamp('1970-01-01'))
                       // pd.Timedelta(seconds=1)).astype(np.int64)

    next_clicks = None
    if os.path.exists(filename):
        print('loading from save file')
        df_temp = pd.read_csv(filename)
        if len(df_temp) == len(df):
            # Assigned positionally so the index of `df` does not matter.
            next_clicks = df_temp['QQ'].values
            df['category'] = df_temp['category'].values
        else:
            print('save file does not match the data, recomputing')
        del df_temp

    if next_clicks is None:
        # `hash_pandas_object` is deterministic, unlike the built-in `hash` of a
        # string, which is salted per Python process.
        hashed = pd.util.hash_pandas_object(df[key_cols], index=False) % np.uint64(D)
        df['category'] = hashed.values.astype(np.int64)
        del hashed

        # Time of the following row of the same category (NaN if there is none).
        following = df.groupby('category')['epochtime'].shift(-1)
        next_clicks = (following.fillna(no_next_click).values
                       - df['epochtime'].values).astype(np.int64)
        del following

        print('saving')
        os.makedirs(save_dir, exist_ok=True)
        pd.DataFrame({'QQ': next_clicks, 'category': df['category'].values}) \
          .to_csv(filename, index=False)

    df[new_feature] = next_clicks.astype(np.int32)
    predictors.append(new_feature)

    df[new_feature+'_shift'] = pd.Series(next_clicks).astype(np.float32).shift(+1).values
    predictors.append(new_feature+'_shift')

    del next_clicks
    gc.collect()

    return df, predictors

# Adds `next_click`, `next_click_shift` and the helper column `category` to the combined frame.
df_concat, predictors = add_next_click_features(df_concat, predictors)

In [None]:
# Calculate the time to next click for each group
for spec in [{'groupby': ['ip']}, {'groupby': ['ip', 'app']}, {'groupby': ['ip', 'channel']}, 
             {'groupby': ['ip', 'os']}]:
    
    # Name of new feature
    new_feature = '{}_nextClick'.format('_'.join(spec['groupby']))    
    
    # Run calculation
    print(">> Grouping by {}, and saving time to next click in: {}".format(spec['groupby'], new_feature))
    
    # Click time of the following row of the same group (NaT for the last row of a group).
    # This equals `transform(lambda x: x.diff().shift(-1))` after the subtraction below,
    # but is computed once and without calling a Python function per group.
    next_time = df_concat.groupby(spec['groupby'])['click_time'].shift(-1)
    
    # NOTE(review): `.dt.seconds` is only the seconds component of the difference
    # (whole days are ignored) and float16 cannot represent values above 65504;
    # both are kept unchanged here so the feature values stay the same - confirm intent.
    df_concat[new_feature] = (next_time - df_concat['click_time']).dt.seconds.astype(np.float16)
    
# Release the last intermediate before collecting.
del next_time
gc.collect()

In [None]:
# Undo the concatenation: training rows come first, the test rows follow.
df_train = df_concat.iloc[:len_train]
df_test = df_concat.iloc[len_train:]

print(df_train.shape)
print(df_test.shape)

del df_concat
gc.collect()

In [None]:
# `df_valid` is deliberately not handled here: it does not exist yet at this point
# (it is split off from `df_train` in the train / validation split below) and
# therefore gets `category_mod16` from `df_train` automatically.

# `epochtime` is dropped first; `errors='ignore'` keeps the cell re-runnable when the
# column is already gone.
df_train = df_train.drop(columns='epochtime', errors='ignore')
# Coarser version of `category`: only the lowest 16 bits are kept.
df_train['category_mod16'] = (df_train['category'] % 2**16).astype(np.int32)

df_test = df_test.drop(columns='epochtime', errors='ignore')
df_test['category_mod16'] = (df_test['category'] % 2**16).astype(np.int32)

# Train / validation split 

As proposed and reasoned in `ExploratoryDataAnalysis.ipynb`, the training/validation split will be a time-based split (based on the private subset of the test set) combined with a selection of the hours occurring in the private subset of the test set.

In [None]:
from sklearn.model_selection import train_test_split

# Everything before this point in time is used for training only.
split_date = pd.Timestamp(year=2017, month=11, day=9, hour=4)
# Number of rows that are held out for validation.
valid_size = 5000000

df_train = downcast_dtypes(df_train)

srs_mask = df_train.click_time < split_date

df_valid = df_train.loc[~srs_mask, :]
df_train = df_train.loc[srs_mask, :]

# From the rows on or after `split_date`, `valid_size` random rows become the
# validation set; the remaining rows (`df_train_val`) go back into the training data.
# An integer `test_size` is an absolute row count, which avoids the rounding of a
# float fraction; `random_state` makes the split reproducible.
df_train_val, df_valid = train_test_split(df_valid, test_size=valid_size, random_state=1)

del srs_mask
gc.collect()

print('train shape: ' + str(df_train.shape))
print('train_val shape: ' + str(df_train_val.shape))
print('valid shape: ' + str(df_valid.shape))

In [None]:
# Append the rows that were split off but not selected for validation to the training data.
df_train = pd.concat((df_train, df_train_val))

del df_train_val
gc.collect()

print('train shape: ' + str(df_train.shape))

In [None]:
# Downcast binary target column
# The columns are replaced (`df[col] = ...`) because writing through `.loc[:, col]`
# keeps the previous dtype on recent pandas versions.
df_train['is_attributed'] = df_train['is_attributed'].astype(np.int8)
df_valid['is_attributed'] = df_valid['is_attributed'].astype(np.int8)
# Missing targets in the test rows are filled with 0 so the column can be an integer.
df_test['is_attributed'] = df_test['is_attributed'].fillna(0).astype(np.int8)

# Chunking and target mean encodings

The training data will be split into 10 random chunks for faster loading, feature selection and hyperparameter tuning. In addition, target mean encodings are computed. To avoid overfitting on these encodings, they are computed out-of-fold using cross-validation. Due to the large target imbalance in the dataset, the cross-validation is stratified.

The data is saved in HDF5 files to allow faster data loading.

In [None]:
# Give both frames a fresh 0..n-1 index. The fold indices produced by the
# cross-validation below are positions and are used with `.loc`, so labels and
# positions have to coincide.
df_train.reset_index(drop=True, inplace=True)
df_valid.reset_index(drop=True, inplace=True)

In [None]:
# Downcast all three datasets once more before the encodings are added.
df_train, df_valid, df_test = (downcast_dtypes(frame) for frame in (df_train, df_valid, df_test))

In [None]:
from sklearn.model_selection import StratifiedKFold

cols_to_encode = ['app', 'device', 'os', 'channel', 'hour', 'timeframe_11', 'timeframe_17', 'category_mod16']
n_splits=10

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1)


def _add_fold_share(frame, name, fold_values, first_fold):
    '''
        Accumulates `fold_values` in the float16 column `frame[name]`: the column is
        (re)created on the first fold and added to on every following fold.
    '''
    if first_fold:
        frame[name] = fold_values.astype(np.float16)
    else:
        frame[name] = (frame[name] + fold_values).astype(np.float16)


with pd.HDFStore('store_enc_chunks.h5',  mode='w') as store:
    
    for i, (train_index, valid_index) in enumerate(skf.split(df_train, df_train.is_attributed)):
        print('Starting iteration ' + str(i))
        
        # target mean encodings
        for col in cols_to_encode:
            print('\t Processing mean encoding for ' + col)
            name = 'target_mean_' + col
            
            # Mean target per category, computed on the other folds only.
            df_train_gby_col = df_train.loc[train_index, [col, 'is_attributed']].groupby(col) \
                                   .is_attributed.mean()
            
            # The rows of the current fold get the out-of-fold encoding.
            df_train.loc[valid_index, name] = df_train.loc[valid_index, col].map(df_train_gby_col) \
                                                      .astype(np.float16)
            if df_train[name].dtype != np.float16:
                df_train[name] = df_train[name].astype(np.float16)

            # Validation and test rows get the average of the encodings of all folds.
            for frame in (df_valid, df_test):
                _add_fold_share(frame, name, frame[col].map(df_train_gby_col) / n_splits, i == 0)

        del df_train_gby_col
        gc.collect()

        # frequency features
        for col in ['ip', 'app', 'device', 'os', 'channel']:
            print('\t Processing frequency encoding for column ' + col)
            name = col + '_freq'

            # Get counts, sums and frequency of is_attributed
            # (only the two needed columns are selected, so the other folds are
            # not copied with all their columns for every encoded column)
            df_train_freq = df_train.loc[train_index, [col, 'is_attributed']].groupby(col)['is_attributed'] \
                                    .agg(['sum', 'count']).rename(columns={'sum': 'sums', 'count': 'counts'})
            df_train_freq['freq'] = (df_train_freq.sums / df_train_freq.counts).astype(np.float16)

            # If we have 3 or fewer observations, e.g. for an IP, then assume freq of 0
            df_train_freq.loc[df_train_freq.counts <= 3, 'freq'] = 0        

            df_train.loc[valid_index, name] = df_train.loc[valid_index, col].map(df_train_freq['freq']) \
                                                      .astype(np.float16)
            if df_train[name].dtype != np.float16:
                df_train[name] = df_train[name].astype(np.float16)
            
            for frame in (df_valid, df_test):
                _add_fold_share(frame, name, frame[col].map(df_train_freq['freq']) / n_splits, i == 0)
                
        del df_train_freq
        gc.collect()
        
        # The rows of the current fold are complete now and are stored as one chunk.
        df_train = downcast_dtypes(df_train)
        store.append('df_train_chunk' + str(i), df_train.loc[valid_index, :], 
                     data_columns=df_train.columns, format='table')
        
    df_valid = downcast_dtypes(df_valid)
    store.append('df_valid', df_valid, data_columns=df_valid.columns, format='table')
    
    df_test = downcast_dtypes(df_test)
    store.append('df_test', df_test, data_columns=df_test.columns, format='table')

# Only the in-memory frames lose the encoded source columns; the stored chunks keep them.
df_train = df_train.drop(cols_to_encode, axis=1)
df_train = downcast_dtypes(df_train)

df_test = df_test.drop(cols_to_encode, axis=1)
df_test = downcast_dtypes(df_test)

df_train.head(n=2)

In [12]:
# Overview of all columns of the training data held in memory.
df_train.columns

Index(['app_nunique_gby_ip', 'app_nunique_gby_ip_device_os',
       'channel_count_gby_ip_app', 'channel_count_gby_ip_app_os',
       'channel_count_gby_ip_day_hour',
       'channel_count_gby_ip_day_timeframe_11',
       'channel_count_gby_ip_day_timeframe_17', 'channel_nunique_gby_app',
       'channel_nunique_gby_ip', 'click_time', 'day',
       'day_var_gby_ip_app_channel', 'device_nunique_gby_ip',
       'hour_mean_gby_ip_app_channel', 'hour_nunique_gby_ip_day',
       'hour_var_gby_ip_app_os', 'hour_var_gby_ip_day_channel', 'ip',
       'is_attributed', 'minute_of_day', 'os_count_gby_ip',
       'os_nunique_gby_ip_app', 'timeframe_11_mean_gby_ip_app_channel',
       'timeframe_11_var_gby_ip_app_os', 'timeframe_11_var_gby_ip_day_channel',
       'timeframe_17_mean_gby_ip_app_channel',
       'timeframe_17_var_gby_ip_app_os', 'timeframe_17_var_gby_ip_day_channel',
       'app_cumcount_gby_ip_device_os', 'os_cumcount_gby_ip', 'category',
       'next_click', 'next_click_shift', 'ip_