In [55]:
import pandas as pd

train = pd.read_csv("../../data/interim/train.csv")

train.head()

Unnamed: 0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,...,v21,v22,v23,v24,v25,v26,v27,v28,amount,target
0,-0.801242,0.863213,2.262153,0.163269,0.439699,0.099186,0.509842,-1.457983,-1.11456,1.121855,...,0.172138,-0.919713,-0.294969,0.015337,0.096032,2.257601,-0.559143,-0.427364,6.87,0
1,1.888375,0.302642,-0.461681,3.715281,0.564265,1.193742,-0.297232,0.249037,-0.585925,1.436384,...,0.107427,0.559454,0.04016,0.303889,0.174219,0.106501,-0.00799,-0.053234,1.51,0
2,-0.175369,1.23042,-1.154765,-0.848643,1.160763,0.012115,0.541704,0.498493,-0.480224,-0.905377,...,-0.303549,-0.876726,0.02458,-0.442627,-0.277732,0.156373,0.093035,0.00269,9.28,0
3,-1.133432,0.858143,1.516667,-1.288041,-0.176508,-1.06705,0.743314,-0.152737,0.109759,-0.344643,...,-0.106558,-0.123656,0.049412,0.436233,-0.23572,0.713567,0.252506,0.262031,33.34,0
4,-1.592785,-0.452254,1.289814,-4.452088,-1.024463,-1.141025,0.068764,0.320863,0.252155,-1.962361,...,-0.202288,-0.366467,-0.060495,-0.167117,0.781667,-1.031136,0.198773,0.015325,118.68,0


In [47]:
from sklearn.preprocessing import KBinsDiscretizer
kbins = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')  # ‘uniform’, ‘quantile’, ‘kmeans’
train['amount_discretized'] = kbins.fit_transform(train[['amount']].values)
agg_values = train.groupby(by=['amount_discretized']).mean()
columns_to_agg = ['v1']
agg_values = agg_values[columns_to_agg]
agg_values.columns = [x + "_mean_given_amount" for x in agg_values.columns]
train = train.merge(agg_values, how='left', on=['amount_discretized'])
train.drop(['amount_discretized'], axis=1, inplace=True)
print(train.shape)
train.head()


(244934, 32)


Unnamed: 0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,...,v23,v24,v25,v26,v27,v28,amount,target,v1_mean_given_amount_x,v1_mean_given_amount_y
0,-0.801242,0.863213,2.262153,0.163269,0.439699,0.099186,0.509842,-1.457983,-1.11456,1.121855,...,-0.294969,0.015337,0.096032,2.257601,-0.559143,-0.427364,6.87,0,-0.010115,-0.010115
1,1.888375,0.302642,-0.461681,3.715281,0.564265,1.193742,-0.297232,0.249037,-0.585925,1.436384,...,0.04016,0.303889,0.174219,0.106501,-0.00799,-0.053234,1.51,0,0.093088,0.093088
2,-0.175369,1.23042,-1.154765,-0.848643,1.160763,0.012115,0.541704,0.498493,-0.480224,-0.905377,...,0.02458,-0.442627,-0.277732,0.156373,0.093035,0.00269,9.28,0,-0.010115,-0.010115
3,-1.133432,0.858143,1.516667,-1.288041,-0.176508,-1.06705,0.743314,-0.152737,0.109759,-0.344643,...,0.049412,0.436233,-0.23572,0.713567,0.252506,0.262031,33.34,0,0.18461,0.18461
4,-1.592785,-0.452254,1.289814,-4.452088,-1.024463,-1.141025,0.068764,0.320863,0.252155,-1.962361,...,-0.060495,-0.167117,0.781667,-1.031136,0.198773,0.015325,118.68,0,-0.42203,-0.42203


In [50]:
from sklearn.base import BaseEstimator, TransformerMixin
class AggByAmount(BaseEstimator, TransformerMixin):
    # Inputs: bins, encode, strategy ('uniform', 'quantile', 'kmeans'), number of top features, mean/max/min
    # Top features order: ['v1', 'v4', 'v10', 'v7', 'v18', 'v11', 'v20', 'amount', 'v3', 'v16', 'v13', 'v14', 'v8', 'v9', 'v19', 'v2', 'v5', 'v12', 'v26', 'v24', 'v25', 'v27', 'v17', 'v22', 'v23', 'v6', 'v15', 'v21']
    def __init__(self, n_bins=5, encode='ordinal', strategy='quantile', columns_to_agg=['v1']):
        self.n_bins = n_bins
        self.encode = encode
        self.strategy = strategy
        self.columns_to_agg = columns_to_agg
        self.kbins = None
        self.initial_columns = None
    def fit(self, X, y=None):
        self.kbins = KBinsDiscretizer(n_bins=self.n_bins, encode=self.encode, strategy=self.strategy)
        self.kbins.fit(X[['amount']].values)
        self.initial_columns = list(X.columns)
        return self
    def transform(self, X, y=None):
        X['amount_discretized'] = self.kbins.transform(X[['amount']].values)
        agg_values = X.groupby(by=['amount_discretized']).mean()
        agg_values = agg_values[self.columns_to_agg]
        agg_values.columns = [x + "_mean_given_amount" for x in agg_values.columns]
        X = X.merge(agg_values, how='left', on=['amount_discretized'])
        X.drop(self.initial_columns + ['amount_discretized'], axis=1, inplace=True)
        return X


In [51]:
agg_by_amount = AggByAmount()

In [52]:
agg_by_amount.fit(train)

AggByAmount(columns_to_agg=['v1'], encode='ordinal', n_bins=5,
            strategy='quantile')

In [54]:
agg_by_amount.transform(train).head()

Unnamed: 0,v1_mean_given_amount
0,-0.010115
1,0.093088
2,-0.010115
3,0.18461
4,-0.42203
