In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
class CustomAttributeAdder(BaseEstimator, TransformerMixin):
    """Add engineered columns to a scan-data DataFrame.

    ``scannedLineItemsTotal`` is always computed because several other
    features are derived from it. Every other derived column is added only
    when its name appears in ``featurelist``.

    Parameters
    ----------
    featurelist : sequence of str
        Names of the derived columns to create. Defaults to all of them.
    """

    # Ordered (column name, builder) pairs. The order fixes the order in which
    # new columns are appended to the frame. Each builder receives the working
    # frame, which already contains 'scannedLineItemsTotal'.
    _DERIVED_FEATURES = (
        # Per-line-item ratios.
        # NOTE(review): a zero 'scannedLineItemsTotal' yields inf/NaN here.
        ('valuePerLineItem',
         lambda X: X['grandTotal'] / X['scannedLineItemsTotal']),
        ('quantityModificationsPerLineItem',
         lambda X: X['quantityModifications'] / X['scannedLineItemsTotal']),

        # Interesting features from the feature_engineering notebook:
        # pairwise products (polynomial features).
        ('totalScanTimeInSeconds*lineItemVoids',
         lambda X: X['totalScanTimeInSeconds'] * X['lineItemVoids']),
        ('totalScanTimeInSeconds*scansWithoutRegistration',
         lambda X: X['totalScanTimeInSeconds'] * X['scansWithoutRegistration']),
        ('totalScanTimeInSeconds*scannedLineItemsTotal',
         lambda X: X['totalScanTimeInSeconds'] * X['scannedLineItemsTotal']),
        ('lineItemVoids*scansWithoutRegistration',
         lambda X: X['lineItemVoids'] * X['scansWithoutRegistration']),

        # Division features. Be careful with division by zero: right now the
        # only divisor is trustLevel, which the original author notes is
        # never zero.
        ('totalScanTimeInSeconds/trustLevel',
         lambda X: X['totalScanTimeInSeconds'] / X['trustLevel']),
        ('lineItemVoids/trustLevel',
         lambda X: X['lineItemVoids'] / X['trustLevel']),
        ('scansWithoutRegistration/trustLevel',
         lambda X: X['scansWithoutRegistration'] / X['trustLevel']),
        ('scannedLineItemsTotal/trustLevel',
         lambda X: X['scannedLineItemsTotal'] / X['trustLevel']),

        # Log features.
        # NOTE(review): np.log gives -inf for zero input; confirm that
        # grandTotal is strictly positive in the data.
        ('trustLevel_Log', lambda X: np.log(X['trustLevel'])),
        ('grandTotal_Log', lambda X: np.log(X['grandTotal'])),

        # Square features.
        ('quantityModifications_Square',
         lambda X: np.square(X['quantityModifications'])),
        ('scannedLineItemsTotal_Square',
         lambda X: np.square(X['scannedLineItemsTotal'])),
    )

    # The default is an immutable tuple so that it cannot be mutated and
    # thereby shared between instances.
    def __init__(self, featurelist=('scannedLineItemsTotal',
                                    'valuePerLineItem',
                                    'quantityModificationsPerLineItem',
                                    'totalScanTimeInSeconds*lineItemVoids',
                                    'totalScanTimeInSeconds*scansWithoutRegistration',
                                    'totalScanTimeInSeconds*scannedLineItemsTotal',
                                    'lineItemVoids*scansWithoutRegistration',
                                    'totalScanTimeInSeconds/trustLevel',
                                    'lineItemVoids/trustLevel',
                                    'scansWithoutRegistration/trustLevel',
                                    'scannedLineItemsTotal/trustLevel',
                                    'trustLevel_Log',
                                    'grandTotal_Log',
                                    'quantityModifications_Square',
                                    'scannedLineItemsTotal_Square')):
        # The attribute must be named exactly like the constructor argument
        # and stored unmodified: grid search sets parameters through the keys
        # of get_params(), i.e. "featurelist" (not "_featurelist").
        self.featurelist = featurelist

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected derived columns appended.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame holding the raw columns read below, e.g.
            ``scannedLineItemsPerSecond`` and ``totalScanTimeInSeconds``.

        Returns
        -------
        pandas.DataFrame
            A new frame; the caller's ``X`` is left untouched.
        """
        # Work on a copy so the caller's frame is not silently modified (and
        # so that slices of a larger frame do not trigger chained-assignment
        # warnings).
        X = X.copy()

        # Always needed: it is an input for several of the features below.
        X['scannedLineItemsTotal'] = (
            X['scannedLineItemsPerSecond'] * X['totalScanTimeInSeconds']
        )

        for column, build in self._DERIVED_FEATURES:
            if column in self.featurelist:
                X[column] = build(X)

        return X

In [11]:
# Instantiate the transformer with its default featurelist.
test = CustomAttributeAdder()

In [14]:
import pandas as pd
import numpy as np
# Load the pipe-delimited training and test sets, in that order.
df_train, df_test = (
    pd.read_csv(csv_path, sep='|') for csv_path in ('train.csv', 'test.csv')
)

In [15]:
# Run the transformer on the training frame. As the last expression of the
# cell, the returned DataFrame is what the notebook displays below.
test.transform(df_train)

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud,...,totalScanTimeInSeconds*scannedLineItemsTotal,lineItemVoids*scansWithoutRegistration,totalScanTimeInSeconds/trustLevel,lineItemVoids/trustLevel,scansWithoutRegistration/trustLevel,scannedLineItemsTotal/trustLevel,trustLevel_Log,grandTotal_Log,quantityModifications_Square,scannedLineItemsTotal_Square
0,5,1054,54.70,7,0,3,0.027514,0.051898,0.241379,0,...,30566.0,0,210.800000,1.400000,0.000000,5.800000,1.609438,4.001864,9,841.0
1,3,108,27.36,5,2,4,0.129630,0.253333,0.357143,0,...,1512.0,10,36.000000,1.666667,0.666667,4.666667,1.098612,3.309082,16,196.0
2,3,1516,62.16,3,10,5,0.008575,0.041003,0.230769,0,...,19708.0,30,505.333333,1.000000,3.333333,4.333333,1.098612,4.129712,25,169.0
3,6,1791,92.31,8,4,4,0.016192,0.051541,0.275862,0,...,51939.0,32,298.500000,1.333333,0.666667,4.833333,1.791759,4.525152,16,841.0
4,5,430,81.53,3,7,2,0.062791,0.189605,0.111111,0,...,11610.0,21,86.000000,0.600000,1.400000,5.400000,1.609438,4.400971,4,729.0
5,1,770,11.09,11,5,2,0.033766,0.014403,0.423077,1,...,20020.0,55,770.000000,11.000000,5.000000,26.000000,0.000000,2.406044,4,676.0
6,3,294,55.63,2,7,1,0.037415,0.189218,0.181818,0,...,3234.0,14,98.000000,0.666667,2.333333,3.666667,1.098612,4.018723,1,121.0
7,2,1545,22.80,0,8,4,0.006472,0.014757,0.000000,0,...,15450.0,0,772.500000,0.000000,4.000000,5.000000,0.693147,3.126761,16,100.0
8,6,962,65.44,7,0,2,0.028067,0.068025,0.259259,0,...,25974.0,0,160.333333,1.166667,0.000000,4.500000,1.791759,4.181134,4,729.0
9,2,725,41.08,10,2,4,0.037241,0.056662,0.370370,0,...,19575.0,20,362.500000,5.000000,1.000000,13.500000,0.693147,3.715521,16,729.0
