In [13]:
import pandas as pd
import numpy as np
import farmhash
import re
import os
import joblib
from preprocessing import DateDecomp,Aggregator,AggAmtperTxn,FarmHash,FraudFreq,ValueLength,WeekCategory,PurchaseType,SelectFeatures,ReplaceNaN

from feature_engine.encoding import (
    OneHotEncoder
)
from feature_engine.transformation import (
    LogTransformer
)
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score
from sklearn.pipeline import Pipeline

In [10]:
###### Load data ##########

train = pd.read_csv("fraudTrain.csv")
train_target = train.is_fraud
train.drop(columns = ["Unnamed: 0","is_fraud"], inplace = True)

test = pd.read_csv("fraudTest.csv")
test_target = test.is_fraud
test.drop(columns = ["Unnamed: 0","is_fraud"], inplace = True)


In [11]:
######## Define Variables####

DATE_TRANSFORMATION = ["trans_date_trans_time","dob"]
AVG_DAY_AMT_AGGREGATION =  ["cc_num","gender"]
STD_YEAR_AMT_AGGREGATION = ["category"]
AMT_PER_TXN_DAY_AGGREGATION = ["cc_num"]
FRAUD_FREQ_MAPPER = ["trans_date_trans_time_hour"]
FARMHASH_MAPPER = ["category"]
LOG_TRANSFORM = ['cc_num_mean_caldate_sum_amt', 'gender_mean_caldate_sum_amt', 'category_std_year_amt_per_txn', 'cc_num_mean_caldate_amt_per_txn']
COLS_SELECTION = ['category', 'amt', 'trans_date_trans_time_hour', 'dob_year', 'cc_num_mean_caldate_sum_amt', 'gender_mean_caldate_sum_amt', 'category_std_year_amt_per_txn', 'cc_num_mean_caldate_amt_per_txn', 'trans_date_trans_time_hour_fraud_freq']
SEED = 4

In [14]:
######## Pipelinet####

pipe = Pipeline([
    
    ("txn_date_decom",DateDecomp(col = DATE_TRANSFORMATION))
    
      
    ,("avg_txn_amount",Aggregator(partition_col=AVG_DAY_AMT_AGGREGATION,date_col=['trans_date_trans_time_caldate'],agg_col="amt",agg_value="sum"))
    
    ,("std_amt_per_txn",AggAmtperTxn(partition_col=STD_YEAR_AMT_AGGREGATION,date_col=['trans_date_trans_time_year'],agg_type='std'))
    
    
    ,("amt_per_txn",AggAmtperTxn(partition_col=AMT_PER_TXN_DAY_AGGREGATION))
    
    
    ,("fraud_freq",FraudFreq(col = FRAUD_FREQ_MAPPER))
    
    
    ,("hash_value",FarmHash(col = FARMHASH_MAPPER))
    
    
   ,("log_transform",LogTransformer(variables=LOG_TRANSFORM  ))
 
   ,('normalizer', SklearnTransformerWrapper(transformer=MinMaxScaler()))
   ,("replace_nan",ReplaceNaN())
   ,("selector",SelectFeatures(COLS_SELECTION))
   ,("estimator",DecisionTreeClassifier(random_state=SEED,class_weight={0:0,1:2}))])


pipe.fit(train,train_target)


predictions = pipe.predict(test)
test_score = recall_score(test_target,predictions)

print(test_score)

  
  
  
  


1.0
