In [1]:
import numpy
import pandas
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as df
#from dask_ml.impute import SimpleImputer
#from dask_ml.xgboost import XGBClassifier
#from dask_ml.preprocessing import LabelEncoder
#from dask_ml.model_selection import KFold
#from dask_ml.linear_model import LogisticRegression
from dask.distributed import Client, LocalCluster
from dask import compute
from xgboost import XGBClassifier
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import recall_score, precision_score, f1_score, precision_recall_curve, auc, roc_curve, roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFECV
from skopt import BayesSearchCV
from scipy.stats import ttest_ind, ttest_rel
from imblearn.over_sampling import SMOTENC

from collections import namedtuple, Counter
from recordtype import recordtype
import pickle

sns.set(style="darkgrid")
numpy.set_printoptions(suppress=True)


In [2]:
SEED = 0
JOBS = 10

In [3]:
cluster = LocalCluster(n_workers=JOBS)
client = Client(cluster)

# Data Cleaning

In [4]:
train_transaction = df.read_csv('./data/train_transaction.csv').set_index('TransactionID')
train_identity = df.read_csv('./data/train_identity.csv').set_index('TransactionID')

In [5]:
train_transaction.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 393 entries, isFraud to V339
dtypes: object(14), float64(376), int64(3)

In [6]:
train_identity.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 40 entries, id_01 to DeviceInfo
dtypes: object(17), float64(23)

In [7]:
X_train = train_transaction.join(train_identity, how='left')

In [14]:
X_train.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 433 entries, isFraud to DeviceInfo
dtypes: object(31), float64(399), int64(3)

In [9]:
X_train.to_csv('./data/X_train/*.csv', index=True, header=True)

['/home/vladimir/PycharmProjects/IEEE-CISFraudDetection/data/X_train/00.csv',
 '/home/vladimir/PycharmProjects/IEEE-CISFraudDetection/data/X_train/01.csv',
 '/home/vladimir/PycharmProjects/IEEE-CISFraudDetection/data/X_train/02.csv',
 '/home/vladimir/PycharmProjects/IEEE-CISFraudDetection/data/X_train/03.csv',
 '/home/vladimir/PycharmProjects/IEEE-CISFraudDetection/data/X_train/04.csv',
 '/home/vladimir/PycharmProjects/IEEE-CISFraudDetection/data/X_train/05.csv',
 '/home/vladimir/PycharmProjects/IEEE-CISFraudDetection/data/X_train/06.csv',
 '/home/vladimir/PycharmProjects/IEEE-CISFraudDetection/data/X_train/07.csv',
 '/home/vladimir/PycharmProjects/IEEE-CISFraudDetection/data/X_train/08.csv',
 '/home/vladimir/PycharmProjects/IEEE-CISFraudDetection/data/X_train/09.csv',
 '/home/vladimir/PycharmProjects/IEEE-CISFraudDetection/data/X_train/10.csv',
 '/home/vladimir/PycharmProjects/IEEE-CISFraudDetection/data/X_train/11.csv',
 '/home/vladimir/PycharmProjects/IEEE-CISFraudDetection/data/X_t

In [11]:
test_transaction = df.read_csv('./data/test_transaction.csv').set_index('TransactionID')
test_identity = df.read_csv('./data/test_identity.csv').set_index('TransactionID')

In [12]:
test_transaction.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 392 entries, TransactionDT to V339
dtypes: object(14), float64(376), int64(2)

In [13]:
test_identity.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 40 entries, id-01 to DeviceInfo
dtypes: object(17), float64(23)

In [15]:
X_test = test_transaction.join(test_identity, how='left')

In [16]:
X_test.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 432 entries, TransactionDT to DeviceInfo
dtypes: object(31), float64(399), int64(2)

In [18]:
[column for column in X_test.columns if column not in X_train.columns]

['id-01',
 'id-02',
 'id-03',
 'id-04',
 'id-05',
 'id-06',
 'id-07',
 'id-08',
 'id-09',
 'id-10',
 'id-11',
 'id-12',
 'id-13',
 'id-14',
 'id-15',
 'id-16',
 'id-17',
 'id-18',
 'id-19',
 'id-20',
 'id-21',
 'id-22',
 'id-23',
 'id-24',
 'id-25',
 'id-26',
 'id-27',
 'id-28',
 'id-29',
 'id-30',
 'id-31',
 'id-32',
 'id-33',
 'id-34',
 'id-35',
 'id-36',
 'id-37',
 'id-38']

In [21]:
X_test.to_csv('./data/X_test/*.csv', index=True, header=X_train.columns.drop('isFraud'))

['/home/vladimir/PycharmProjects/IEEE-CISFraudDetection/data/X_test/00.csv',
 '/home/vladimir/PycharmProjects/IEEE-CISFraudDetection/data/X_test/01.csv',
 '/home/vladimir/PycharmProjects/IEEE-CISFraudDetection/data/X_test/02.csv',
 '/home/vladimir/PycharmProjects/IEEE-CISFraudDetection/data/X_test/03.csv',
 '/home/vladimir/PycharmProjects/IEEE-CISFraudDetection/data/X_test/04.csv',
 '/home/vladimir/PycharmProjects/IEEE-CISFraudDetection/data/X_test/05.csv',
 '/home/vladimir/PycharmProjects/IEEE-CISFraudDetection/data/X_test/06.csv',
 '/home/vladimir/PycharmProjects/IEEE-CISFraudDetection/data/X_test/07.csv',
 '/home/vladimir/PycharmProjects/IEEE-CISFraudDetection/data/X_test/08.csv',
 '/home/vladimir/PycharmProjects/IEEE-CISFraudDetection/data/X_test/09.csv',
 '/home/vladimir/PycharmProjects/IEEE-CISFraudDetection/data/X_test/10.csv']

In [4]:
types={'DeviceInfo': 'object',
       'DeviceType': 'object',
       'R_emaildomain': 'object',
       'id_12': 'object',
       'id_15': 'object',
       'id_16': 'object',
       'id_28': 'object',
       'id_29': 'object',
       'id_31': 'object',
       'id_35': 'object',
       'id_36': 'object',
       'id_37': 'object',
       'id_38': 'object',
       'id_23': 'object',
       'id_27': 'object',
       'id_30': 'object',
       'id_33': 'object',
       'id_34': 'object'}

In [5]:
X_train = df.read_csv('./data/X_train/*.csv', dtype=types).set_index('TransactionID')

In [6]:
X_train['isFraud'].value_counts().compute()

0    569877
1     20663
Name: isFraud, dtype: int64

In [7]:
X_train_pos = X_train.loc[X_train['isFraud']==1]
X_train_neg = X_train.loc[X_train['isFraud']==0].sample(frac=0.05, random_state=SEED)

In [8]:
X_train_sample = df.concat([X_train_pos, X_train_neg])

In [9]:
X_train_sample = X_train_sample.compute()

In [None]:
X_train_sample.to_csv('./data/X_train_sample.csv') 