In [None]:
# Binary classification of fraudulent transactions (Kaggle ieee-fraud-detection): neural-network baseline
from pandas import read_csv, read_pickle
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Concatenate, Input, Dense, Embedding, Flatten, Dropout, BatchNormalization, SpatialDropout1D
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import  Adam
import tensorflow.keras.backend as k
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
import tqdm
from sklearn.metrics import roc_auc_score, roc_curve
import tensorflow as tf
import numpy as np
from copy import deepcopy
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def strLower(s):
    """Convert ``s`` to ``str`` and return it in lowercase."""
    text = str(s)
    return text.lower()

In [None]:
# Read the prepared train/test frames, the submission template and the
# feature-importance table from the input/ directory.
# NOTE(review): read_pickle unpickles the files — only load pickles you trust.
train, test = (
    read_pickle(path)
    for path in ("input/train_prepared.pickle", "input/test_prepared.pickle")
)
features = read_csv("input/feature_importances0.csv", index_col=0)
sample_submission = read_csv(
    './input/sample_submission.csv',
    index_col='TransactionID',
)

In [None]:
# Same preparation for both frames: order rows by TransactionID, make it the
# index, then discard the TransactionDT and 'Unnamed: 0' columns.
train = (
    train
    .sort_values(by=['TransactionID'])
    .set_index('TransactionID')
    .drop(['TransactionDT'], axis=1)
    .drop(['Unnamed: 0'], axis=1)
)

test = (
    test
    .sort_values(by=['TransactionID'])
    .set_index('TransactionID')
    .drop(['TransactionDT'], axis=1)
    .drop(['Unnamed: 0'], axis=1)
)

In [None]:
# Add a 'numNaN' column holding, per row, the count of missing fields.
for frame in (train, test):
    frame['numNaN'] = frame.isna().sum(axis=1)

In [None]:
# Lowercase every value in the object-typed columns of train and apply the
# same change to the matching column of test. strLower stringifies first, so
# missing values come out as the text "nan".
for col, dtype in train.dtypes.items():
    if "object" not in str(dtype):
        continue
    train[col] = train[col].apply(strLower)
    test[col] = test[col].apply(strLower)

In [None]:
#train=train.drop(['ProductCD', 'addr1', 'M2'],axis=1)
#test=test.drop(['ProductCD', 'addr1', 'M2'],axis=1)

# Partition the feature columns into continuous and discrete groups.
# Whole column families are collected by name prefix with DataFrame.filter.
# Original author note: browser + browser version tgt and OS and OS version
continuous = ['TransactionAmt', 'dist1', 'numNaN']
for pattern in ('^C[0-9]', '^D[0-9]', '^V'):
    continuous += list(train.filter(regex=pattern))

discrete = [
    'ProductCD', 'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain',
    'DeviceType', 'DeviceInfo', 'OS_id_30', 'version_id_30', 'browser',
    'b_version',
]
for pattern in ('^card', '^M', '^id_'):
    discrete += list(train.filter(regex=pattern))

# Whitelist of 50 feature names; the continuous/discrete lists are restricted
# to these right after this block. Presumably taken from a feature-importance
# ranking — TODO confirm the source.
top_50 = ['card1', 'card2', 'addr1', 'TransactionAmt', 'D15', 'dist1', 'D4', 'D2', 'D10', 'card5', 'D11', 'C13', 'D1', 'id_02', 'P_emaildomain', 'D5', 'id_20', 'D3', 'id_19', 'C1', 'D8', 'C2', 'b_version', 'V307', 'V310', 'C14', 'C6', 'C11', 'V313', 'C9', 'V127', 'id_13', 'V130', 'D9', 'id_06', 'V315', 'V314', 'M4', 'V308', 'R_emaildomain', 'DeviceInfo', 'id_05', 'M5', 'V312', 'C5', 'card4', 'id_33', 'id_01', 'M6', 'V317']

# NOTE(review): neither `dropped` nor `lgbm_feature` is read by any other cell
# visible in this notebook; they look like leftovers from feature-selection
# experiments — confirm before removing.
dropped = ['TransactionAmt', 'dist1', 'C1', 'C5', 'C7', 'C9', 'C13', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V12', 'V13', 'V15', 'V17', 'V19', 'V20', 'V29', 'V31', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V44', 'V45', 'V46', 'V47', 'V48', 'V50', 'V51', 'V53', 'V54', 'V56', 'V57', 'V59', 'V61', 'V62', 'V63', 'V64', 'V69', 'V71', 'V73', 'V75', 'V76', 'V78', 'V79', 'V80', 'V82', 'V83', 'V84', 'V85', 'V87', 'V90', 'V92', 'V95', 'V96', 'V99', 'V100', 'V130', 'V131', 'V138', 'V139', 'V140', 'V141', 'V142', 'V144', 'V146', 'V147', 'V148', 'V151', 'V152', 'V157', 'V161', 'V169', 'V170', 'V171', 'V172', 'V173', 'V174', 'V175', 'V176', 'V180', 'V181', 'V183', 'V184', 'V185', 'V186', 'V187', 'V188', 'V189', 'V190', 'V191', 'V194', 'V195', 'V197', 'V198', 'V200', 'V201', 'V205', 'V206', 'V208', 'V214', 'V217', 'V218', 'V220', 'V221', 'V223', 'V224', 'V226', 'V227', 'V228', 'V229', 'V230', 'V234', 'V235', 'V236', 'V238', 'V239', 'V240', 'V241', 'V242', 'V243', 'V245', 'V246', 'V247', 'V248', 'V250', 'V252', 'V255', 'V257', 'V258', 'V260', 'V261', 'V262', 'V263', 'V266', 'V267', 'V270', 'V276', 'V282', 'V283', 'V285', 'V287', 'V288', 'V289', 'V291', 'V294', 'V302', 'V303', 'V310', 'V312', 'V313', 'V314', 'V325', 'V326', 'V327', 'V328', 'V329', 'V334']
# Values of the `feature` column of the feature-importance CSV loaded earlier.
lgbm_feature = list(features.feature)

# Keep only the features that appear in top_50, preserving list order.
continuous = list(filter(lambda name: name in top_50, continuous))
discrete = list(filter(lambda name: name in top_50, discrete))

In [None]:
# Display the discrete feature names that remain after the top_50 filter.
discrete

In [None]:
%%time
def fill_nan(df):
    """Fill missing values in ``df`` in place and return the same frame.

    Columns named in the module-level ``continuous`` list get NaN replaced
    by 0. Columns named in ``discrete`` get both the string "nan" and real
    NaN replaced by the string "other". Any other column is left untouched.
    """
    for name in list(df.columns.values):
        if name in continuous:
            df[name] = df[name].fillna(0)
            continue
        if name in discrete:
            df[name] = df[name].replace("nan", "other").replace(np.nan, "other")
    return df

# Fill missing values in both frames (train first, then test).
train, test = (fill_nan(frame) for frame in (train, test))

In [None]:
# Integer-encode each discrete column with one LabelEncoder fitted on the
# train and test values together, so both frames share the same codes.
# label_counts[col] stores the number of classes plus one; create_model reads
# it as the Embedding input size.
label_counts = {}
for col in tqdm.tqdm(train.columns):
    if col not in discrete:
        continue
    le = LabelEncoder()
    train_values = list(train[col].values)
    test_values = list(test[col].values)
    le.fit(train_values + test_values)
    train[col] = le.transform(train_values)
    test[col] = le.transform(test_values)
    label_counts[col] = len(le.classes_) + 1

In [None]:
# Standardise every continuous column using statistics computed over the
# train and test values together.
for col in continuous:
    # Non-negative columns whose train maximum exceeds 100 are log1p-compressed
    # before scaling.
    if train[col].min() >= 0 and train[col].max() > 100:
        train[col] = np.log1p(train[col])
        test[col] = np.log1p(test[col])
    train_column = train[col].values.reshape(-1, 1)
    test_column = test[col].values.reshape(-1, 1)
    scaler = StandardScaler()
    scaler.fit(np.concatenate([train_column, test_column]))
    train[col] = scaler.transform(train_column)
    test[col] = scaler.transform(test_column)

In [None]:
# Name of the label column: read as y for training/validation and used as the
# prediction column of the submission file.
target = 'isFraud'

In [None]:
# Hold out the last 20% of the rows (no shuffling) as a validation set.
# NOTE: the cross-validation loop rebinds train_set / valid_set for each fold.
train_set, valid_set = train_test_split(train, test_size = 0.2, random_state = 4041, shuffle = False)
# 5-fold stratified splitter. The class imported above is StratifiedKFold
# (the former "StratifiedKFolds" is not a defined name). random_state is left
# out because it only applies when shuffle=True; scikit-learn raises a
# ValueError when it is set together with shuffle=False.
folds = StratifiedKFold(n_splits=5, shuffle=False)

In [None]:
def create_model():
    """Build and compile the embedding + dense network used for classification.

    The architecture is driven by module-level state:

    * one single-integer ``Input`` per name in ``discrete`` (the input is
      named after the column), each feeding an ``Embedding`` with
      ``label_counts[name]`` rows and ``int(log1p(label_counts[name]) + 1)``
      output dimensions;
    * one ``Input`` named ``'continuous'`` whose width is the number of
      columns in ``train[continuous]``.

    The input names match the dictionary keys produced by
    ``get_input_features``. Embeddings pass through ``SpatialDropout1D(0.1)``
    and ``Flatten``, the numeric input through ``Dropout(0.1)``; both parts
    are concatenated and fed to
    ``Dense(200, relu) -> Dropout(0.2) -> Dense(100, relu) -> Dropout(0.2)
    -> Dense(1, sigmoid)``.

    The Keras backend session is cleared first, so calling this repeatedly
    (for example once per fold) does not pile up graph state.

    Returns:
        A ``Model`` compiled with Adam (learning rate 0.01) and binary
        cross-entropy loss.
    """
    k.clear_session()

    # One scalar input per discrete column.
    categorical_inputs = [Input(shape=[1], name=cat) for cat in discrete]

    # Connect every discrete input to its own embedding table.
    categorical_embeddings = [
        Embedding(
            label_counts[cat],
            int(np.log1p(label_counts[cat]) + 1),
            name=cat + "_embed",
        )(cat_input)
        for cat, cat_input in zip(discrete, categorical_inputs)
    ]

    categorical_logits = Concatenate(name="categorical_conc")(
        [Flatten()(SpatialDropout1D(.1)(cat_emb)) for cat_emb in categorical_embeddings]
    )

    # All continuous columns enter through a single dense input.
    numerical_inputs = Input(shape=[train[continuous].shape[1]], name='continuous')
    numerical_logits = Dropout(.1)(numerical_inputs)

    # Join the categorical and numerical parts into one layer.
    x = Concatenate()([categorical_logits, numerical_logits])

    # Hidden layers, then a single sigmoid output node.
    x = Dense(200, activation='relu')(x)
    x = Dropout(.2)(x)
    x = Dense(100, activation='relu')(x)
    x = Dropout(.2)(x)
    out = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=categorical_inputs + [numerical_inputs], outputs=out)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(optimizer=Adam(learning_rate=0.01), loss="binary_crossentropy")
    return model

In [None]:
# Independent copies of the two feature-name lists.
discrete_backup, continuous_backup = discrete[:], continuous[:]

In [None]:
def get_input_features(df):
    """Turn ``df`` into the dictionary of arrays that the model takes as input.

    The ``'continuous'`` key holds all columns listed in the module-level
    ``continuous`` list as one array; every name in ``discrete`` gets its own
    key holding that single column as an array.
    """
    return {
        'continuous': np.array(df[continuous]),
        **{cat: np.array(df[cat]) for cat in discrete},
    }

In [None]:
# Cross-validated training. For each fold a fresh model is trained one epoch
# at a time and scored by validation ROC AUC; the weights are checkpointed
# whenever the score improves, and training stops after 3 epochs in a row
# without improvement (at most 100 epochs).
# NOTE: best_score is reset per fold and every fold writes to the same
# "best_model.h5", so the file ends up holding the last fold's best weights.
X_test = get_input_features(test)
# split() stratifies on the label values, so pass the column itself rather
# than its name.
for fold, (trn_idx, test_idx) in enumerate(folds.split(train, train[target])):
    train_set = train.iloc[trn_idx]
    valid_set = train.iloc[test_idx]
    X_train = get_input_features(train_set)
    X_valid = get_input_features(valid_set)
    y_train = train_set[target]
    y_valid = valid_set[target]

    model = create_model()

    best_score = 0
    patience = 0

    for i in range(100):
        hist = model.fit(X_train, y_train, validation_data = (X_valid,y_valid), batch_size = 8000, epochs = 1, verbose = 1)
        valid_preds = model.predict(X_valid, batch_size =  8000, verbose = True)
        score = roc_auc_score(y_valid, valid_preds)
        print(score)
        if score > best_score:
            model.save_weights("best_model.h5")
            best_score = score
            patience = 0
        else:
            patience += 1
            if patience >= 3:
                # Early stop: leave the loop instead of idling through the
                # remaining iterations.
                break

In [None]:
# Fresh model instance with the same architecture, ready to receive the
# checkpointed weights.
model = create_model()

In [None]:
# Restore the weights written by the training loop (saved whenever the
# validation AUC improved).
model.load_weights("best_model.h5")

In [None]:
# Rebuild the model inputs, then report the restored model's ROC AUC on the
# current validation split (valid_set / y_valid as left by the training loop).
X_test = get_input_features(test)
X_valid = get_input_features(valid_set)
valid_preds = model.predict(X_valid, batch_size=500, verbose=True)
score = roc_auc_score(y_valid, valid_preds)
print(score)

In [None]:
# Train for 3 more epochs using only the validation rows (X_valid / y_valid)
# before predicting on the test set.
hist = model.fit(X_valid,y_valid, batch_size = 8000, epochs = 3, verbose = 1)

In [None]:
# Sigmoid outputs of the model for the test rows.
predictions = model.predict(X_test, batch_size = 2000, verbose = True)

In [None]:
# Write the predictions into the submission template and save it.
# NOTE(review): the assignment is positional — it assumes sample_submission
# rows are in the same order as test (which was sorted by TransactionID
# earlier). TODO confirm.
sample_submission[target] = predictions
sample_submission.to_csv('prediction.csv')

In [None]:
!kaggle competitions submit -c ieee-fraud-detection -f prediction.csv -m "Message"

In [None]:
# ROC curve of the validation predictions, saved to roc.png.
fpr, tpr, _ = roc_curve(y_valid, valid_preds)
plt.plot(fpr, tpr, marker='.', label='nn prediction')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve (validation set)')
# The label passed to plot() is only rendered once a legend is drawn.
plt.legend()
plt.savefig("roc.png")

In [None]:
# Display the continuous feature names that feed the model's 'continuous' input.
continuous
