In [1]:
import random
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit, train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.calibration import CalibratedClassifierCV
import warnings
warnings.filterwarnings("ignore")



In [2]:
train_transaction=pd.read_csv('../input/ieee-fraud-detection/train_transaction.csv')
train_identity=pd.read_csv('../input/ieee-fraud-detection/train_identity.csv')
test_transaction=pd.read_csv('../input/ieee-fraud-detection/test_transaction.csv')
test_identity=pd.read_csv('../input/ieee-fraud-detection/test_identity.csv')
sample_submission=pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')

In [3]:
# Merging the transaction and identity dataset

train_data = train_transaction.merge(train_identity, how='left', on='TransactionID')
del train_transaction, train_identity

test_data = test_transaction.merge(test_identity, how='left', on='TransactionID')
del test_transaction, test_identity

In [4]:
# Sorting the Dataframe by the TransactionDT column

train_data.sort_values(by='TransactionDT' ,inplace=True)
test_data.sort_values(by='TransactionDT' , inplace=True)

In [5]:
# Saving the merged train and test dataset into pickle format

train_data.to_pickle('merged_train.pkl')
test_data.to_pickle('merged_test.pkl')

del train_data, test_data

In [6]:
# Loading the merged train and test dataset

train_data = pd.read_pickle('merged_train.pkl')
test_data = pd.read_pickle('merged_test.pkl')

In [7]:
print("*"*45)
print("\n Train Data Shape : {} \n".format(train_data.shape))
print("\n Test Data Shape : {} \n".format(test_data.shape))
print("*"*45)

*********************************************

 Train Data Shape : (590540, 434) 


 Test Data Shape : (506691, 433) 

*********************************************


## Utility Functions
<br>

In [8]:
def cat_num_features(df):
    
    '''
        Utility Function to get the names of Categorical Features and 
        Numerical Features of the given Dataset.
    '''
    
    catf = []
    numf = []
    
    # Given Categorical Features 
    catf = ['ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', \
            'card6', 'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', \
            'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', \
            'DeviceType', 'DeviceInfo']
    catf+=['id_'+str(i) for i in range(12,39)]


    # Updating the Categorical Feature Names List based on the columns present in the dataframe
    catf = [feature for feature in catf if feature in df.columns.values]
    numf = [feature for feature in df.columns if feature not in catf and not feature == 'isFraud']
    
    return (catf, numf)  

In [9]:
def label_encode(X_train, X_test, catf):
  
  '''
    Utility Function to Encode Categorical Features.
  '''

  for f in catf:
    
    X_train[f] = X_train[f].astype(str)
    X_test[f] = X_test[f].astype(str)
    
    le = LabelEncoder()
    le.fit(X_train[f])
    mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    X_train[f] = le.transform(X_train[f])
    
    # Manually Encoding the CV and Test Dataset so as to avoid error for any category which is not present in train set
    
    # All the categories which are not present in train datset are encoded as -1
    
    X_test[f] = [-1 if mapping.get(v, -1)==-1 else mapping[v] for v in X_test[f].values ]

  return (X_train, X_test)

In [10]:
def normalize(X_train, X_test):
    '''
        Utility Function to scale the values of the Train, CV and Test Datasets between 0 and 1.
    '''
    
    for f in X_train.columns:

        min_val = X_train[f].min()
        max_val = X_train[f].max()
        
        X_train[f] = (X_train[f]-min_val)/(max_val-min_val)
        X_test[f] = (X_test[f]-min_val)/(max_val-min_val)
        
    return (X_train, X_test)

In [11]:
def predict_and_save(prediction, name):
    
    '''
        Utility Function to save the test data predictions locally.
    '''

    df = pd.DataFrame({'TransactionID':test_ids.reshape(-1), 'isFraud':prediction.reshape(-1)})
    df = df.sort_values('TransactionID')
    df.to_csv(name, index=False)

## Data Preparation
<br>

### Splitting the Dataset
<br>

In [12]:
X_train = train_data.drop(['isFraud', 'TransactionID'], axis=1)
y_train = train_data['isFraud']

X_test = test_data.drop(['TransactionID'], axis=1)
test_ids = test_data['TransactionID'].values

del train_data, test_data

In [13]:
print("*"*45)
print("\n Train Data Shape : {} \n".format(X_train.shape))
print("\n Test Data Shape : {} \n".format(X_test.shape))
print("*"*45)

*********************************************

 Train Data Shape : (590540, 432) 


 Test Data Shape : (506691, 432) 

*********************************************


#### Label Encoding Categorical Features

We will be creating **three sets**, one having imputed missing values to be used with models which do not handle missing values on their own and the other whose missing values will be imputed and also it will be scaled between 0 and 1 to be used with models like **Naive Bayes and Logisitic Regression** and the third which is to be used with model like **XgBoost** which can handle missing values on its own.

In [14]:
# Check columns in X_train and X_test
print("Columns in X_train:", X_train.columns)
print("Columns in X_test:", X_test.columns)

# Align columns in X_train and X_test
common_columns = list(set(X_train.columns) & set(X_test.columns))
X_train = X_train[common_columns]
X_test = X_test[common_columns]

# Storing Categorical and Numerical Feature Names
catf, numf = cat_num_features(X_train)
categorical_feature_indices = [X_train.columns.get_loc(f) for f in catf]

Columns in X_train: Index(['TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2',
       'card3', 'card4', 'card5', 'card6', 'addr1',
       ...
       'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38',
       'DeviceType', 'DeviceInfo'],
      dtype='object', length=432)
Columns in X_test: Index(['TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2',
       'card3', 'card4', 'card5', 'card6', 'addr1',
       ...
       'id-31', 'id-32', 'id-33', 'id-34', 'id-35', 'id-36', 'id-37', 'id-38',
       'DeviceType', 'DeviceInfo'],
      dtype='object', length=432)


In [15]:
# Impute missing values and label encode categorical features

# Imputing the missing values of Categorical Columns with "missing"
X_train[catf] = X_train[catf].fillna('missing')
X_test[catf] = X_test[catf].fillna('missing')

# Label Encoding Categorical Features
X_train, X_test = label_encode(X_train, X_test, catf)

In [16]:
# Set1 (Imputed and Normalized)
X_train1 = X_train.fillna(-999)
X_test1 = X_test.fillna(-999)
X_train1, X_test1 = normalize(X_train1, X_test1)

# Set2 (Imputed)
X_train2 = X_train.fillna(-999)
X_test2 = X_test.fillna(-999)

# Set3 (Raw)
X_train3 = X_train
X_test3 = X_test

del X_train, X_test

In [17]:
train1 = X_train1
train1['isFraud'] = y_train

test1 = X_test1

del X_train1, X_test1


train2 = X_train2
train2['isFraud'] = y_train

test2 = X_test2

del X_train2, X_test2


train3 = X_train3
train3['isFraud'] = y_train

test3 = X_test3

del X_train3, y_train, X_test3

In [18]:
y_train = train1.pop('isFraud')
_ = train2.pop('isFraud')
_ = train3.pop('isFraud')


X_train1 = train1
X_test1 = test1 

X_train2 = train2
X_test2 = test2 

X_train3 = train3
X_test3 = test3 

del train1, train2, train3

In [19]:
print("*"*60)
print("\n Train Dataset Set1 Shape : {} \n".format(X_train1.shape))
print("\n Test Dataset Set1 Shape : {} \n".format(X_test1.shape))
print("*"*60)
print("\n Train Dataset Set1 Shape : {} \n".format(X_train1.shape))
print("\n Test Dataset Set2 Shape : {} \n".format(X_test2.shape))
print("*"*60)
print("\n Train Dataset Set1 Shape : {} \n".format(X_train1.shape))
print("\n Test Dataset Set3 Shape : {} \n".format(X_test3.shape))
print("*"*60)

************************************************************

 Train Dataset Set1 Shape : (590540, 394) 


 Test Dataset Set1 Shape : (506691, 394) 

************************************************************

 Train Dataset Set1 Shape : (590540, 394) 


 Test Dataset Set2 Shape : (506691, 394) 

************************************************************

 Train Dataset Set1 Shape : (590540, 394) 


 Test Dataset Set3 Shape : (506691, 394) 

************************************************************


## Base Line Models

### 1. Naive Bayes

In [20]:
nb = GaussianNB(priors=[0.5,0.5]) 
nb.fit(X_train1, y_train)

In [21]:
nb_test_proba = nb.predict_proba(X_test1)[:,1]

In [22]:
predict_and_save(nb_test_proba, 'nb_pred_bl.csv')

### 2. Logistic Regression

In [23]:
lr = LogisticRegression(n_jobs = -1, class_weight = 'balanced', random_state = 3, solver='lbfgs', max_iter=3000) 
lr.fit(X_train1, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
lr_test_proba = lr.predict_proba(X_test1)[:,1]

In [25]:
predict_and_save(lr_test_proba, 'lr_pred_bl.csv')

### 3. Decision Tree

In [26]:
dt =  DecisionTreeClassifier(random_state=3, class_weight='balanced')
dt.fit(X_train2, y_train)

In [27]:
dt_test_proba = dt.predict_proba(X_test2)[:,1]

In [28]:
predict_and_save(dt_test_proba, 'dt_pred_bl.csv')

### 4. Random Forest

In [29]:
rf =  RandomForestClassifier(n_jobs = -1, class_weight = 'balanced', random_state = 3)
rf.fit(X_train2, y_train)

In [30]:
rf_test_proba = rf.predict_proba(X_test2)[:,1]

In [31]:
predict_and_save(rf_test_proba, 'rf_pred_bl.csv')

### 5. Adaptive Boosting

In [32]:
ab =  AdaBoostClassifier(random_state = 3)
ab.fit(X_train2, y_train)

In [33]:
ab_test_proba = ab.predict_proba(X_test2)[:,1]

In [34]:
predict_and_save(ab_test_proba, 'ab_pred_bl.csv')

### 6. Gradient Boosted Decision Tree (XGBoost)

In [35]:
weight = y_train.value_counts()[0]/y_train.value_counts()[1]

In [36]:
xgboost =  XGBClassifier(scale_pos_weight = weight, objective='binary:logistic', eval_metric = 'auc', random_state = 3, tree_method = 'gpu_hist', gpu_id=0)
xgboost.fit(X_train3, y_train)

In [37]:
xgboost_test_proba = xgboost.predict_proba(X_test3)[:,1]

In [38]:
predict_and_save(xgboost_test_proba, 'xgboost_pred_bl.csv')

## Conclusion

From the above results, it is quite clear that the Decision Tree based Ensemble is working best for our problem. Hence, from now on we will be using Tree based Ensemble only and will try to optimize the data and models to get more better score.