# Transaction.csv Analysis

In [0]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import warnings
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import gc
warnings.filterwarnings('ignore')

In [0]:
transaction_df = pd.read_csv('transaction.csv')
transaction_df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,,gmail.com,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,330.0,87.0,287.0,,outlook.com,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,87.0,,,yahoo.com,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,87.0,,,gmail.com,


In [0]:
transaction_df.shape

(590540, 17)

In [0]:
transaction_df.dtypes

TransactionID       int64
isFraud             int64
TransactionDT       int64
TransactionAmt    float64
ProductCD          object
card1               int64
card2             float64
card3             float64
card4              object
card5             float64
card6              object
addr1             float64
addr2             float64
dist1             float64
dist2             float64
P_emaildomain      object
R_emaildomain      object
dtype: object

In [0]:
transaction_df.describe(include='all')

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain
count,590540.0,590540.0,590540.0,590540.0,590540,590540.0,581607.0,588975.0,588963,586281.0,588969,524834.0,524834.0,238269.0,37627.0,496084,137291
unique,,,,,5,,,,4,,4,,,,,59,60
top,,,,,W,,,,visa,,debit,,,,,gmail.com,gmail.com
freq,,,,,439670,,,,384767,,439938,,,,,228355,57147
mean,3282270.0,0.03499,7372311.0,135.027176,,9898.734658,362.555488,153.194925,,199.278897,,290.733794,86.80063,118.50218,231.855423,,
std,170474.4,0.183755,4617224.0,239.162522,,4901.170153,157.793246,11.336444,,41.244453,,101.741072,2.690623,371.872026,529.053494,,
min,2987000.0,0.0,86400.0,0.251,,1000.0,100.0,100.0,,100.0,,100.0,10.0,0.0,0.0,,
25%,3134635.0,0.0,3027058.0,43.321,,6019.0,214.0,150.0,,166.0,,204.0,87.0,3.0,7.0,,
50%,3282270.0,0.0,7306528.0,68.769,,9678.0,361.0,150.0,,226.0,,299.0,87.0,8.0,37.0,,
75%,3429904.0,0.0,11246620.0,125.0,,14184.0,512.0,150.0,,226.0,,330.0,87.0,24.0,206.0,,


In [0]:
for column in transaction_df.columns:
    num_nans = transaction_df[column].isna().sum()
    print('percentage of NaNs for {}: {:.3f}'.format(
        column, 
        num_nans / len(transaction_df) * 100
    ))

percentage of NaNs for TransactionID: 0.000
percentage of NaNs for isFraud: 0.000
percentage of NaNs for TransactionDT: 0.000
percentage of NaNs for TransactionAmt: 0.000
percentage of NaNs for ProductCD: 0.000
percentage of NaNs for card1: 0.000
percentage of NaNs for card2: 1.513
percentage of NaNs for card3: 0.265
percentage of NaNs for card4: 0.267
percentage of NaNs for card5: 0.721
percentage of NaNs for card6: 0.266
percentage of NaNs for addr1: 11.126
percentage of NaNs for addr2: 11.126
percentage of NaNs for dist1: 59.652
percentage of NaNs for dist2: 93.628
percentage of NaNs for P_emaildomain: 15.995
percentage of NaNs for R_emaildomain: 76.752


TransactionID, TransactionDT, TransactionAmt, ProductCD, card1 has no missing values.

dist2 has high no. of NaNs 

In [0]:
def reduce_mem_usage(df, verbose=True):
    """
    Reduce dataframe size

    params:
    - df: dataframe to reduce the size of

    return:
    - dataframe of reduced size
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] 
#                 'float128']
    start_mem = df.memory_usage().sum() / 1024**2    

    for col in df.columns:
        col_type = df[col].dtypes

        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()

            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float64).min and c_max < np.finfo(np.float64).max:
                    df[col] = df[col].astype(np.float64)
#                 elif c_min > np.finfo(np.float128).min and c_max < np.finfo(np.float128).max:
#                     df[col] = df[col].astype(np.float128)
                    
    end_mem = df.memory_usage().sum() / 1024**2

    if verbose: 
        print(
            'Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, 100 * (start_mem - end_mem) / start_mem
        ))

    return df

In [0]:
transaction_df = reduce_mem_usage(transaction_df)

Mem. usage decreased to 37.73 Mb (50.7% reduction)


# Data Preprocessing

In [0]:
# Label Encoding
# https://www.kaggle.com/kabure/extensive-eda-and-modeling-xgb-hyperopt
for f in transaction_df.drop('isFraud', axis=1).columns:
    if transaction_df[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(transaction_df[f].values))
        transaction_df[f] = lbl.transform(list(transaction_df[f].values)) 

In [0]:
y_df = transaction_df['isFraud']
x_df = transaction_df.drop(["isFraud", "dist2"], axis=1)

In [0]:
x_train, x_test, y_train, y_test = train_test_split(x_df, 
                                                    y_df, 
                                                    test_size=0.2, 
                                                    random_state=0)

In [0]:
lgb_params = {
        "objective" : "binary",
        "metric" : "AUC",
        "max_depth" : 2,
        "num_leaves" : 2,
        "learning_rate" : 0.055,
        "bagging_fraction" : 0.3,
        "feature_fraction" : 0.15,
        "lambda_l1" : 5,
        "lambda_l2" : 5,
        "verbosity" : 1
    }

In [0]:
lgb_train = lgb.Dataset(x_train, label=y_train)
model = lgb.train(lgb_params, lgb_train, 100)
y_pred = model.predict(x_test)

In [0]:
roc_auc_score(y_test, y_pred)

0.7448983361665535