In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [3]:
train_tr = pd.read_csv("train_transaction.csv")
test_tr = pd.read_csv("test_transaction.csv")
train_id = pd.read_csv("train_identity.csv")
test_id = pd.read_csv("test_identity.csv")
sample_submission_file = pd.read_csv("sample_submission.csv")

In [4]:
missing_values_count = train_tr.isnull().sum()
missing_values_count

TransactionID          0
isFraud                0
TransactionDT          0
TransactionAmt         0
ProductCD              0
                   ...  
V335              508189
V336              508189
V337              508189
V338              508189
V339              508189
Length: 394, dtype: int64

### reduce memory

In [5]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [6]:
train_tr = reduce_mem_usage(train_tr)
train_id = reduce_mem_usage(train_id)
test_tr = reduce_mem_usage(test_tr)
test_id= reduce_mem_usage(test_id)

Mem. usage decreased to 542.35 Mb (69.4% reduction)
Mem. usage decreased to 25.86 Mb (42.7% reduction)
Mem. usage decreased to 472.59 Mb (68.9% reduction)
Mem. usage decreased to 25.44 Mb (42.7% reduction)


In [7]:
#merge
train = train_tr.merge(train_id, how="left", on="TransactionID")
test = test_tr.merge(test_id, how="left", on="TransactionID")

del train_tr, train_id, test_tr, test_id

In [8]:
def get_missing_values(data):
    total = data.isnull().sum().sort_values(ascending=False)
    percent = (data.isnull().sum() / data.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['total', 'percent'])
    return missing_data

In [9]:
missing_train = get_missing_values(train)
missing_train.head(100).T

Unnamed: 0,id_24,id_25,id_07,id_08,id_21,id_26,id_27,id_23,id_22,dist2,...,V266,V267,V268,V269,V273,V274,V275,V276,V265,V260
total,585793.0,585408.0,585385.0,585385.0,585381.0,585377.0,585371.0,585371.0,585371.0,552913.0,...,460110.0,460110.0,460110.0,460110.0,460110.0,460110.0,460110.0,460110.0,460110.0,460110.0
percent,0.991962,0.99131,0.991271,0.991271,0.991264,0.991257,0.991247,0.991247,0.991247,0.936284,...,0.779134,0.779134,0.779134,0.779134,0.779134,0.779134,0.779134,0.779134,0.779134,0.779134


In [10]:
def top_missing_cols(df,n=45,thresh=86.4):

    
    dff = (df.isnull().sum()/df.shape[0])*100
    dff = dff.reset_index()
    dff.columns = ['col','missing_percent']
    dff = dff.sort_values(by=['missing_percent'],ascending=False).reset_index(drop=True)
    print(f'There are {df.isnull().any().sum()} columns in this dataset with missing values.')
    print(f'There are {dff[dff["missing_percent"] > thresh].shape[0]} columns with missing percent values than {thresh}%')
    if n:
        return dff.head(n)
    else:
        return dff

top_missing_cols(train, thresh=86.4)

There are 414 columns in this dataset with missing values.
There are 27 columns with missing percent values than 86.4%


Unnamed: 0,col,missing_percent
0,id_24,99.196159
1,id_25,99.130965
2,id_07,99.12707
3,id_08,99.12707
4,id_21,99.126393
5,id_26,99.125715
6,id_27,99.124699
7,id_23,99.124699
8,id_22,99.124699
9,dist2,93.628374


In [11]:
A = (np.sum(pd.isnull(train)).sort_values(ascending=False)/len(train))*100
Removed_col = A[A>0.85].index
train.drop(columns=Removed_col, inplace=True)

In [12]:
A = (np.sum(pd.isnull(test)).sort_values(ascending=False)/len(test))*100
Removed_col = A[A>0.85].index
test.drop(columns=Removed_col, inplace=True)

In [13]:
test.shape

(506691, 96)

In [14]:
train.shape

(590540, 111)

In [15]:
train.fillna(train.mode())

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card3,card4,card5,card6,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,2987000,0,86400,68.500000,W,13926,150.0,discover,142.0,credit,...,0.0,0.0000,0.0000,0.0000,0.0,117.0,0.0,0.000000,0.000000,0.000000
1,2987001,0,86401,29.000000,W,2755,150.0,mastercard,102.0,credit,...,0.0,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.000000,0.000000,0.000000
2,2987002,0,86469,59.000000,W,4663,150.0,visa,166.0,debit,...,0.0,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.000000,0.000000,0.000000
3,2987003,0,86499,50.000000,W,18132,150.0,mastercard,117.0,debit,...,135.0,0.0000,0.0000,0.0000,50.0,1404.0,790.0,0.000000,0.000000,0.000000
4,2987004,0,86506,50.000000,H,4497,150.0,mastercard,102.0,credit,...,0.0,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,15811047,49.000000,W,6550,150.0,visa,226.0,debit,...,0.0,47.9375,47.9375,47.9375,0.0,0.0,0.0,0.000000,0.000000,0.000000
590536,3577536,0,15811049,39.500000,W,10444,150.0,mastercard,224.0,debit,...,0.0,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.000000,0.000000,0.000000
590537,3577537,0,15811079,30.953125,W,12037,150.0,mastercard,224.0,debit,...,0.0,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.000000,0.000000,0.000000
590538,3577538,0,15811088,117.000000,W,7826,150.0,mastercard,224.0,debit,...,117.0,317.5000,669.5000,317.5000,0.0,2234.0,0.0,0.000000,0.000000,0.000000


In [16]:
test.fillna(test.mode())

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card3,card4,card6,C1,C2,...,V309,V310,V311,V312,V316,V317,V318,V319,V320,V321
0,3663549,18403224,31.953125,W,10409,150.0,visa,debit,6.0,6.0,...,0.0000,47.950001,0.0000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,3663550,18403263,49.000000,W,4272,150.0,visa,debit,3.0,2.0,...,0.0000,280.000000,0.0000,77.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,3663551,18403310,171.000000,W,4476,150.0,visa,debit,2.0,2.0,...,0.0000,1058.000000,0.0000,0.000000,0.0,0.0,0.0,0.0,263.0,0.0
3,3663552,18403310,285.000000,W,10989,150.0,visa,debit,5.0,2.0,...,0.0000,0.000000,0.0000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,3663553,18403317,67.937500,W,18018,150.0,mastercard,debit,6.0,6.0,...,67.9375,183.850006,67.9375,67.949997,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506686,4170235,34214279,94.687500,C,13832,185.0,mastercard,debit,1.0,1.0,...,0.0000,0.000000,0.0000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
506687,4170236,34214287,12.171875,C,3154,185.0,mastercard,debit,1.0,3.0,...,0.0000,31.723700,0.0000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
506688,4170237,34214326,49.000000,W,16661,150.0,visa,debit,1.0,1.0,...,0.0000,0.000000,0.0000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
506689,4170238,34214337,202.000000,W,16621,150.0,mastercard,debit,1.0,1.0,...,0.0000,0.000000,0.0000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Select categorical columns
categorical_cols = [cname for cname in train.columns if
                   train[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in train.columns if 
                  train[cname].dtype in ['int8', 'int16', 'int32', 'float16', 'float32']]

In [23]:
from imblearn import over_sampling, under_sampling
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [24]:
# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant')), ('scale', StandardScaler())])


# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant')),
                                           ('onehot', OneHotEncoder(dtype=np.int8, handle_unknown='ignore'))])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [25]:
test.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card3,card4,card6,C1,C2,...,V309,V310,V311,V312,V316,V317,V318,V319,V320,V321
0,3663549,18403224,31.953125,W,10409,150.0,visa,debit,6.0,6.0,...,0.0,47.950001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3663550,18403263,49.0,W,4272,150.0,visa,debit,3.0,2.0,...,0.0,280.0,0.0,77.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3663551,18403310,171.0,W,4476,150.0,visa,debit,2.0,2.0,...,0.0,1058.0,0.0,0.0,0.0,0.0,0.0,0.0,263.0,0.0
3,3663552,18403310,285.0,W,10989,150.0,visa,debit,5.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3663553,18403317,67.9375,W,18018,150.0,mastercard,debit,6.0,6.0,...,67.9375,183.850006,67.9375,67.949997,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card3,card4,card5,card6,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,2987000,0,86400,68.5,W,13926,150.0,discover,142.0,credit,...,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0
1,2987001,0,86401,29.0,W,2755,150.0,mastercard,102.0,credit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2987002,0,86469,59.0,W,4663,150.0,visa,166.0,debit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2987003,0,86499,50.0,W,18132,150.0,mastercard,117.0,debit,...,135.0,0.0,0.0,0.0,50.0,1404.0,790.0,0.0,0.0,0.0
4,2987004,0,86506,50.0,H,4497,150.0,mastercard,102.0,credit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
train.dtypes

TransactionID       int32
isFraud              int8
TransactionDT       int32
TransactionAmt    float16
ProductCD          object
                   ...   
V317              float32
V318              float32
V319              float32
V320              float32
V321              float32
Length: 111, dtype: object

In [28]:
train = train[train['TransactionAmt'] < 30000]

In [29]:
train_dummy = pd.get_dummies(train, drop_first = True)

In [30]:
train_dummy.drop(columns = "TransactionID", inplace = True)

In [31]:
X_train, X_valid, y_train, y_valid = train_test_split(
    train_dummy.drop(columns='isFraud'), train_dummy['isFraud'], test_size=0.25, random_state=0)

In [32]:
import xgboost as xgb
import time
plt.style.use('seaborn-whitegrid')
import warnings
warnings.simplefilter("ignore")
import lightgbm as lgb
colors = [x['color'] for x in plt.rcParams['axes.prop_cycle']]

In [33]:
params = {'num_leaves': 260,
          'min_child_samples': 85,
          'objective': 'binary',
          'max_depth': 15,
          'learning_rate': 0.05,
          "boosting_type": "gbdt",
          "subsample_freq": 3,
          "subsample": 0.9,
          "bagging_seed": 11,
          "metric": 'auc',
          "verbosity": -1,
          'reg_alpha': 0.3,
          'reg_lambda': 0.3,
          'colsample_bytree': 0.9,
          'is_unbalance':True

         }

d_train = lgb.Dataset(X_train, y_train)
d_valid = lgb.Dataset(X_valid, y_valid)
    
bst = lgb.train(params, d_train, num_boost_round= 400, valid_sets=[d_train, d_valid], verbose_eval=50, early_stopping_rounds=50)

Training until validation scores don't improve for 50 rounds
[50]	training's auc: 0.962996	valid_1's auc: 0.927567
[100]	training's auc: 0.975901	valid_1's auc: 0.938226
[150]	training's auc: 0.983277	valid_1's auc: 0.943843
[200]	training's auc: 0.987888	valid_1's auc: 0.946671
[250]	training's auc: 0.991044	valid_1's auc: 0.94825
[300]	training's auc: 0.993459	valid_1's auc: 0.949594
[350]	training's auc: 0.995233	valid_1's auc: 0.950991
[400]	training's auc: 0.996508	valid_1's auc: 0.95211
Did not meet early stopping. Best iteration is:
[400]	training's auc: 0.996508	valid_1's auc: 0.95211


In [34]:
test = pd.get_dummies(test, drop_first=True)
#test_dummy.info()
float_col = test.select_dtypes(include='float64').columns
test[float_col] = test[float_col].astype("float32")

In [35]:
pred = bst.predict(test,predict_disable_shape_check=True)

In [36]:
pred

array([0.49176454, 0.74873629, 0.60914568, ..., 0.12195884, 0.12195884,
       0.21085466])

In [37]:
submission = pd.read_csv('sample_submission.csv')
submission['isFraud'] = pred
submission.to_csv('submission.csv',index=False)
submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.491765
1,3663550,0.748736
2,3663551,0.609146
3,3663552,0.168105
4,3663553,0.087882
