In [1]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFECV
import lightgbm as lgb
import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import multiprocessing
import gc

In [2]:
files = ['input/test_identity.csv',
         'input/test_transaction.csv',
         'input/train_identity.csv',
         'input/train_transaction.csv',
         'input/sample_submission.csv']


def load_data(file):
    """Read the CSV at path `file` and return it as a pandas DataFrame."""
    return pd.read_csv(file)


# Files are read one after another; the order of `files` determines which
# frame lands in which name on the left-hand side.
test_id, test_tr, train_id, train_tr, sub = (load_data(path) for path in files)

In [3]:
# Left-join identity data onto the transactions so every transaction row is kept.
train = train_tr.merge(train_id, on='TransactionID', how='left')
test = test_tr.merge(test_id, on='TransactionID', how='left')

# Only the test frame's column names are normalised (hyphen -> underscore);
# presumably so they line up with the train column names -- TODO confirm.
nameMap = {col: col.replace('-', '_') for col in test.columns if '-' in col}
test = test.rename(columns=nameMap)

# The raw input frames are no longer needed once merged.
del test_id, test_tr, train_id, train_tr
gc.collect()

0

In [4]:
def _split_id_30(frame):
    """Add `OS_id_30` and `version_id_30` to `frame`, derived from `id_30`.

    'Mac OS X' is first collapsed to the single token 'MacOSX' so that the
    first space-separated token is the OS name and the second the version.
    The frame is modified in place.
    """
    frame['id_30'] = frame['id_30'].str.replace('Mac OS X', 'MacOSX')
    tokens = frame['id_30'].str.split(' ', expand=True)
    frame['OS_id_30'] = tokens[0]
    frame['version_id_30'] = tokens[1]


for _frame in (train, test):
    _split_id_30(_frame)

def matchregex(x):
    """Map a raw browser string to a coarse browser-family label.

    `x` is converted with `str()` and lower-cased, then tested against the
    rules below in order; the label of the first rule that matches is
    returned. Anything that matches no rule (including missing values, which
    stringify to 'nan') yields "Generic".

    Returns one of: "Chrome", "Chromium", "Opera", "Firefox", "Puffin",
    "Safari", "IE", "Google", "Native", "Generic".
    """
    text = str(x).lower()
    rules = (
        (r'chrome', "Chrome"),
        (r'chromium', "Chromium"),
        (r'opera', "Opera"),
        (r'firefox', "Firefox"),
        (r'puffin', "Puffin"),
        (r'safari', "Safari"),
        (r'edge', "IE"),
        # Word boundaries are required here: a bare 'ie' substring also occurs
        # inside unrelated words such as "webview", which previously caused
        # "android webview ..." to be labelled "IE" instead of "Native".
        (r'\bie\b', "IE"),
        (r'google', "Google"),
        (r'samsung', "Native"),
        (r'android', "Native"),
    )
    for pattern, label in rules:
        if re.search(pattern, text):
            return label
    return "Generic"
    
def getVersion(mystring):
    """Return the first space-separated token of `mystring` that is a finite number.

    `mystring` is converted with `str()` and split on single spaces. Tokens
    are tried in order with `float()`; the first one that parses to a finite
    value is returned as a float. If no token qualifies, `np.nan` is returned.
    """
    for token in str(mystring).split(' '):
        try:
            value = float(token)
        except ValueError:
            # Not a number; only the parse failure is swallowed, so that
            # unrelated errors (e.g. KeyboardInterrupt) still propagate.
            continue
        # float() also accepts 'nan' / 'inf' / 'infinity'; those are words,
        # not version numbers, so keep scanning.
        if np.isfinite(value):
            return value
    return np.nan

# Derive the browser family and a numeric browser version from id_31 for both
# frames. The version is blanked for the catch-all families 'Native' and
# 'Generic'.
for _frame in (train, test):
    _frame['browser'] = _frame['id_31'].apply(matchregex)
    _parsed_version = _frame['id_31'].apply(getVersion)
    _no_version = _frame['browser'].isin(['Native', 'Generic'])
    _frame['b_version'] = np.where(_no_version, np.nan, _parsed_version)

def findDeviceInfo(inputString):
    """Map a raw device string to a coarse device / vendor label.

    `inputString` is converted with `str()` and lower-cased, then checked for
    the substrings below in order; the label paired with the first substring
    found is returned. If none is found (including missing values, which
    stringify to 'nan'), "Others" is returned.

    The order of the rules matters: short fragments such as 'sm' or 'lg'
    are tested before more specific names further down the table.
    """
    text = str(inputString).lower()
    # All needles must be lower-case because `text` is lower-cased.
    rules = (
        ('sm', "Samsung"),
        ('sg', "Samsung"),
        ('samsung', "Samsung"),
        ('gt', "Samsung"),
        ('pixel', "Google"),
        ('nexus', "Google"),
        ('windows', "Windows"),
        ('asus', "ASUS"),
        ('lg', "LG"),
        ('vs', "LG"),
        ('ios', "Apple"),
        ('macos', "Apple"),
        ('moto', "Motorola"),
        ('huawei', "HUAWEI"),
        ('ale-', "HUAWEI"),
        ('-l', "HUAWEI"),
        ('blade', "BLADE"),
        ('htc', "HTC"),
        ('redmi', "Redmi"),
        ('lenovo', "Lenovo"),
        ('android', "Android"),
        ('e5306', "SONY"),
        ('f3213', "SONY"),
        ('ilium', "Ilium"),
        ('trident', "Trident"),
        ('rv:', "Rv"),
        ('linux', "Linux"),
        # Previously spelled 'Hisense' (capital H), which could never match
        # the lower-cased text, so this label was unreachable.
        ('hisense', "Hisense"),
    )
    for needle, label in rules:
        if needle in text:
            return label
    return "Others"

# Collapse the raw device strings into coarse vendor labels.
train['DeviceInfo'] = train['DeviceInfo'].apply(findDeviceInfo)
test['DeviceInfo'] = test['DeviceInfo'].apply(findDeviceInfo)

# Scale the amount by 100 and store it as an integer. Round before the cast:
# astype(int) truncates, so binary floating-point error would otherwise turn
# e.g. 19.99 * 100 == 1998.9999999999998 into 1998 instead of 1999.
# NOTE: this cell is not idempotent -- re-running it scales the amount again.
train['TransactionAmt'] = (train['TransactionAmt'] * 100).round().astype(int)
test['TransactionAmt'] = (test['TransactionAmt'] * 100).round().astype(int)

In [5]:
# Columns with at most one distinct non-null value.
one_value_cols = [col for col in train.columns if train[col].nunique() <= 1]
one_value_cols_test = [col for col in test.columns if test[col].nunique() <= 1]

# Columns where more than 90% of the rows are null.
many_null_cols = [col for col in train.columns if train[col].isnull().sum() / train.shape[0] > 0.9]
many_null_cols_test = [col for col in test.columns if test[col].isnull().sum() / test.shape[0] > 0.9]

# Columns where a single value (NaN included) covers more than 90% of the rows.
big_top_value_cols = [col for col in train.columns if train[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]
big_top_value_cols_test = [col for col in test.columns if test[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]

# A column flagged in either frame is dropped from both. Work on a set so that
# every name appears exactly once, even if id_30 / id_31 were already flagged.
cols_to_drop = set(many_null_cols + many_null_cols_test + big_top_value_cols + big_top_value_cols_test + one_value_cols + one_value_cols_test)
# The target must survive; discard() does not raise if it was never flagged.
cols_to_drop.discard('isFraud')
# The raw id_30 / id_31 strings are dropped as well; the earlier cell derived
# OS_id_30, version_id_30, browser and b_version from them.
cols_to_drop.update(['id_30', 'id_31'])
# Sorted so the printed list is stable from run to run.
cols_to_drop = sorted(cols_to_drop)
print('{} features are going to be dropped for being useless'.format(len(cols_to_drop)))
print(cols_to_drop)

train = train.drop(cols_to_drop, axis=1)
test = test.drop(cols_to_drop, axis=1)

# pandas writes the row index as an unnamed first column by default.
train.to_csv("train_prepared.csv")
test.to_csv("test_prepared.csv")

84 features are going to be dropped for being useless
['V103', 'V28', 'V136', 'V300', 'V114', 'V105', 'V14', 'V115', 'V125', 'V119', 'V295', 'id_07', 'C3', 'V111', 'V132', 'V309', 'V117', 'V121', 'V25', 'id_27', 'V23', 'V77', 'id_21', 'V55', 'V104', 'V297', 'V86', 'V311', 'V98', 'V134', 'V123', 'id_18', 'V133', 'V320', 'V110', 'V116', 'V299', 'id_23', 'V65', 'V24', 'V102', 'V118', 'V122', 'V290', 'id_24', 'V109', 'V66', 'V124', 'id_08', 'V106', 'V284', 'V67', 'V101', 'V293', 'V318', 'V298', 'V120', 'id_26', 'V305', 'V108', 'V26', 'V296', 'V129', 'V286', 'V88', 'V281', 'id_25', 'V316', 'V112', 'dist2', 'V135', 'V27', 'V301', 'V321', 'D7', 'id_22', 'V137', 'V113', 'V107', 'V68', 'V319', 'V89', 'id_30', 'id_31']


In [6]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of `df` to the smallest dtype whose range fits.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; all other columns are left untouched. Integer columns are
    cast to the first of int8/int16/int32/int64 whose [min, max] range
    contains the column's values (bounds inclusive). Float columns are cast
    to float16 or float32 if the column's min and max lie within that type's
    finite range, otherwise to float64.

    Only the value *range* is checked: casting floats to float16/float32 can
    lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.
    verbose : bool, default True
        If true, print the resulting memory usage and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same `df` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's min/max fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf (all
                # comparisons above are then False).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-byte starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [7]:
# Label-encode every object-typed column of `train`. The encoder is fitted on
# the string values of train and test together, so it knows every value seen in
# either frame; only `train` is transformed here.
for col in tqdm.tqdm(train.columns):
    if train[col].dtype != 'object':
        continue
    train_values = list(train[col].astype(str).values)
    test_values = list(test[col].astype(str).values)
    le = LabelEncoder()
    le.fit(train_values + test_values)
    train[col] = le.transform(train_values)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 354/354 [00:22<00:00, 15.76it/s]


In [8]:
# `test` is not referenced by any later cell shown in this notebook; drop the
# name and ask the garbage collector to reclaim the memory.
del test
gc.collect()

33

In [9]:
# Downcast numeric columns to smaller dtypes; reduce_mem_usage modifies the
# frame in place and returns the same object.
train = reduce_mem_usage(train)

Mem. usage decreased to 428.58 Mb (73.2% reduction)


In [10]:
# Order the rows by TransactionDT once and derive both the feature matrix and
# the target from that single sorted frame (the frame was previously sorted
# twice, once for X and once for y).
train_sorted = train.sort_values('TransactionDT')
X = train_sorted.drop(['isFraud', 'TransactionDT', 'TransactionID'], axis=1)
y = train_sorted['isFraud']

del train, train_sorted
gc.collect()

20

In [11]:
# RFECV does not support NaNs, so every missing value is replaced by the
# sentinel -999 before feature selection.
X = X.fillna(-999)

In [12]:
# LightGBM hyper-parameters, passed as keyword arguments to LGBMClassifier.
# NOTE(review): no bagging_freq is set here; per the LightGBM parameter docs
# bagging is only active when bagging_freq is non-zero, so bagging_fraction
# and bagging_seed look inert as written -- confirm this is intended.
params = dict(
    num_leaves=500,
    min_child_weight=0.1,
    feature_fraction=0.99,
    bagging_fraction=0.01,
    min_data_in_leaf=165,
    objective='binary',
    max_depth=-1,
    learning_rate=0.1,
    boosting_type="gbdt",
    bagging_seed=4041,
    metric='auc',
    verbosity=-1,
    reg_alpha=1.,
    reg_lambda=2.0,
    random_state=4041,
)

In [13]:
# Base estimator whose feature importances drive the elimination.
clf = lgb.LGBMClassifier(**params)

# Unshuffled 5-fold split: fold membership follows the row order of X.
cv_splitter = KFold(n_splits=5, shuffle=False)

# Remove 10 features per elimination round and score each subset by ROC AUC.
rfe = RFECV(
    estimator=clf,
    step=10,
    cv=cv_splitter,
    scoring='roc_auc',
    verbose=2,
)

In [14]:
# Run the cross-validated recursive feature elimination. With verbose=2 one
# "Fitting estimator with N features." line is printed per elimination round.
# NOTE: the output recorded below ends in KeyboardInterrupt, i.e. the saved run
# did not finish, and the cells that read rfe.* need a completed fit.
rfe.fit(X, y)

Fitting estimator with 351 features.
Fitting estimator with 341 features.
Fitting estimator with 331 features.
Fitting estimator with 321 features.
Fitting estimator with 311 features.
Fitting estimator with 301 features.
Fitting estimator with 291 features.
Fitting estimator with 281 features.
Fitting estimator with 271 features.
Fitting estimator with 261 features.
Fitting estimator with 251 features.
Fitting estimator with 241 features.
Fitting estimator with 231 features.
Fitting estimator with 221 features.
Fitting estimator with 211 features.
Fitting estimator with 201 features.
Fitting estimator with 191 features.
Fitting estimator with 181 features.
Fitting estimator with 171 features.
Fitting estimator with 161 features.
Fitting estimator with 151 features.


KeyboardInterrupt: 

In [None]:
# n_features_ is only available after rfe.fit(...) has completed.
print('Optimal number of features:', rfe.n_features_)

In [None]:
# Mean CV score per evaluated feature subset, ordered from the smallest subset
# to the full feature set. `cv_results_` is the current scikit-learn attribute;
# `grid_scores_` is kept as a fallback for older releases that lack it.
if hasattr(rfe, 'cv_results_'):
    mean_scores = np.asarray(rfe.cv_results_['mean_test_score'])
else:
    mean_scores = np.asarray(rfe.grid_scores_)

# Reconstruct how many features each score belongs to. RFECV removes `step`
# features per round (fewer in the last round), so the x axis is NOT simply
# 1, 2, 3, ... unless step == 1.
n_total = X.shape[1]
step = rfe.step
if 0.0 < step < 1.0:
    # A fractional step means "this share of the features per round".
    step = int(max(1, step * n_total))
min_features = getattr(rfe, 'min_features_to_select', 1)
feature_counts = [n_total]
while feature_counts[-1] > min_features:
    feature_counts.append(max(feature_counts[-1] - step, min_features))
feature_counts.reverse()

x_label = "Number of features selected"
if len(feature_counts) != len(mean_scores):
    # Could not reconstruct the schedule; fall back to the subset index and
    # label the axis honestly.
    feature_counts = list(range(1, len(mean_scores) + 1))
    x_label = "Feature subset index"

plt.figure(figsize=(14, 8))
plt.xlabel(x_label)
plt.ylabel("Cross validation score")
plt.plot(feature_counts, mean_scores)
plt.show()

In [None]:
# Names of the features RFECV kept (ranking 1 == selected).
# The result used to be stored in a variable called `list`, which shadowed the
# builtin and broke every later `list(...)` call in the kernel, including
# re-runs of the earlier cells. It is stored as `selected_features` instead.
selected_features = X.columns[rfe.ranking_ == 1].tolist()
for col in selected_features:
    print(col)