In [1]:
import pandas as pd
import numpy as np
import math
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFECV
import lightgbm as lgb
import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import multiprocessing
import gc
import re

In [2]:
files = ['input/test_identity.csv', 
         'input/test_transaction.csv',
         'input/train_identity.csv',
         'input/train_transaction.csv',
         'input/sample_submission.csv']

def load_data(file):
    return pd.read_csv(file)

# with multiprocessing.Pool() as pool:
#     test_id, test_tr, train_id, train_tr, sub = pool.map(load_data, files)
test_id = load_data('input/test_identity.csv')
test_tr = load_data('input/test_transaction.csv')
train_id = load_data('input/train_identity.csv')
train_tr = load_data('input/train_transaction.csv')
sub = load_data('input/sample_submission.csv')

In [3]:
train = pd.merge(train_tr, train_id, on='TransactionID', how='left')
test = pd.merge(test_tr, test_id, on='TransactionID', how='left')

nameMap = {}
for col in test.columns:
    if '-' in col:
        nameMap[col] = col.replace('-','_')
test.rename(columns=nameMap, inplace=True)

del test_id, test_tr, train_id, train_tr
gc.collect()

one_value_cols = [col for col in train.columns if train[col].nunique() <= 1]
one_value_cols_test = [col for col in test.columns if test[col].nunique() <= 1]

many_null_cols = [col for col in train.columns if train[col].isnull().sum() / train.shape[0] > 0.9]
many_null_cols_test = [col for col in test.columns if test[col].isnull().sum() / test.shape[0] > 0.9]

big_top_value_cols = [col for col in train.columns if train[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]
big_top_value_cols_test = [col for col in test.columns if test[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]

cols_to_drop = list(set(many_null_cols + many_null_cols_test + big_top_value_cols + big_top_value_cols_test + one_value_cols + one_value_cols_test))
cols_to_drop.remove('isFraud')
print('{} features are going to be dropped for being useless'.format(len(cols_to_drop)))
print(cols_to_drop)

train = train.drop(cols_to_drop, axis=1)
test = test.drop(cols_to_drop, axis=1)


82 features are going to be dropped for being useless
['V68', 'id_22', 'V296', 'V112', 'id_23', 'V102', 'V132', 'V133', 'V26', 'V77', 'id_21', 'V27', 'V311', 'V111', 'V309', 'V101', 'id_25', 'V14', 'V110', 'V293', 'V65', 'V321', 'V117', 'V24', 'V114', 'V305', 'V113', 'V115', 'V290', 'V295', 'id_24', 'V118', 'V55', 'V300', 'id_08', 'V25', 'V316', 'id_07', 'V121', 'V98', 'V284', 'C3', 'V286', 'id_26', 'V106', 'V125', 'id_27', 'V23', 'V124', 'V109', 'V120', 'V103', 'D7', 'V122', 'V107', 'V108', 'V134', 'V104', 'V66', 'V320', 'V129', 'V319', 'id_18', 'V299', 'V298', 'V318', 'V67', 'V135', 'V28', 'V301', 'V119', 'V89', 'V116', 'V88', 'V86', 'V281', 'V297', 'dist2', 'V136', 'V123', 'V105', 'V137']


In [4]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: 
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [24]:
testing1 = train["DeviceInfo"]
def findDeviceInfo(inputString):
    if re.match(r'.*sm.*',str(inputString).lower()):
        return ("Samsung")
    elif re.match(r'.*sg.*',str(inputString).lower()):
        return ("Samsung")
    elif re.match(r'.*samsung.*',str(inputString).lower()):
        return ("Samsung")
    elif re.match(r'.*gt.*',str(inputString).lower()):
        return ("Samsung")
    elif re.match(r'.*pixel.*',str(inputString).lower()):
        return ("Google")
    elif re.match(r'.*nexus.*',str(inputString).lower()):
        return ("Google")
    elif re.match(r'.*windows.*',str(inputString).lower()):
        return ("Windows")
    elif re.match(r'.*asus.*',str(inputString).lower()):
        return ("ASUS")
    elif re.match(r'.*lg.*',str(inputString).lower()):
        return ("LG")
    elif re.match(r'.*vs.*',str(inputString).lower()):
        return ("LG")
    elif re.match(r'.*ios.*',str(inputString).lower()):
        return ("Apple")
    elif re.match(r'.*macos.*',str(inputString).lower()):
        return ("Apple")
    elif re.match(r'.*moto.*',str(inputString).lower()):
        return ("Motorola")
    elif re.match(r'.*huawei.*',str(inputString).lower()):
        return ("HUAWEI")
    elif re.match(r'.*ale-.*',str(inputString).lower()):
        return ("HUAWEI")
    elif re.match(r'.*-l.*',str(inputString).lower()):
        return ("HUAWEI")
    elif re.match(r'.*blade.*',str(inputString).lower()):
        return ("BLADE")
    elif re.match(r'.*htc.*',str(inputString).lower()):
        return ("HTC")
    elif re.match(r'.*redmi.*',str(inputString).lower()):
        return ("Redmi")
    elif re.match(r'.*lenovo.*',str(inputString).lower()):
        return ("Lenovo")
    elif re.match(r'.*android.*',str(inputString).lower()):
        return ("Android")
    elif re.match(r'.*e5306.*',str(inputString).lower()):
        return ("SONY")
    elif re.match(r'.*f3213.*',str(inputString).lower()):
        return ("SONY")
    elif re.match(r'.*ilium.*',str(inputString).lower()):
        return ("Ilium")
    elif re.match(r'.*trident.*',str(inputString).lower()):
        return ("Trident")
    elif re.match(r'.*rv:.*',str(inputString).lower()):
        return ("Rv")
    elif re.match(r'.*linux.*',str(inputString).lower()):
        return ("Linux")
    elif re.match(r'.*Hisense.*',str(inputString).lower()):
        return ("Hisense")
    else:
        return ("Others")

testing1.apply(findDeviceInfo)

0          Others
1          Others
2          Others
3          Others
4         Samsung
           ...   
590535     Others
590536     Others
590537     Others
590538     Others
590539     Others
Name: DeviceInfo, Length: 590540, dtype: object

In [28]:
testing2 = train["TransactionAmt"]

testing2 = testing2*100

testing2 = testing2.astype(int)

testing2.head()

0    6850
1    2900
2    5900
3    5000
4    5000
Name: TransactionAmt, dtype: int32