# **SETUP**

In [79]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [80]:
%%capture
!pip install catboost

# **Importing libraries**

In [164]:
import io
import os
import gc
import re
import random
import pickle
from pathlib import Path

import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import  accuracy_score

from sklearn.metrics import accuracy_score, log_loss
from catboost import CatBoostClassifier, Pool

In [82]:
def save_pkl(dir, name, obj):
    dir.mkdir(exist_ok=True)
    with open(dir / name, 'wb') as f:
        pickle.dump(obj, f)

def load_pkl(dir, name):
    with open(dir / name, 'rb') as f:
        return pickle.load(f)

def set_seed(seed=42):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

# **Load the dataset**

In [106]:
# Testing path
path = "/content/drive/MyDrive/BankTransactionClassifier/"

In [107]:
# Load the files into a Pandas Dataframe
train = pd.read_csv(path+'Train.csv')
test = pd.read_csv(path+'Test.csv')
ss = pd.read_csv(path+'SampleSubmission.csv')
testID = test["ID"]

In [108]:
selectCol = ["DATE", "TRANSACTION DETAILS", "Account_NO", "WITHDRAWAL AMT", "DEPOSIT AMT", "BALANCE AMT"]
label = ["Category"]

In [109]:
import re
train = train[selectCol + label].dropna()
test = test[selectCol]

train["TRANSACTION DETAILS"] = train["TRANSACTION DETAILS"].apply(lambda x : re.sub(r'\W','',str(x)))
test["TRANSACTION DETAILS"] = test["TRANSACTION DETAILS"].apply(lambda x : re.sub(r'\W','',str(x)))

In [110]:
#train["TRANSACTION DETAILS"] = train["TRANSACTION DETAILS"].apply(lambda x : re.sub(r'\d','',str(x)))
#test["TRANSACTION DETAILS"] = test["TRANSACTION DETAILS"].apply(lambda x : re.sub(r'\d','',str(x)))

In [111]:
def getKmers(sequence, size = 6):
  return [sequence[x:x+size] for x in range(len(sequence) - size + 1)]

def createNewSequence(sequence):
  newSeq = [sequence]
  for size in [3, 4, 5, 6, 7]:
    newSeq = newSeq + getKmers(sequence, size = 6)
  return re.sub("\s\d*\s", " NUMBERS "," ".join(newSeq))

In [112]:
createNewSequence(sequence = "NEFTFDRL401249529INDIAFORENSIC")

'NEFTFDRL401249529INDIAFORENSIC NEFTFD EFTFDR FTFDRL TFDRL4 FDRL40 DRL401 RL4012 L40124 NUMBERS 012495 NUMBERS 249529 49529I 9529IN 529IND 29INDI 9INDIA INDIAF NDIAFO DIAFOR IAFORE AFOREN FORENS ORENSI RENSIC NEFTFD EFTFDR FTFDRL TFDRL4 FDRL40 DRL401 RL4012 L40124 NUMBERS 012495 NUMBERS 249529 49529I 9529IN 529IND 29INDI 9INDIA INDIAF NDIAFO DIAFOR IAFORE AFOREN FORENS ORENSI RENSIC NEFTFD EFTFDR FTFDRL TFDRL4 FDRL40 DRL401 RL4012 L40124 NUMBERS 012495 NUMBERS 249529 49529I 9529IN 529IND 29INDI 9INDIA INDIAF NDIAFO DIAFOR IAFORE AFOREN FORENS ORENSI RENSIC NEFTFD EFTFDR FTFDRL TFDRL4 FDRL40 DRL401 RL4012 L40124 NUMBERS 012495 NUMBERS 249529 49529I 9529IN 529IND 29INDI 9INDIA INDIAF NDIAFO DIAFOR IAFORE AFOREN FORENS ORENSI RENSIC NEFTFD EFTFDR FTFDRL TFDRL4 FDRL40 DRL401 RL4012 L40124 NUMBERS 012495 NUMBERS 249529 49529I 9529IN 529IND 29INDI 9INDIA INDIAF NDIAFO DIAFOR IAFORE AFOREN FORENS ORENSI RENSIC'

In [113]:
train["Sequence"] = train["TRANSACTION DETAILS"].apply(lambda x: createNewSequence(str(x)))
test["Sequence"] = test["TRANSACTION DETAILS"].apply(lambda x: createNewSequence(str(x)))

train["Sequence_Length"] = train["Sequence"].apply(lambda x: len(x.split()))
test["Sequence_Length"] = test["Sequence"].apply(lambda x: len(x.split()))

In [115]:
LABEL2ID = {label:i for i, label in enumerate(train['Category'].unique())}
ID2LABEL = {v:k for k, v in LABEL2ID.items()}

train['label_ids'] = train['Category'].map(LABEL2ID)

In [119]:
train['Category'].value_counts()

Money-Transfer                      33555
Deposit                             12431
Merchant-Payment                     3160
Cheque-Payment                       1427
Bank Charges                         1122
Cash-Pickup                          1064
Salary and wages                      808
Bill-Payments                         482
Shopping                              119
Cell Phone and Airtime                103
Interest                               70
Loan Repayment                         69
Reversal                               66
Internet and IT Services               22
Insurance                              20
Professional services                  15
Savings and Investments                10
General Purchases                       4
Transport, Travel, and Logistics        2
Donations                               2
Entertainment                           1
Name: Category, dtype: int64

In [120]:
lessrepclass = ['Professional services', 'Savings and Investments', 'General Purchases',
       'Transport, Travel, and Logistics', 'Donations', 'Entertainment']

In [158]:
lessrepclass_x =  pd.DataFrame(train.set_index("Category").loc[lessrepclass, :].reset_index()["Sequence"]) #
lessrepclass_y =  train.set_index("Category").loc[lessrepclass, :].reset_index()["label_ids"].to_list()

In [159]:
lessrepclass_y 

[17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 18,
 18,
 18,
 18,
 13,
 13,
 19,
 19,
 16]

In [132]:
train = train.reset_index(drop = True)

In [196]:
train_targets = train['label_ids'].values
train_text = train[['Sequence']]
test_text = train[['Sequence']]
test_features = test[['Sequence']]


cv = list(StratifiedKFold(n_splits=10).split(train, train['label_ids']))

# **CATBOOST MODEL**

In [197]:
catboost_params = {
    'iterations': 1000,
    'learning_rate': 0.1,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    'task_type': 'GPU',
    'early_stopping_rounds': 50,
    'use_best_model': True,
    'verbose': 50,
}

In [198]:
%%time
val_scores = []
val_preds = np.zeros((len(train), len(ID2LABEL)), dtype="float32")
test_preds = np.zeros((10, len(test), len(ID2LABEL)), dtype="float32")

for fold in range(10):
    print('='*30)
    print(f'======fold: {fold} start======')

    trn_idx, val_idx = cv[fold]
    trn_features, val_features = pd.concat([train_text.loc[trn_idx],lessrepclass_x], axis = 0).reset_index(drop = True),pd.concat([train_text.loc[val_idx],lessrepclass_x], axis = 0).reset_index(drop = True)
    trn_targets, val_targets = list(train_targets[trn_idx]) + lessrepclass_y, list(train_targets[val_idx]) + lessrepclass_y

    train_pool = Pool(
        trn_features, 
        trn_targets, 
        text_features=['Sequence'],
    )
    valid_pool = Pool(
        val_features, 
        val_targets, 
        text_features=['Sequence'],
    )

    model = CatBoostClassifier(**catboost_params)
    model.fit(train_pool, eval_set=valid_pool)
    val_pred = model.predict_proba(val_features)
    score = log_loss(val_targets, val_pred)

    print(f"score {score:.4f}")

    val_preds[val_idx] = model.predict_proba(train_text.loc[val_idx])
    test_preds[fold] = model.predict_proba(test_features)

0:	learn: 0.9476694	test: 0.9280510	best: 0.9280510 (0)	total: 88.6ms	remaining: 1m 28s
50:	learn: 0.9956849	test: 0.9872495	best: 0.9872495 (40)	total: 1.87s	remaining: 34.9s
bestTest = 0.9872495446
bestIteration = 40
Shrink model to first 41 iterations.
score 0.0851
0:	learn: 0.9468349	test: 0.9360656	best: 0.9360656 (0)	total: 78.5ms	remaining: 1m 18s
50:	learn: 0.9955221	test: 0.9928962	best: 0.9928962 (38)	total: 1.84s	remaining: 34.3s
bestTest = 0.9928961749
bestIteration = 38
Shrink model to first 39 iterations.
score 0.0731
0:	learn: 0.9467139	test: 0.9366005	best: 0.9366005 (0)	total: 119ms	remaining: 1m 58s
50:	learn: 0.9959293	test: 0.9912552	best: 0.9912552 (44)	total: 2s	remaining: 37.3s
bestTest = 0.9912552377
bestIteration = 44
Shrink model to first 45 iterations.
score 0.0730
0:	learn: 0.9447599	test: 0.9548187	best: 0.9548187 (0)	total: 79.2ms	remaining: 1m 19s
50:	learn: 0.9951355	test: 0.9934414	best: 0.9934414 (45)	total: 1.83s	remaining: 34.1s
100:	learn: 0.9957257

In [199]:
print(f"all oof score {log_loss(train_targets, val_preds)}")

all oof score 0.05635394761973916


In [200]:
pd.DataFrame(np.mean(test_preds, axis = 0))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0.989672,0.000581,0.000739,0.000656,0.000716,0.000876,0.000711,0.000722,0.000403,0.000385,...,0.000444,0.000419,0.000348,0.000379,0.000452,0.000331,0.000402,0.000366,0.000349,0.000402
1,0.989672,0.000581,0.000739,0.000656,0.000716,0.000876,0.000711,0.000722,0.000403,0.000385,...,0.000444,0.000419,0.000348,0.000379,0.000452,0.000331,0.000402,0.000366,0.000349,0.000402
2,0.014940,0.005196,0.932527,0.006070,0.005598,0.004969,0.007083,0.002278,0.001441,0.001398,...,0.002478,0.001748,0.000822,0.001366,0.002438,0.000722,0.001579,0.000961,0.000823,0.001711
3,0.989794,0.000574,0.000729,0.000648,0.000707,0.000864,0.000703,0.000717,0.000398,0.000380,...,0.000439,0.000414,0.000344,0.000374,0.000447,0.000327,0.000397,0.000362,0.000344,0.000397
4,0.014940,0.005196,0.932527,0.006070,0.005598,0.004969,0.007083,0.002278,0.001441,0.001398,...,0.002478,0.001748,0.000822,0.001366,0.002438,0.000722,0.001579,0.000961,0.000823,0.001711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22620,0.003801,0.001884,0.002533,0.002008,0.005320,0.002889,0.970729,0.001068,0.000778,0.000771,...,0.000938,0.000797,0.000480,0.000752,0.000934,0.000423,0.000833,0.000538,0.000477,0.000766
22621,0.003801,0.001884,0.002533,0.002008,0.005320,0.002889,0.970729,0.001068,0.000778,0.000771,...,0.000938,0.000797,0.000480,0.000752,0.000934,0.000423,0.000833,0.000538,0.000477,0.000766
22622,0.003801,0.001884,0.002533,0.002008,0.005320,0.002889,0.970729,0.001068,0.000778,0.000771,...,0.000938,0.000797,0.000480,0.000752,0.000934,0.000423,0.000833,0.000538,0.000477,0.000766
22623,0.003801,0.001884,0.002533,0.002008,0.005320,0.002889,0.970729,0.001068,0.000778,0.000771,...,0.000938,0.000797,0.000480,0.000752,0.000934,0.000423,0.000833,0.000538,0.000477,0.000766


In [201]:
sub = pd.DataFrame()
sub["ID"] = testID
sub[list(ID2LABEL.values())]  = np.mean(test_preds, axis = 0)

In [202]:
sub.head()

Unnamed: 0,ID,Money-Transfer,Salary and wages,Bank Charges,Cash-Pickup,Cheque-Payment,Deposit,Merchant-Payment,Reversal,Savings and Investments,...,Shopping,Interest,"Transport, Travel, and Logistics",Insurance,Cell Phone and Airtime,Entertainment,Professional services,General Purchases,Donations,Loan Repayment
0,2014-02-28 00:00:00X3000000000,0.989672,0.000581,0.000739,0.000656,0.000716,0.000876,0.000711,0.000722,0.000403,...,0.000444,0.000419,0.000348,0.000379,0.000452,0.000331,0.000402,0.000366,0.000349,0.000402
1,2014-02-28 00:00:00X7500000000,0.989672,0.000581,0.000739,0.000656,0.000716,0.000876,0.000711,0.000722,0.000403,...,0.000444,0.000419,0.000348,0.000379,0.000452,0.000331,0.000402,0.000366,0.000349,0.000402
2,2014-03-08 00:00:00X4500002863,0.01494,0.005196,0.932527,0.00607,0.005598,0.004969,0.007083,0.002278,0.001441,...,0.002478,0.001748,0.000822,0.001366,0.002438,0.000722,0.001579,0.000961,0.000823,0.001711
3,2014-03-08 00:00:00X6000002863,0.989794,0.000574,0.000729,0.000648,0.000707,0.000864,0.000703,0.000717,0.000398,...,0.000439,0.000414,0.000344,0.000374,0.000447,0.000327,0.000397,0.000362,0.000344,0.000397
4,2014-03-08 00:00:00X6000005726,0.01494,0.005196,0.932527,0.00607,0.005598,0.004969,0.007083,0.002278,0.001441,...,0.002478,0.001748,0.000822,0.001366,0.002438,0.000722,0.001579,0.000961,0.000823,0.001711


In [203]:
sub.to_csv("CatBoostModel10Folds.csv", index = False)