# Report for Transaction-Receipt Matcher Task

In [None]:
import cupy as cp # linear algebra
import cudf # data processing, CSV file I/O (e.g. cudf.read_csv)
from cuml.preprocessing.model_selection import train_test_split
from cuml.metrics import roc_auc_score, confusion_matrix, precision_recall_curve
from cuml.preprocessing.LabelEncoder import LabelEncoder

from sklearn.metrics import classification_report, plot_confusion_matrix
from sklearn.model_selection import train_test_split as sktrain_test_split
import sklearn
from random import shuffle
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import xgboost
from cupy import asnumpy

from ray.tune.schedulers import ASHAScheduler
from ray import tune
from ray.tune.integration.xgboost import TuneReportCheckpointCallback


import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pprint

# Standard plotly imports
#import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.offline import iplot, init_notebook_mode
#import cufflinks
#import cufflinks as cf
import plotly.figure_factory as ff

# Using plotly + cufflinks in offline mode
init_notebook_mode(connected=True)

### **Preview dataset** <a class="anchor" id="4.3.2"></a> 

In [None]:
# Load the colon-delimited dataset twice from the same file: once into a cudf
# (GPU) dataframe and once into a pandas (CPU) dataframe.
csv_path = '/kaggle/input/tide-data/data_interview_test (1).csv'
data = cudf.read_csv(csv_path, sep=":")
pdf = pd.read_csv(csv_path, sep=":")

# Preview the first rows of the cudf dataframe.
data.head()

### Shape of dataset

In [None]:
# Display the (rows, columns) dimensions of the cudf dataframe.
data.shape

We have a total of 12034 samples, and each has a matching feature vector of size 10.

### Create labels 
- Transaction and Receipt are correct matches where matched_transaction_id = feature_transaction_id
- We create a column for the labels using above condition

In [None]:
# A row is labelled 1 when matched_transaction_id equals feature_transaction_id
# and 0 otherwise. The same label is added to both the cudf and pandas frames.
data['label'] = (data['matched_transaction_id'] == data['feature_transaction_id']).astype(int)
pdf['label'] = (pdf['matched_transaction_id'] == pdf['feature_transaction_id']).astype(int)

### Let's check the distribution of label ( or correct vs wrong matches)

In [None]:
# Plot the label histogram and print the exact number of rows per label.
pdf.hist(column='label')
label_counts = pdf['label'].value_counts()
print(label_counts)

- The data contains 12034 samples in total, of which 11177 are wrong matches and only 857 are correct matches.
- The data is highly imbalanced.

## Evaluation Metric
- Before creating the models, we need to decide on an evaluation metric.
- Simple accuracy is a bad metric here because the data is highly imbalanced: labeling every sample as an incorrect match would already give a very high accuracy.
- Recall = true positives/(true positives + false negatives) 
- Precision = true positives/(true positives + false positives)
- Recall can be seen as the ability of the model to find all the positive samples, and precision as the ability not to label a negative sample as positive.
- Both of these are important, but our main priority is a high recall, so that the correct matches are found.

## Baseline
- Create a baseline model using the matching vector as features.
- We start with a simple linear regression, a random forest, and an XGBoost model.
- The XGBoost model produces better results for the baseline, so moving forward we will optimize the XGBoost model only.


In [None]:


# Baseline: XGBoost trained on the raw matching-vector columns.
# Rebuild the target column with the same definition used earlier.
data['label'] = (data['matched_transaction_id'] == data['feature_transaction_id']).astype(int)

# Model inputs are all columns from the fifth onward, minus the target itself.
y = data['label']
x = data[list(data.columns[4:])].drop(['label'], axis=1)

# Shuffled 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, shuffle=True
)

train = xgboost.DMatrix(X_train, label=y_train)
test = xgboost.DMatrix(X_test, label=y_test)
# NOTE: the watchlist is assembled here but is not passed to xgboost.train.
watchlist = [(test, 'eval'), (train, 'train')]

xgb_params = dict(
    objective='binary:logistic',
    tree_method='gpu_hist',
    max_depth=2,
    eta=0.1,
    silent=1,
    subsample=0.5,
    colsample_bytree=0.05,
)

clf = xgboost.train(xgb_params, train, num_boost_round=10000)

# Scores predicted by the trained booster for the held-out rows.
preds = clf.predict(test)



In [None]:
# Round the predicted scores to hard 0/1 labels and move the true labels to
# host memory once, so both reports use the same arrays.
rounded_preds = preds.round(0)
y_true = asnumpy(y_test)

print(classification_report(y_true, rounded_preds))
print( " In the Confusion Matrix below, the digonal values represent correct classification for each class : ")
labels = ['label-0', 'label-1']

cm = sklearn.metrics.confusion_matrix(y_true, asnumpy(rounded_preds.astype(int)))

# Draw the confusion matrix as an annotated heatmap.
ax = plt.subplot()
sns.heatmap(cm.astype(int), annot=True, fmt='g', ax=ax)

# Axis labels, title and tick labels.
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels);
        

- The baseline XGBoost model produces decent results: the precision is high, but the recall is low, meaning the model is missing a lot of correct matches, which is not good.


### Feature Engineering
- We don't have much information about how the features were created, so it is hard to engineer more features out of them. Looking at some of the features, however, we can see that all values lie between 0 and 1, and we need to see how these values are related to the final label.
- Let's see how the distribution of values of one of the feature columns looks.



In [None]:
# Histogram of the DateMappingMatch column, drawn from the pandas dataframe.
pdf.hist(column="DateMappingMatch")
print(" Distribuition of DateMappingMatch feature values")

- It could be seen  DateMappingMatch is 0 for more than 8k samples.
- We can check if this distribution is related to the label:

In [None]:
# Mean label for each distinct DateMappingMatch value.
dg = data.groupby('DateMappingMatch').agg({'label': ['mean']})
dg.columns = ['label_mean']

# Convert the aggregated column to pandas and draw it as a bar chart.
label_mean_by_value = dg['label_mean'].to_pandas()
ax = label_mean_by_value.plot(kind='bar')
ax.set_ylabel('label_mean')

print ("DateMappingMatch related to average label value ")

- It can be seen here that DateMappingMatch values > 0.8 correspond to a higher label mean, i.e. a larger DateMappingMatch value is related to the positive label.
-  We can group these values into bins and count the size of each bin to create a new feature.
- Using these new features we train the model again.

In [None]:

# Keep an untouched copy of the current dataframe; later cells restore it.
org_data = data.copy()

# Columns to derive count features from: the fifth column up to, but not
# including, the last one (expected to be the label added earlier).
feat_col = data.columns[4:-1]
le = LabelEncoder()
for col in feat_col:
    count_name = f'{col}_count'

    # Number of rows sharing each distinct value of this column.
    var_count = data.groupby(col).agg({col: 'count'})
    var_count.columns = [count_name]
    var_count = var_count.reset_index()

    # Attach the count to every row, label-encode it, then divide the codes
    # by the largest code.
    data = data.merge(var_count, on=col, how='left')
    le.fit(data[count_name])
    encoded = le.transform(data[count_name])
    data[count_name] = encoded / encoded.max()

# Model inputs are all columns from the fifth onward, minus the target itself.
y = data['label']
x = data[list(data.columns[4:])].drop(['label'], axis=1)

# Shuffled 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, shuffle=True
)

train = xgboost.DMatrix(X_train, label=y_train)
test = xgboost.DMatrix(X_test, label=y_test)
# NOTE: the watchlist is assembled here but is not passed to xgboost.train.
watchlist = [(test, 'eval'), (train, 'train')]

xgb_params = dict(
    objective='binary:logistic',
    tree_method='gpu_hist',
    max_depth=2,
    eta=0.1,
    silent=1,
    subsample=0.5,
    colsample_bytree=0.05,
)

clf = xgboost.train(xgb_params, train, num_boost_round=10000)

# Score the held-out rows, then round to hard 0/1 labels for the reports.
preds = clf.predict(test)
rounded_preds = preds.round(0)
y_true = asnumpy(y_test)

print(classification_report(y_true, rounded_preds))
print( " In the Confusion Matrix below, the digonal values represent correct classification for each class : ")
labels = ['label-0', 'label-1']

cm = sklearn.metrics.confusion_matrix(y_true, asnumpy(rounded_preds.astype(int)))

# Draw the confusion matrix as an annotated heatmap.
ax = plt.subplot()
sns.heatmap(cm.astype(int), annot=True, fmt='g', ax=ax)

# Axis labels, title and tick labels.
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels);
        


- As seen from the above results, the new features don't make much difference.
- For now we won't use them: they don't provide any significant improvement, and they would cause problems because creating these features when testing on new data is not straightforward.
- These features are also derived directly from the distribution of the training data, which will change in the test data, so they are not robust as more data comes in the future.

### We need to handle the class imbalance to improve the results.
-  The best method would be to use a weighted loss, meaning the model is penalised more when the rare class is misclassified.
- The weight by which we multiply the loss can be taken as the class imbalance ratio, although other values might also work; for now we use 13 (11177 / 857 ≈ 13, the class imbalance ratio).

In [None]:

# Go back to the dataframe saved before the count features were added.
data = org_data
data['label'] = (data['matched_transaction_id'] == data['feature_transaction_id']).astype(int)

# Model inputs are all columns from the fifth onward, minus the target itself.
y = data['label']
x = data[list(data.columns[4:])].drop(['label'], axis=1)

# Shuffled 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, shuffle=True
)

train = xgboost.DMatrix(X_train, label=y_train)
test = xgboost.DMatrix(X_test, label=y_test)
# NOTE: the watchlist is assembled here but is not passed to xgboost.train.
watchlist = [(test, 'eval'), (train, 'train')]

# Same parameters as the baseline plus a positive-class weight of 13.
xgb_params = dict(
    objective='binary:logistic',
    tree_method='gpu_hist',
    max_depth=2,
    eta=0.1,
    silent=1,
    subsample=0.5,
    colsample_bytree=0.05,
    scale_pos_weight=13,
)

clf = xgboost.train(xgb_params, train, num_boost_round=20000)

# Score the held-out rows, then round to hard 0/1 labels for the reports.
preds = clf.predict(test)
rounded_preds = preds.round(0)
y_true = asnumpy(y_test)

print(classification_report(y_true, rounded_preds))
print( " In the Confusion Matrix below, the digonal values represent correct classification for each class : ")
labels = ['label-0', 'label-1']

cm = sklearn.metrics.confusion_matrix(y_true, asnumpy(rounded_preds.astype(int)))

# Draw the confusion matrix as an annotated heatmap.
ax = plt.subplot()
sns.heatmap(cm.astype(int), annot=True, fmt='g', ax=ax)

# Axis labels, title and tick labels.
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels);
        

- This weighted loss trick improved the recall significantly. The precision is not that high, but a high recall is preferable, as it corresponds to correctly finding the matching receipt.
- As seen in the confusion matrix, we now have more false positives (527). Although this is bad, we should check how confident the model is when producing these false positives.

In [None]:
# Boolean masks over the test rows: predicted positive and actually positive
# (tp), or predicted positive but actually negative (fp).
predicted_pos = preds.round(0) == 1
actual = asnumpy(y_test)
tp = predicted_pos & (actual == 1)
fp = predicted_pos & (actual == 0)

# Attach the raw scores to a copy of the test features and summarise the
# scores of the false positives.
xtest = X_test.copy()
xtest['scores'] = preds
xtest['scores'][fp].describe()

- The above stats show that the mean confidence of the model on false positives is 0.72, and the majority of false positives have around 0.71 confidence. This is not good, as the model shows significant confidence in wrong predictions.
- We can check the model confidence for true positives (meaning correctly classified positive samples):

In [None]:
# Summary statistics of the predicted scores for the true-positive test rows.
xtest.scores[tp].describe()

 - The mean confidence for true positives is 0.88, which is good and higher than the mean confidence for false positives, but the gap is still a problem: we need to increase this difference without affecting the recall much.
 - One reason might be that we are using too high a weight (13) in the weighted loss; we can try to bring it down.

In [None]:

# Go back to the dataframe saved before the count features were added.
data = org_data
data['label'] = (data['matched_transaction_id'] == data['feature_transaction_id']).astype(int)

# Model inputs are all columns from the fifth onward, minus the target itself.
y = data['label']
x = data[list(data.columns[4:])].drop(['label'], axis=1)

# Shuffled 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, shuffle=True
)

train = xgboost.DMatrix(X_train, label=y_train)
test = xgboost.DMatrix(X_test, label=y_test)
# NOTE: the watchlist is assembled here but is not passed to xgboost.train.
watchlist = [(test, 'eval'), (train, 'train')]

# Same parameters as the baseline plus a positive-class weight of 8.
xgb_params = dict(
    objective='binary:logistic',
    tree_method='gpu_hist',
    max_depth=2,
    eta=0.1,
    silent=1,
    subsample=0.5,
    colsample_bytree=0.05,
    scale_pos_weight=8,
)

clf = xgboost.train(xgb_params, train, num_boost_round=20000)

# Score the held-out rows, then round to hard 0/1 labels for the reports.
preds = clf.predict(test)
rounded_preds = preds.round(0)
y_true = asnumpy(y_test)

print(classification_report(y_true, rounded_preds))
print( " In the Confusion Matrix below, the digonal values represent correct classification for each class : ")
labels = ['label-0', 'label-1']

cm = sklearn.metrics.confusion_matrix(y_true, asnumpy(rounded_preds.astype(int)))

# Draw the confusion matrix as an annotated heatmap.
ax = plt.subplot()
sns.heatmap(cm.astype(int), annot=True, fmt='g', ax=ax)

# Axis labels, title and tick labels.
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels);
        

In [None]:
# Boolean masks over the test rows: predicted positive and actually positive
# (tp), or predicted positive but actually negative (fp).
predicted_pos = preds.round(0) == 1
actual = asnumpy(y_test)
tp = predicted_pos & (actual == 1)
fp = predicted_pos & (actual == 0)

# Attach the raw scores to a copy of the test features, then summarise the
# scores of each group.
xtest = X_test.copy()
xtest['scores'] = preds

print ( "false postive stats")
print(xtest['scores'][fp].describe())
print()
print ( "True postive stats")
print(xtest['scores'][tp].describe())

 - With the weighted-loss parameter set to 8 instead of 13, the model confidence on false positives is lower and the difference in confidence between the two cases has increased, which is good; the overall model is not affected much.
 - We are going in the right direction.
 - The next step is to see if we can find better hyperparameters for the model to further boost performance.


### HyperParameter optimization 
- Now that we already have a very good model, we can try to select the best hyperparameters for the XGBoost model to further improve performance.
- We will be using the Ray Tune library for this purpose, as it integrates nicely with the cuML and XGBoost libraries we are using for faster training on the GPU (compared to normal scikit-learn models on the CPU).
- After a lot of trials with different parameter search spaces, the resulting model can look like a huge improvement over our previous best; however, even when the recall is comparable, on checking we find that the model confidence is very low for true positives, almost the same as for false positives, which is not correct.
- We can try different hyperparameter search algorithms, but the major problem is choosing the right metric for the search: simple accuracy, loss, or AUC does not produce good results for our aim of matching the right receipt with good confidence. The search is also very time-consuming, and it very often produces an overfitted model given the limited amount of data.
- So we won't use the model from the hyperparameter search results and will go with the initially chosen model.

Results from HyperParameter Search :

In [None]:
# Remove the pip-installed `dataclasses` package from the environment
# (presumably it conflicts with a library used below; TODO confirm).
!pip uninstall -y dataclasses

In [None]:

def prep_data(path):
    """Load the matcher CSV and build the model inputs.

    Reads the colon-separated file at *path* with cudf and derives the binary
    ``label`` column (1 where ``matched_transaction_id`` equals
    ``feature_transaction_id``). For every column from the fifth up to the one
    before ``label`` it appends a ``<column>_count`` feature: the number of
    rows sharing that value, label-encoded and divided by the largest code.

    Returns:
        A ``(x, y)`` tuple. ``x`` holds every column from the fifth onward
        except ``label``; ``y`` is the ``label`` column.
    """
    frame = cudf.read_csv(path, sep=":")
    frame['label'] = (frame['matched_transaction_id'] == frame['feature_transaction_id']).astype(int)

    encoder = LabelEncoder()
    # The column list is taken once, before any count column is appended.
    for col in frame.columns[4:-1]:
        count_name = f'{col}_count'

        # Number of rows sharing each distinct value of this column.
        per_value = frame.groupby(col).agg({col: 'count'})
        per_value.columns = [count_name]
        per_value = per_value.reset_index()

        # Attach the count to every row, encode it and scale by the top code.
        frame = frame.merge(per_value, on=col, how='left')
        encoder.fit(frame[count_name])
        codes = encoder.transform(frame[count_name])
        frame[count_name] = codes / codes.max()

    target = frame['label']
    features = frame[list(frame.columns[4:])].drop(['label'], axis=1)
    return features, target
# Rebuild the feature matrix with prep_data (count features included) and
# split it with the same ratio and seed as the earlier cells.
path = "/kaggle/input/tide-data/data_interview_test (1).csv"
x, y = prep_data(path)
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, shuffle=True
)

# Upper bound on boosting rounds per trial. It is shared by xgboost.train and
# the ASHA scheduler: the Tune callback reports once per boosting round and
# ASHA stops a trial after `max_t` reports, so a smaller `max_t` (previously
# 10) silently capped every trial at that many rounds.
NUM_BOOST_ROUND = 10000


def train_receipt_match(config):
    """Train one XGBoost model for a single Ray Tune trial.

    Args:
        config: XGBoost parameter dict sampled by Ray Tune from
            ``search_space``.

    The train/test split is read from the module-level ``X_train``,
    ``X_test``, ``y_train`` and ``y_test``. Metrics and the ``model.xgb``
    checkpoint are reported to Tune through the callback; nothing is returned.
    """
    train = xgboost.DMatrix(X_train, label=y_train)
    test = xgboost.DMatrix(X_test, label=y_test)
    watchlist = [(test, 'eval'), (train, 'train')]
    xgboost.train(
        config,
        train,
        num_boost_round=NUM_BOOST_ROUND,
        evals=watchlist,
        maximize=True,
        verbose_eval=1000,
        callbacks=[TuneReportCheckpointCallback(filename="model.xgb")],
    )


search_space = {
    # You can mix constants with search space objects.
    "objective": "binary:logistic",
    # Train on the GPU that each trial reserves below, as the other training
    # cells in this notebook do.
    "tree_method": "gpu_hist",
    "eval_metric": ["logloss", "error", "aucpr"],
    "max_depth": tune.randint(1, 9),
    "min_child_weight": tune.choice([1, 2, 3]),
    "subsample": tune.uniform(0.4, 1.0),
    "eta": tune.loguniform(1e-4, 1e-1),
    "scale_pos_weight": tune.randint(3, 13),
}

# This will enable aggressive early stopping of bad trials, while letting the
# surviving trials run for the full number of boosting rounds.
scheduler = ASHAScheduler(
    max_t=NUM_BOOST_ROUND,
    grace_period=1,
    reduction_factor=2)

analysis = tune.run(
    train_receipt_match,
    metric="eval-aucpr",
    mode="max",
    # Each trial reserves one whole GPU.
    resources_per_trial={"gpu": 1},
    config=search_space,
    num_samples=10,
    scheduler=scheduler)

        

- As seen below, the model resulting from the hyperparameter search sometimes performs worse than our previous best.
- In some experiments the hyperparameter search also produces an overfitted model, as it usually selects deep trees, which make the XGBoost model more complex and prone to overfitting.

In [None]:
# Load the booster saved by the best trial of the hyperparameter search.
best_bst = xgboost.Booster()
best_bst.load_model(os.path.join(analysis.best_checkpoint, "model.xgb"))
aucpr = analysis.best_result["eval-aucpr"]
# Pretty-print the config dict itself; pprint on an f-string only prints a
# quoted string.
print("Best model parameters:")
pprint.pprint(analysis.best_config)
print(f"Best model total eval-aucpr: {aucpr}")

# Rebuild the evaluation matrix from the current split. The global `test` left
# over from the earlier cells was built from the original feature columns
# only, whereas this booster was trained on the prep_data features (which
# include the *_count columns), so reusing it would score the wrong features.
test = xgboost.DMatrix(X_test, label=y_test)
preds = best_bst.predict(test)

# Round the predicted scores to hard 0/1 labels for the reports.
rounded_preds = preds.round(0)
y_true = asnumpy(y_test)

print(classification_report(y_true, rounded_preds))
print( " In the Confusion Matrix below, the digonal values represent correct classification for each class : ")
labels = ['label-0', 'label-1']

cm = sklearn.metrics.confusion_matrix(y_true, asnumpy(rounded_preds.astype(int)))

# Draw the confusion matrix as an annotated heatmap.
ax = plt.subplot()
sns.heatmap(cm.astype(int), annot=True, fmt='g', ax=ax)

# Axis labels, title and tick labels.
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels);
        

### Conclusion

- We were able to build a good classifier that matches the correct ‘transaction-receipt’ pairs with high confidence.
- Selecting the right metric was hard, as all the usual metrics come with some issues, and we need to analyse the model further to check how confident it is in its right and wrong decisions, even when it produces good results on the chosen metrics.
- Hyperparameter search is hard for this kind of imbalanced data with a low number of samples.
- Due to the limited information about the current feature vector, engineering new features is difficult.

- Creating an ensemble of models might have resulted in slightly better performance, but it is hard to deploy, so we didn't create an ensemble of multiple models.
- We did our complete modelling using libraries (cuML, cuDF) from RAPIDS AI, which enable us to run ML algorithms on GPUs.
- We can also port the model directly to the CPU without any extra step, and we don't need these libraries during inference.