In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
import seaborn as sns
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,ConfusionMatrixDisplay, \
                        precision_score,recall_score,f1_score,roc_auc_score,roc_curve
from sklearn.ensemble import RandomForestClassifier , AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from imblearn.combine import SMOTETomek
import pickle
import sys

# Hide library warnings by default, but only when no warning options were
# passed to the interpreter (sys.warnoptions is empty in that case).
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

# This import used to sit at a two-space indent at top level, which is an
# IndentationError and prevented the whole cell from running.
# Int64Index was removed in pandas 2.0, so fall back to importing MultiIndex only.
# NOTE(review): nothing in the visible notebook uses these names; presumably a
# pre-import workaround for a third-party warning - confirm it is still needed.
try:
    from pandas import MultiIndex, Int64Index
except ImportError:
    from pandas import MultiIndex


#### Nodes and edges

In [None]:
df_classes = pd.read_csv('../elliptic_bitcoin_dataset/elliptic_txs_classes.csv')
df_classes.head()

In [None]:
df_classes.info()

In [None]:
df_classes.shape

In [None]:
df_classes['class'].value_counts()


The graph is made of 203,769 nodes and 234,355 edges. Two percent (4,545) of the nodes are labelled class1 (illicit). Twenty-one percent (42,019) are labelled class2 (licit). The remaining transactions are not labelled with regard to licit versus illicit.

In [None]:
df_edgelist = pd.read_csv('../elliptic_bitcoin_dataset/elliptic_txs_edgelist.csv')
df_edgelist.head()

In [None]:
df_edgelist.shape

In [None]:
df_edgelist.info()

### Features

In [None]:
# The features file is read with header=None, so pandas assigns integer column
# labels (0, 1, 2, ...); descriptive names are attached in a later cell.
df_features = pd.read_csv('../elliptic_bitcoin_dataset/elliptic_txs_features.csv',header = None )
df_features.head()

In [None]:
df_features.shape

In [None]:
df_features.info()

## EDA

In [None]:
df_feat_missing = df_features.isna().sum().reset_index().rename(columns = {0:'no.of missing values'})
df_feat_missing.head()

In [None]:
df_feat_missing[df_feat_missing['no.of missing values']!=0].count()

In [None]:
df_features[1].unique()

There are 166 features associated with each node. Due to intellectual property issues, we cannot provide an exact description of all the features in the dataset. There is a time step associated to each node, representing a measure of the time when a transaction was broadcasted to the Bitcoin network. The time steps, running from 1 to 49, are evenly spaced with an interval of about two weeks. Each time step contains a single connected component of transactions that appeared on the blockchain within less than three hours between each other; there are no edges connecting the different time steps.

The first 94 features represent local information about the transaction – including the time step described above, number of inputs/outputs, transaction fee, output volume and aggregated figures such as average BTC received (spent) by the inputs/outputs and average number of incoming (outgoing) transactions associated with the inputs/outputs. The remaining 72 features are aggregated features, obtained using transaction information one-hop backward/forward from the center node - giving the maximum, minimum, standard deviation and correlation coefficients of the neighbour transactions for the same information data (number of inputs/outputs, transaction fee, etc.).

In [None]:
# Column names derived from the dataset description: the transaction id, the
# time step, then 93 further local features and 72 aggregated features.
local_names = [f"trans_feat_{n}" for n in range(1, 94)]
aggregate_names = [f"aggre_feat_{n}" for n in range(1, 73)]
col = ['txid', 'timestamp', *local_names, *aggregate_names]
print('sample:', col[:5])
print('No.of columns :', len(col))

In [None]:
df_features.columns = col
df_features.head()

In [None]:
df_features['timestamp'].value_counts().sort_index().plot()
plt.title('No.of transactions in different time stamp')

Let's split the transactions by class.

In [None]:
# Left-join the class labels onto the feature table (every feature row is kept),
# then drop the duplicate key column contributed by the labels table.
df_merge = (
    df_features
    .merge(df_classes, left_on='txid', right_on='txId', how='left')
    .drop(columns=['txId'])
)
df_merge.head()

In [None]:
# Number of transactions per (time step, class) pair, flattened into a table
# with a 'count' column for plotting.
per_step_class = df_merge.groupby(['timestamp', 'class'])['txid'].count()
df_group = per_step_class.reset_index().rename(columns={'txid': 'count'})
df_group.head()

In [None]:
plt.figure(figsize = (8,6))
sns.lineplot(data = df_group , x = 'timestamp',y ='count',hue = 'class')
plt.title('No.of transactions in different time stamp by class')
plt.legend(loc = (1,0.85))
plt.show()

In [None]:
df_merge_missing = df_merge.isna().sum().reset_index().rename(columns = {0:'no.of missing values'})
df_merge_missing.head()

In [None]:
df_merge_missing[df_merge_missing['no.of missing values']!=0].count()

There are no missing values in the merged data.

In [None]:
# Ids of the transactions in time step 20 whose class label is '1'.
in_step_20 = df_merge['timestamp'] == 20
is_class_1 = df_merge['class'] == '1'
ilicit_ids = df_merge.loc[in_step_20 & is_class_1, 'txid']

# Keep only the edges whose source node is one of those transactions.
starts_at_ilicit = df_edgelist['txId1'].isin(ilicit_ids)
ilicit_edges = df_edgelist.loc[starts_at_ilicit]

# Build the graph from the filtered edge list and draw it with a spring layout.
graph = nx.from_pandas_edgelist(ilicit_edges, source='txId1', target='txId2')
pos = nx.spring_layout(graph)
nx.draw_networkx(graph, pos=pos, with_labels=True)

In [None]:
df_merge['class'].unique()

In [None]:
df_merge['class'].value_counts().plot(kind = 'bar', title = 'class feature')

In [None]:
# Share of each class as a percentage of all rows.
# The counts are computed once (previously value_counts() was re-evaluated on
# every iteration) and the ratio is scaled by 100 so the printed number matches
# its "percentage" label (previously the raw fraction was printed).
class_counts = df_merge['class'].value_counts()
n_rows = df_merge.shape[0]
for cal in df_merge['class'].unique():
    print(cal, 'percentage :%.2f' % (100 * class_counts[cal] / n_rows))

Here we can observe that 77% of the data is unlabelled, and the other 23% is labelled either class 1 (illicit) or class 2 (licit). So we first work with the labelled data only.

In [None]:
df_save_labled = df_merge[df_merge['class']!="unknown"]
df_save_labled.head()

In [None]:
df_save_labled.shape

In [None]:
df_save_labled.to_csv("labled_data.csv",index = False)

In [None]:
# Convert the string labels to integers: 'unknown' -> 2, '2' -> 0, '1' -> 1.
# The explicit astype(int) guarantees an integer column instead of relying on
# replace() to downcast the object column implicitly (newer pandas versions
# deprecate that implicit downcast). Re-running the cell is still a no-op.
df_merge['class'] = df_merge['class'].replace({'unknown':2,'2':0,'1':1}).astype(int)
df_merge.head()

In [None]:
#labled data
df_labled = df_merge[df_merge['class']!=2]
df_labled.head()

In [None]:
df_labled.shape

In [None]:
df_labled['class'].unique()

In [None]:
df_labled['class'].value_counts().plot(kind = 'bar',title = 'labled imbalance data')

In [None]:
df_labled['class'].value_counts()

In [None]:
X = df_labled.drop(['txid','class'],axis =1)
X.head()

In [None]:
y = df_labled['class']
y.head()

#### Create functions for model training and evaluation

In [None]:
def evaluate_clf(y_true,y_predicted):
    '''
    Score a set of predicted labels against the true labels.

    y_true: true labels
    y_predicted: predicted labels
    Return: tuple of (accuracy, F1-score, precision, recall, ROC-AUC score),
            in that order
    '''
    # Scorers listed in the order their results are returned.
    scorers = (accuracy_score, f1_score, precision_score, recall_score, roc_auc_score)
    return tuple(score(y_true, y_predicted) for score in scorers)

In [None]:
def total_cost(y_true,y_predicted, fp_cost=10, fn_cost=500):
    '''
    Compute the total misclassification cost of a set of binary predictions.

    y_true: true labels, encoded as 0 (negative) / 1 (positive)
    y_predicted: predicted labels, same encoding
    fp_cost: cost charged per false positive (default 10)
    fn_cost: cost charged per false negative (default 500)
    Returns: fp_cost * (number of false positives)
             + fn_cost * (number of false negatives)
    '''
    # Pin the label set so the matrix is always 2x2. Without it, input in which
    # only one class occurs yields a 1x1 matrix and the four-way unpacking fails.
    _, fp, fn, _ = confusion_matrix(y_true, y_predicted, labels=[0, 1]).ravel()
    return fp_cost * fp + fn_cost * fn

In [None]:
def _print_clf_report(header, scores, cost):
    '''
    Print one metrics block in the report layout used by evaluate_models.

    header: title line printed first
    scores: (accuracy, f1, precision, recall, roc_auc) as returned by evaluate_clf
    cost: total misclassification cost for the same predictions
    '''
    accuracy, f1, precision, recall, roc_auc = scores
    print(header)
    print('- Accuracy: {:.4f}' .format(accuracy))
    print('- F1 score: {:.4f}' .format(f1))
    print('- Precision: {:.4f}' .format(precision))
    print('- Recall: {:.4f}' .format(recall))
    print('- Roc Auc score: {:.4f}' .format(roc_auc))
    print(f'- COST: {cost}.')


def evaluate_models(X,y, models, resampler=None):
    '''
    Train and score every model in `models` on one shared train/test split.

    X: feature table
    y: target labels
    models: dictionary mapping a display name to an unfitted estimator that
            exposes fit() and predict()
    resampler: optional object exposing fit_resample(X, y). When given, it is
            applied to the training split only, so the test split contains no
            resampled rows. The default (None) uses the data exactly as passed.

    The data is split 80/20 with a fixed seed. For each model the train and test
    metrics and costs are printed.

    Returns: DataFrame with columns 'Model Name' and 'Cost' (test-set cost),
             sorted by ascending cost.

    NOTE: if X and y were already resampled before this call, the test split
    also contains resampled rows, so the printed test metrics describe the
    resampled data rather than the original class distribution.
    '''
    # split the data into train and test
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)

    # Rebalance after the split so that nothing derived from test rows is trained on.
    if resampler is not None:
        X_train, y_train = resampler.fit_resample(X_train, y_train)

    models_list = []
    cost_list = []

    for name, model in models.items():
        model.fit(X_train,y_train) # train the model

        # make predictions
        y_train_pred = model.predict(X_train)
        y_test_pred  = model.predict(X_test)

        # training set performance
        train_scores = evaluate_clf(y_train,y_train_pred)
        train_cost = total_cost(y_train,y_train_pred)

        # test set performance
        test_scores = evaluate_clf(y_test,y_test_pred)
        test_cost = total_cost(y_test,y_test_pred)

        print(name)
        models_list.append(name)

        _print_clf_report('Model Performance for trainig test', train_scores, train_cost)

        print('-----------------------------------------')

        _print_clf_report('Model Performance for test test', test_scores, test_cost)
        cost_list.append(test_cost)  # models are ranked on test-set cost only
        print('='*35)
        print('\n')

    report = pd.DataFrame(list(zip(models_list,cost_list)),columns = ['Model Name', 'Cost']).sort_values(by = ['Cost'])

    return report

#### Handling imbalanced data

In [None]:
# Rebalance the labelled data with SMOTETomek, resampling the minority class.
# The strategy can be changed as required.
# NOTE(review): all of X / y is resampled here, before any train/test split, so the
# split made later inside evaluate_models() also contains resampled rows - confirm
# this is intended.
smt = SMOTETomek(random_state = 42,sampling_strategy = 'minority', n_jobs = -1)
# Fit the resampler and generate the resampled features and labels
X_res,y_res = smt.fit_resample(X,y)

### Initialize default models in a dictionary

In [None]:
# Dictionary which contains the models for the experiment.
# Estimators with a random component get the same seed (42) that the train/test
# split and the resampler use, so the reported costs are reproducible across runs.
# LogisticRegression and KNeighborsClassifier are left at their defaults.
models = {
    'Random Forest': RandomForestClassifier(random_state = 42),
    'Decision Tree': DecisionTreeClassifier(random_state = 42),
    'Gradient Boosting': GradientBoostingClassifier(random_state = 42),
    'Logistice Regression': LogisticRegression(),
    'K-Neighbors Classifier': KNeighborsClassifier(),
    'XGBoosting Classifier': XGBClassifier(random_state = 42),
    'CatBoosting Classifier': CatBoostClassifier(verbose = False, random_seed = 42),
    'AdaBoost Classifier': AdaBoostClassifier(random_state = 42)
}

In [None]:
## Training all models
report = evaluate_models(X_res,y_res,models)

In [None]:
report

Based on the performance report, the XGBoost classifier performs better on the test set than the K-Neighbors classifier, with higher accuracy, F1 score, precision, recall, and ROC AUC score.
But the XGBoost classifier has a much higher cost (25160) than the K-Neighbors classifier (11640).
However, for our use case, we achieve the minimum False Positive Rate with the XGBoost classifier.
So, the final best model is <b>XGBoost Classifier</b>

### Fitting the final model and getting reports

In [None]:
# Fresh, unfitted XGBoost classifier with default hyper-parameters.
final_model = XGBClassifier()

# Resampling the minority class.
# This rebinds smt, X_res and y_res; the settings match the earlier resampling
# cell except for n_jobs (1 here, -1 there).
smt = SMOTETomek(random_state = 42, sampling_strategy = 'minority',n_jobs=1)
X_res, y_res = smt.fit_resample(X,y)

In [None]:
X_res.shape,y_res.shape

In [None]:
# 80/20 split of the resampled data with a fixed seed.
# NOTE(review): the split is taken after resampling, so X_test / y_test also
# contain resampled rows - confirm this is the intended evaluation set.
X_train,X_test,y_train,y_test = train_test_split(X_res,y_res, test_size = 0.2,random_state = 42)

# Fit on the training split and predict the held-out split.
final_model = final_model.fit(X_train,y_train)
y_pred = final_model.predict(X_test)

In [None]:
# accuracy_score takes (y_true, y_pred); the arguments were previously swapped.
# Accuracy is symmetric, so the printed value does not change.
print('Final XGBoost Classifier Accuracy Score (Train) :', final_model.score(X_train,y_train))
print('Final XGBoost Classifier Accuracy Score (Test) :', accuracy_score(y_test,y_pred))

In [None]:
# Plot the confusion matrix of the final model on the test split.
# The class labels come from final_model: the name `model` only exists as a
# local variable inside evaluate_models(), so it is undefined at notebook scope.
cm = confusion_matrix(y_test, y_pred, labels=final_model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                               display_labels=final_model.classes_)
disp.plot()
plt.show()

<b>The best Model is XGBoost Classifier with 99.6% accuracy and cost 25160.</b>

In [None]:
# Persist the fitted final model. `model` is not defined at notebook scope (it is
# a local variable of evaluate_models), so the object to save is final_model.
with open('model.pkl', 'wb') as f:
    pickle.dump(final_model,f)

In [None]:
# Reload the saved model and predict the test split again as a round-trip check.
# The context manager closes the file handle, which the bare open() call leaked.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open('model.pkl', 'rb') as f:
    pickled_model = pickle.load(f)
y_pred = pickled_model.predict(X_test.values)

In [None]:
# accuracy_score takes (y_true, y_pred); accuracy is symmetric, so the value is unchanged.
print('Test the model after saving and get the accuracy of : {:.2f}%'.format(accuracy_score(y_test,y_pred)*100))