In [1]:
import pandas as pd
import numpy as np

import networkx as nx
from networkx.algorithms.community import k_clique_communities
import random as rd
rd.seed(42)

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Using sample of size 50,000 nodes
G_rwm1 = nx.read_gml(r'C:\Users\sarah\Documents\UNI\Masters\Study\Term_3\Master Project\Samples\Random Walk Sampling\G_rw1_50.gml')

In [3]:
# Confirm that Graph is Directed
nx.is_directed(G_rwm1)

True

In [4]:
# Display Size of the Graph
print(nx.info(G_rwm1))

DiGraph with 50000 nodes and 105256 edges


Confirmed that the graph meets the criterea of having more edges than nodes and maintained the directed structure.

In [5]:
# List all nodes in the sample
n_rwm1 = list(G_rwm1.nodes)

In [6]:
# Get node properties from data exploration exercise for comparison
node_properties = pd.read_csv(r'C:\Users\sarah\Documents\UNI\Masters\Study\Term_3\Master Project\node_properties_cl.csv')
node_properties[['degree', 'in_degree', 'out_degree', 'eigen_centrality']].describe()

Unnamed: 0,degree,in_degree,out_degree,eigen_centrality
count,4666873.0,4666873.0,4666873.0,4666873.0
mean,4.494298,2.247149,2.247149,3.970074e-05
std,39.35922,19.98412,28.68249,0.0004611943
min,1.0,0.0,0.0,1.1933219999999999e-21
25%,1.0,0.0,0.0,1.1933219999999999e-21
50%,2.0,1.0,1.0,4.8926199999999995e-20
75%,3.0,1.0,2.0,1.517475e-14
max,27917.0,12116.0,25335.0,0.1642653


In [7]:
rwm1_df = nx.to_pandas_edgelist(G_rwm1) 
rwm1_df.head()

Unnamed: 0,source,target,weight
0,14f3dQbBb1gK8h5QzapYoF3vcHCvo8pVCx,1AwMbo4nX1gjkANdVetgGK6JX1wkSvEPdt,-879264200.0
1,14f3dQbBb1gK8h5QzapYoF3vcHCvo8pVCx,1PRNk6tbKMxiNLQuqzQKz6WhWd3Cd7DU3N,19225160.0
2,14f3dQbBb1gK8h5QzapYoF3vcHCvo8pVCx,3KARPzxPz1UJEGQpEoK5RVbRDUVdgxBsNi,14427480.0
3,14f3dQbBb1gK8h5QzapYoF3vcHCvo8pVCx,1PvgBZmK1QXJ5pwtzpLzNHR7yUxGht1W47,14671170.0
4,14f3dQbBb1gK8h5QzapYoF3vcHCvo8pVCx,37JBtdPWhTd3zA77sHN8q4DY15ziBAbkY9,14882360.0


In [8]:
# Number of unique input addresses
rwm1_df['source'].nunique()

32511

In [9]:
# Number of unique output addresses
rwm1_df['target'].nunique()

31539

We observe relative balance between input and output addresses, with slightly more input addresses. In the full dataset there is also relative balance, but there were more output addresses. 

In [10]:
rwm1_df.describe()

Unnamed: 0,weight
count,105256.0
mean,22882430.0
std,418456100.0
min,-2917063000.0
25%,6316.03
50%,100000.0
75%,1487368.0
max,72994880000.0


The weight variable shows that the distribution of transaction amounts remains skewed towards high amounts and that the anomaly of negative transaction amounts was also kept.

In [11]:
# Subset by fraudulent/high-risk nodes
fr_nodes = node_properties[node_properties['fraud_flag'] == 1]
fr_nodes = fr_nodes['address'].tolist()
print('Total number of fraudulent/high-risk nodes : ', len(fr_nodes))

# Check for number of fraudulent/high-risk addresses in the sample
print('Number of fraudulent/high-risk nodes in sample : ', len(set(n_rwm1).intersection(fr_nodes)))

Total number of fraudulent/high-risk nodes :  4563
Number of fraudulent/high-risk nodes in sample :  311


These numbers suggest that random selection of nodes worsens the class imbalance problem, but when considering the counts relative to the size of the network the issue is less pronounced.

Full Network: 0.11% 

Sample: 0.62%

## Node Properties

In [12]:
# Get Properties

address = [node for (node, val) in G_rwm1.degree()]
degree = [val for (node, val) in G_rwm1.degree()]
in_degree = [val for (node, val) in G_rwm1.in_degree()]
out_degree = [val for (node, val) in G_rwm1.out_degree()]
eigen_centrality = nx.eigenvector_centrality(G_rwm1)
eigen_centrality = [eigen_centrality[node] for node in eigen_centrality]

In [13]:
# Add Properties to DataFrame

nodes = pd.DataFrame()
nodes['address'] = address
nodes['degree'] = degree
nodes['in_degree'] = in_degree
nodes['out_degree'] = out_degree
nodes['eigen_centrality'] = eigen_centrality

In [14]:
# Add fraud flag

nodes = nodes.assign(**dict.fromkeys(['fraud_flag'], 0))

for i in fr_nodes:
    nodes.loc[nodes.address == i, 'fraud_flag'] = 1
    
nodes.head()

Unnamed: 0,address,degree,in_degree,out_degree,eigen_centrality,fraud_flag
0,14f3dQbBb1gK8h5QzapYoF3vcHCvo8pVCx,6,0,6,7.421276999999999e-19,0
1,197xacSLLPbxLVMtS2zzJGh9FAovRpdK3Y,5,4,1,4.79123e-07,0
2,1F2xmpBkxhAqWEmHHKVuu3WeSSJBhYMmEu,3,3,0,1.885004e-16,0
3,1MwmRDi8ydHEPmqnxRTaYBmo7rNwmLi7Rm,2,2,0,2.211541e-16,0
4,1AA2hZ2WkTwUdjzeb1v9yMvxjkgaFsvAX,4,2,2,1.7068940000000002e-17,0


In [15]:
# Describe sample properties overall

nodes[['degree', 'in_degree', 'out_degree', 'eigen_centrality']].describe()

Unnamed: 0,degree,in_degree,out_degree,eigen_centrality
count,50000.0,50000.0,50000.0,50000.0
mean,4.21024,2.10512,2.10512,0.0008365418
std,17.146701,6.767184,12.043262,0.004393243
min,1.0,0.0,0.0,7.421276999999999e-19
25%,1.0,0.0,0.0,7.421276999999999e-19
50%,2.0,1.0,1.0,8.905533e-18
75%,4.0,2.0,2.0,1.519878e-15
max,1941.0,575.0,1385.0,0.2331021


In [16]:
# Describe Properties of Fraudulent Nodes

nodes_fraud = nodes[nodes['fraud_flag'] == 1]
nodes_fraud[['degree', 'in_degree', 'out_degree', 'eigen_centrality']].describe()

Unnamed: 0,degree,in_degree,out_degree,eigen_centrality
count,311.0,311.0,311.0,311.0
mean,50.720257,19.855305,30.864952,0.008412585
std,189.984742,57.842681,136.030408,0.02546176
min,1.0,0.0,0.0,7.421276999999999e-19
25%,6.0,2.0,1.0,3.794499e-15
50%,16.0,6.0,5.0,5.530292e-05
75%,43.0,22.0,20.0,0.005208526
max,1941.0,575.0,1385.0,0.2331021


In [17]:
# Describe Properties of Non-Fraudulent Nodes

nodes_licit = nodes[nodes['fraud_flag'] == 0]
nodes_licit[['degree', 'in_degree', 'out_degree', 'eigen_centrality']].describe()

Unnamed: 0,degree,in_degree,out_degree,eigen_centrality
count,49689.0,49689.0,49689.0,49689.0
mean,3.919137,1.994023,1.925114,0.0007891239
std,7.552227,4.819022,5.029082,0.003874943
min,1.0,0.0,0.0,7.421276999999999e-19
25%,1.0,0.0,0.0,7.421276999999999e-19
50%,2.0,1.0,1.0,8.905533e-18
75%,4.0,2.0,2.0,1.519878e-15
max,329.0,178.0,251.0,0.08565284


For all 3 cases the distributions of degree and centrality properties are similar to the overall network. Some notable differences are that standard deviations tend to be lower in the sample. Mean degree properties for fraudulent nodes are also lower than the full network.

## Graph Properties

In [18]:
# Density
nx.density(G_rwm1) # Connectivity remains very low

4.2103242064841296e-05

In [19]:
# Communities - k-Cliques

com_generator_rwm1 = k_clique_communities(G_rwm1.to_undirected(), k=10)
com_rwm1 = next(com_generator_rwm1)
print('The number of communities in the sample : ' + str(len(com_rwm1)))

The number of communities in the sample : 12


The sample maintains a variety of community structures. (Note: Giant Component = 107)