In [1]:
import pandas as pd
import networkx as nx

In [2]:
# Load the transfer records; account and bank identifiers are read as strings,
# NUM as an integer and AMT as a float.
dataset = pd.read_csv(
    '_SELECT_RMTC_ACC_CD_AS_FROM_ACC_ACC_CD_AS_TO_ACC_RMTC_BNK_ID_cou_202203171620.csv',
    dtype={
        "FROM_ACC": "string",
        "TO_ACC": "string",
        "RMTC_BNK_ID": "string",
        "NUM": int,
        "AMT": float,
    },
)

# Keep only the rows where neither endpoint is '-100' or '-200'
# (presumably placeholder account codes -- TODO confirm with the data owner).
from_acc, to_acc = dataset["FROM_ACC"], dataset["TO_ACC"]
keep_row = (from_acc != '-100') & (from_acc != '-200') & (to_acc != '-100') & (to_acc != '-200')
dataset = dataset.loc[keep_row].reset_index(drop=True)

In [3]:
def calculate_weights(data, acc_from, acc_to, agg_weight, weight = 'AMT'):
    """Aggregate rows into weighted edges ``(from, to, total)``.

    Every distinct ``(acc_from, acc_to)`` pair becomes one edge whose weight is
    the SUM of ``agg_weight`` over all rows sharing that pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with one row per record. It is not modified.
    acc_from, acc_to : str
        Names of the columns holding the two endpoints of each edge.
    agg_weight : str
        Name of the numeric column that is summed per pair.
    weight : str, default 'AMT'
        Weighting scheme selector. Only ``'AMT'`` is implemented.

    Returns
    -------
    list of tuple
        ``(from, to, summed_weight)`` triples, ordered by the sorted group keys.

    Raises
    ------
    ValueError
        If ``weight`` is anything other than ``'AMT'``.
    """
    if weight != 'AMT':
        # Fail with an explicit message instead of an UnboundLocalError on return.
        raise ValueError(
            "Unsupported weight scheme {!r}; only 'AMT' is implemented".format(weight)
        )

    # Group on the two endpoint columns directly: no helper column is written
    # into the caller's frame. dropna=False keeps pairs with a missing endpoint
    # as their own group instead of silently discarding them.
    totals = data.groupby([acc_from, acc_to], dropna=False)[agg_weight].sum()

    # make tuple with weight
    edges_weights = [(src, dst, total) for (src, dst), total in totals.items()]

    return edges_weights

In [4]:
import networkx as nx

def create_network(data, nodes_from, nodes_to, weight = None, prints = False, graph_type = None):
    """Build a networkx graph from a table of edges and optionally report connectivity.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with one row per record.
    nodes_from, nodes_to : str
        Names of the columns holding the two endpoints of each edge.
    weight : str or None, default None
        Column handed to ``calculate_weights`` as the column to aggregate per
        (from, to) pair; the aggregate becomes the edge weight. With ``None``
        the edges are added without a weight.
    prints : bool, default False
        When True, print connectivity properties of the resulting graph.
    graph_type : networkx graph instance, graph class, or None
        Graph to populate. ``None`` creates a fresh ``nx.Graph()`` on every
        call; a class is instantiated; an instance is used (and mutated) as is.

    Returns
    -------
    tuple
        ``(G, nodes, edges)``: the graph, the list of its nodes and the list
        of its edges.
    """
    # A default of `nx.Graph()` in the signature would be created once and then
    # shared by every call, accumulating edges; build the graph per call instead.
    if graph_type is None:
        G = nx.Graph()
    elif isinstance(graph_type, type):
        G = graph_type()
    else:
        G = graph_type

    # create the graph
    if weight is None:
        G.add_edges_from(zip(data[nodes_from], data[nodes_to]))
    else:
        G.add_weighted_edges_from(calculate_weights(data, nodes_from, nodes_to, weight))

    # store the nodes and edges
    nodes, edges = list(G.nodes()), list(G.edges())

    # print the properties of the network
    if prints:
        # networkx implements weak/strong connectivity only for directed graphs
        # and `is_connected` only for undirected ones, so branch on direction.
        if G.is_directed():
            print('The network is{} weakly connected'.format('' if nx.is_weakly_connected(G) else ' not'))
            print('The number of weakly connected components of the network are: {:,d}'.format(nx.number_weakly_connected_components(G)))
            print('The network is{} strongly connected'.format('' if nx.is_strongly_connected(G) else ' not'))
            print('The number of strongly connected components of the network are : {:,d}'.format(nx.number_strongly_connected_components(G)))
        else:
            print('The network is{} connected'.format('' if nx.is_connected(G) else ' not'))

    return (G, nodes, edges)

In [5]:
# Build the directed graph of transfers: one edge per (FROM_ACC, TO_ACC) pair,
# with the edge weight taken from the AMT column.
G, nodes, edges = create_network(
    dataset,
    'FROM_ACC',
    'TO_ACC',
    weight='AMT',
    prints=False,
    graph_type=nx.DiGraph(),
)

In [6]:
# remove cycle
# `nx.simple_cycles` yields each cycle as a list of NODES, not edges, so turn
# every cycle into one real edge (its first hop) before removing it. A
# self-loop is a one-node cycle `[u]`; `1 % len(cycle)` maps it to the edge
# (u, u) instead of failing on a missing second node.
# Collecting into a set of edges (bounded by the number of edges in G) avoids
# holding the full, potentially enormous, list of cycles in memory.
# NOTE: enumerating all simple cycles can still take very long on dense graphs.
cycle_breaking_edges = {
    (cycle[0], cycle[1 % len(cycle)]) for cycle in nx.simple_cycles(G)
}
G.remove_edges_from(cycle_breaking_edges)


#from collections import defaultdict
from itertools import chain

def DFS(G, v, seen = None,path = None):
    """Enumerate every simple path that starts at ``v`` and has at least one edge.

    The traversal is iterative (explicit stack), so very long paths do not hit
    Python's recursion limit, and it tracks the nodes on the current path in a
    set instead of copying a list at every step.

    Parameters
    ----------
    G : mapping
        ``G[node]`` must be iterable and yield the successors of ``node``
        (a networkx graph or a plain dict of lists both work).
    v : hashable
        Start node.
    seen : list or None
        Nodes that must never be visited. ``v`` is appended to it, so a list
        passed by the caller is modified.
    path : list or None
        Prefix placed in front of every returned path; defaults to ``[v]``.

    Returns
    -------
    list of tuple
        The paths in depth-first pre-order, each as a tuple of nodes.
    """
    if seen is None: seen = []
    if path is None: path = [v]

    seen.append(v)

    blocked = set(seen)    # nodes that may not be entered right now
    trail = list(path)     # nodes of the path currently being extended
    paths = []
    stack = [iter(G[v])]   # one successor iterator per node on the trail

    while stack:
        for t in stack[-1]:
            if t in blocked:
                continue
            # Descend into t: record the new path, then explore t's successors
            # before returning to the remaining siblings (pre-order).
            trail.append(t)
            blocked.add(t)
            paths.append(tuple(trail))
            stack.append(iter(G[t]))
            break
        else:
            # Successors of the current node are exhausted: backtrack. The root
            # stays on the trail, so only unwind when a parent is still pending.
            stack.pop()
            if stack:
                blocked.discard(trail.pop())
    return paths

# Run DFS, compute metrics

# Enumerate every simple path starting from every node of G (the graph built
# above; the name `H` is never defined in this notebook).
# NOTE: the number of simple paths can grow exponentially with the size of the
# graph, so this list can exhaust memory on large or dense graphs.
all_paths = list(chain.from_iterable(DFS(G, n) for n in G))

# `default=0` keeps this cell from failing when the graph has no edges at all.
max_len   = max((len(p) for p in all_paths), default=0)
max_paths = [p for p in all_paths if len(p) == max_len]

# First and last account of every longest path. The last node is taken with
# [-1] so this stays correct whatever the longest path length turns out to be.
source = [p[0] for p in max_paths]
target = [p[-1] for p in max_paths]

# These accounts may behave unusually, since they transfer amounts through many other accounts.
longest_path_accounts = list(set(source + target))

MemoryError: 