In [2]:
import pandas as pd
from graphdatascience import GraphDataScience
from dotenv import load_dotenv
import os

In [3]:
load_dotenv()

True

In [4]:
gds = GraphDataScience(
    os.getenv("NEO4J_URI"),
    auth=(os.getenv("NEO4J_USER"), os.getenv("NEO4J_PASS"))
)
gds.set_database("neo4j")

In [7]:
def clear_graph(g_name):
    if gds.graph.exists(g_name).exists:
        gds.graph.get(g_name).drop()

In [9]:
clear_graph('wcc')

g, _ = gds.graph.project('wcc', ['Client'], {
        "SHARED_IDENTIFIERS":{
            "type": 'SHARED_IDENTIFIERS',
            "orientation": 'UNDIRECTED',
            "properties": {
                "count": {
                    "property": 'count'
                }
            }
        }
    })

df = gds.wcc.write(g, writeProperty='wccId')

gds.run_cypher(
    """
CALL gds.wcc.stream('wcc',
    {
        nodeLabels: ['Client'],
        relationshipTypes: ['SHARED_IDENTIFIERS'],
        consecutiveIds: true
    }
)
YIELD componentId, nodeId
WITH componentId AS cluster, gds.util.asNode(nodeId) AS client
WITH cluster, collect(client.id) AS clients
WITH cluster, clients, size(clients) AS clusterSize WHERE clusterSize > 1
UNWIND clients AS client
MATCH (c:Client) WHERE c.id = client
SET c.firstPartyFraudGroup=cluster;
SET c.fraudRisk = 1;    
    """
)

In [11]:
gds.run_cypher(
    """
    MATCH (c:Client)-[p:PERFORMED]->(t:Transaction)-[]-(d)
    WHERE t.fraud=True
    SET c.fraudRisk = 1
    SET c.fraudMoney = c.fraudMoney + t.amount;
    """
)

In [14]:
clear_graph("centrality")

#project
g, _ = gds.graph.project('centrality', ['Client', 'Email', 'SSN', 'Phone'], ['HAS_EMAIL', 'HAS_SSN', 'HAS_PHONE'])

# card count/degree
gds.degree.mutate(g, nodeLabels=['Client', 'Email'], relationshipTypes=['HAS_EMAIL'], mutateProperty='emailDegree')

# device count/degree
gds.degree.mutate(g, nodeLabels=['Client', 'SSN'], relationshipTypes=['HAS_SSN'], mutateProperty='ssnDegree')

# IP count/degree
gds.degree.mutate(g, nodeLabels=['Client', 'Phone'], relationshipTypes=['HAS_PHONE'], mutateProperty='phoneDegree')

# write node properties
gds.graph.writeNodeProperties(g, ['emailDegree', 'ssnDegree', 'phoneDegree'], ['Client'])

g.drop()

graphName                                                       centrality
database                                                             neo4j
databaseLocation                                                     local
memoryUsage                                                               
sizeInBytes                                                             -1
nodeCount                                                             9134
relationshipCount                                                     7299
configuration            {'relationshipProjection': {'HAS_SSN': {'aggre...
density                                                           0.000087
creationTime                           2024-05-04T08:55:17.934145400+00:00
modificationTime                       2024-05-04T08:55:18.161317100+00:00
schema                   {'graphProperties': {}, 'nodes': {'Phone': {'p...
schemaWithOrientation    {'graphProperties': {}, 'nodes': {'Phone': {'p...
Name: 0, dtype: object

In [101]:
clear_graph("pageRanks")

g, _ = gds.graph.project('pageRanks', ['Client'], {
    "PERFORMED": {
        "type": 'PERFORMED',
        "orientation": 'NATURAL'
    },
    "PERFORMED_REVERSE" : {
        "type": 'PERFORMED',
        "orientation": 'REVERSE'
    }
})

gds.pageRank.write(g, maxIterations=1000, relationshipTypes=['PERFORMED'], writeProperty='TransactionsPageRank')
gds.pageRank.write(g, maxIterations=1000, relationshipTypes=['PERFORMED_REVERSE'], writeProperty='revTransactionsPageRank')

g.drop()


graphName                                                        pageRanks
database                                                             neo4j
databaseLocation                                                     local
memoryUsage                                                               
sizeInBytes                                                             -1
nodeCount                                                             2433
relationshipCount                                                        0
configuration            {'relationshipProjection': {'PERFORMED_REVERSE...
density                                                                0.0
creationTime                           2024-05-07T07:03:24.651469200+00:00
modificationTime                       2024-05-07T07:03:24.880033300+00:00
schema                   {'graphProperties': {}, 'nodes': {'Client': {}...
schemaWithOrientation    {'graphProperties': {}, 'nodes': {'Client': {}...
Name: 0, dtype: object

In [102]:
df = gds.run_cypher("""
MATCH (c:Client)
RETURN c.id AS id, 
    c.wccId AS wccId,
    c.fraudRisk AS fraudRisk, 
    c.fraudMoneyTransfer AS fraudMoneyTransfer, 
    c.emailDegree AS emailDegree, 
    c.ssnDegree AS ssnDegree, 
    c.phoneDegree AS phoneDegree,
    c.TransactionsPageRank AS TransactionsPageRank,
    c.revTransactionsPageRank AS revTransactionsPageRank,
    c.partOfCommunity AS partOfCommunity,
    c.communitySize AS communitySize,
    c.firstPartyFraudScore AS firstPartyFraudScore
""")
df

Unnamed: 0,id,wccId,fraudRisk,fraudMoneyTransfer,emailDegree,ssnDegree,phoneDegree,TransactionsPageRank,revTransactionsPageRank,partOfCommunity,communitySize,firstPartyFraudScore
0,4997933060327094,0,0,0,1.0,1.0,1.0,0.15,0.15,,,0.0
1,4776276949898423,1,0,0,1.0,1.0,1.0,0.15,0.15,,,0.0
2,4858607188760216,2,0,0,1.0,1.0,1.0,0.15,0.15,,,0.0
3,4287186486553145,3,1,1,1.0,1.0,1.0,0.15,0.15,,,0.0
4,4661202154682409,4,0,0,1.0,1.0,1.0,0.15,0.15,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2428,4413385955087620,1770,1,1,1.0,1.0,1.0,0.15,0.15,1767.0,11.0,2.3
2429,4550448544478545,1865,1,1,1.0,1.0,1.0,0.15,0.15,1862.0,12.0,2.2
2430,4114683318919154,334,1,1,1.0,1.0,1.0,0.15,0.15,334.0,11.0,2.6
2431,4172817689754167,2129,1,1,1.0,1.0,1.0,0.15,0.15,2113.0,7.0,2.0


In [103]:
df['partOfCommunity'] = df['partOfCommunity'].fillna(0)
df['communitySize'] = df['communitySize'].fillna(1)
df

Unnamed: 0,id,wccId,fraudRisk,fraudMoneyTransfer,emailDegree,ssnDegree,phoneDegree,TransactionsPageRank,revTransactionsPageRank,partOfCommunity,communitySize,firstPartyFraudScore
0,4997933060327094,0,0,0,1.0,1.0,1.0,0.15,0.15,0.0,1.0,0.0
1,4776276949898423,1,0,0,1.0,1.0,1.0,0.15,0.15,0.0,1.0,0.0
2,4858607188760216,2,0,0,1.0,1.0,1.0,0.15,0.15,0.0,1.0,0.0
3,4287186486553145,3,1,1,1.0,1.0,1.0,0.15,0.15,0.0,1.0,0.0
4,4661202154682409,4,0,0,1.0,1.0,1.0,0.15,0.15,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2428,4413385955087620,1770,1,1,1.0,1.0,1.0,0.15,0.15,1767.0,11.0,2.3
2429,4550448544478545,1865,1,1,1.0,1.0,1.0,0.15,0.15,1862.0,12.0,2.2
2430,4114683318919154,334,1,1,1.0,1.0,1.0,0.15,0.15,334.0,11.0,2.6
2431,4172817689754167,2129,1,1,1.0,1.0,1.0,0.15,0.15,2113.0,7.0,2.0


In [104]:
X = df.drop(columns=['id', 'wccId', 'fraudRisk', 'fraudMoneyTransfer'])
y = df['fraudMoneyTransfer']

In [105]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [106]:
from sklearn.ensemble import RandomForestClassifier
from joblib import dump

clf = RandomForestClassifier(n_estimators=500, random_state=0, max_depth=10, bootstrap=True, class_weight='balanced')
clf.fit(X_train, y_train)
dump(clf, 'fraud_model.joblib')

['fraud_model.joblib']

In [107]:
y_prob = clf.predict_proba(X_test)

In [108]:
print('Accuracy of random forrest classifier on test set: {:.2f}'.format(clf.score(X_test, y_test)))

Accuracy of random forrest classifier on test set: 0.93
