## Similarity Algorithmus
Diese Notebook demonstriert das Ausprobieren der Similarity Algorithmen auf Finanzdaten

### Verbindung zu Neo4j

In [2]:
from neo4j import GraphDatabase

uri = "bolt://localhost:7687" 
username = "neo4j"
password = ""

driver = GraphDatabase.driver(uri, auth=(username, password))


### Ausführen Jaccard Ähnlichkeitsalgoritmus

In [5]:
from neo4j import GraphDatabase

def get_jaccard_similarity(tx):
    query = """
    MATCH (p1:Person)-[:TRANSACTION]->(other)
    WITH p1, collect(id(other)) AS p1Others
    MATCH (p2:Person)-[:TRANSACTION]->(other)
    WHERE id(p1) < id(p2)  // This ensures each pair is only compared once
    WITH p1, p1Others, p2, collect(id(other)) AS p2Others
    RETURN p1.name AS person1, p2.name AS person2, gds.similarity.jaccard(p1Others, p2Others) AS similarity
    ORDER BY similarity DESC, person1, person2
    """
    result = tx.run(query)
    return [{"person1": record["person1"], "person2": record["person2"], "similarity": record["similarity"]} for record in result]


# Example usage:
with driver.session() as session:
    similarities = session.execute_read(get_jaccard_similarity)

import pandas as pd
df_similarities = pd.DataFrame(similarities)
print(df_similarities)
# Zähle die Anzahl der Ähnlichkeiten, die gleich 1 sind
count_1 = df_similarities[df_similarities['similarity'] == 1.0].shape[0]

# Zähle die Anzahl der Ähnlichkeiten, die nicht gleich 1 sind
count_not_1 = df_similarities[df_similarities['similarity'] != 1.0].shape[0]
print("Anzahl der Ähnlichkeiten mit Wert 1:", count_1)
print("Anzahl der Ähnlichkeiten mit Wert ungleich 1:", count_not_1)


              person1            person2  similarity
0     Evelyn Hartmann         John Smith    1.000000
1         Mia Schäfer     Henry Schröder    0.666667
2    Olivia Schneider  Alexander Neumann    0.666667
3            Jane Doe  Alexander Neumann    0.500000
4    Olivia Schneider      Sophia Wagner    0.500000
..                ...                ...         ...
898     William Meyer   Olivia Schneider    0.000000
899     William Meyer      Sophia Wagner    0.000000
900     William Meyer      Sophia Wagner    0.000000
901     William Meyer      Sophia Wagner    0.000000
902     William Meyer      William Meyer    0.000000

[903 rows x 3 columns]
Anzahl der Ähnlichkeiten mit Wert 1: 1
Anzahl der Ähnlichkeiten mit Wert ungleich 1: 902


#### Prozentsatz der Transaktionen unter 300 für jede Person-Personer-Kombination

In [7]:
from neo4j import GraphDatabase
import pandas as pd

uri = "bolt://localhost:7687"
user = "neo4j"
password = ""  

driver = GraphDatabase.driver(uri, auth=(user, password))

def get_transactions_under_300(tx):
    query = """
    MATCH (p:Person)-[t:TRANSACTION]->(other:Person)
    WHERE t.amount < 300
    RETURN p.name AS personName, other.name AS partnerName, COUNT(t) AS transactionsUnder300
    ORDER BY p.name, other.name
    """
    result = tx.run(query)
    return [{"personName": record["personName"], "partnerName": record["partnerName"], "transactionsUnder300": record["transactionsUnder300"]} for record in result]

def get_total_transactions(tx):
    query = """
    MATCH (p:Person)-[t:TRANSACTION]->(other:Person)
    RETURN p.name AS personName, other.name AS partnerName, COUNT(t) AS totalTransactions
    ORDER BY p.name, other.name
    """
    result = tx.run(query)
    return [{"personName": record["personName"], "partnerName": record["partnerName"], "totalTransactions": record["totalTransactions"]} for record in result]

with driver.session() as session:
    transactions_under_300 = session.read_transaction(get_transactions_under_300)
    total_transactions = session.read_transaction(get_total_transactions)


df_under_300 = pd.DataFrame(transactions_under_300)
df_total = pd.DataFrame(total_transactions)


merged_df = pd.merge(df_under_300, df_total, on=['personName', 'partnerName'])


merged_df['percentageUnder300'] = (merged_df['transactionsUnder300'] / merged_df['totalTransactions']) * 100


print(merged_df[['personName', 'partnerName', 'transactionsUnder300', 'totalTransactions', 'percentageUnder300']])


  transactions_under_300 = session.read_transaction(get_transactions_under_300)
  total_transactions = session.read_transaction(get_total_transactions)


     personName    partnerName  transactionsUnder300  totalTransactions  \
0     Ava Weber  William Meyer                    11                 12   
1  James Becker   Liam Schmidt                     9                 10   
2  James Becker  Oliver Schulz                     9                  9   

   percentageUnder300  
0           91.666667  
1           90.000000  
2          100.000000  


In [13]:
analysis_requests = []
for index, row in merged_df.iterrows():
    analysis_request = f"""
    Analyze transactions pattern for {row['personName']} and {row['partnerName']}:
    - Transactions under 300: {row['transactionsUnder300']}
    - Total transactions: {row['totalTransactions']}
    - Percentage under 300: {row['percentageUnder300']:.2f}%
    Does this pattern indicate potential money laundering activities?
    """
    analysis_requests.append(analysis_request.strip())

print(analysis_requests[0])


Analyze transactions pattern for Ava Weber and William Meyer:
    - Transactions under 300: 11
    - Total transactions: 12
    - Percentage under 300: 91.67%
    Does this pattern indicate potential money laundering activities?


In [20]:
def run_query(query):
    with driver.session() as session:
        result = session.run(query)
        return [record for record in result]

# Beispielabfrage: Finden Sie alle Personen und ihre Transaktionen
query = """
MATCH (p:Person)-[t:TRANSACTION]->()
WITH p, AVG(t.amount) AS avgAmount
WHERE avgAmount > 10000  // 
RETURN p.name AS personName, avgAmount
ORDER BY avgAmount DESC
"""
results = run_query(query)

for record in results:
    print(record)



<Record personName='Noah Fischer' avgAmount=1671432.3333333335>
<Record personName='Isabella Hoffmann' avgAmount=852435.8333333334>
<Record personName='Olivia Schneider' avgAmount=334131.6666666666>
<Record personName='Mia Schäfer' avgAmount=55816.5>
<Record personName='Noah Fischer' avgAmount=52930.0>
<Record personName='Max Mustermann' avgAmount=50000.0>
<Record personName='John Smith' avgAmount=30000.0>
<Record personName='Oliver Schulz' avgAmount=21789.0>


In [22]:
from py2neo import Graph
graph = Graph("bolt://localhost:7687", auth=("neo4j", ""))

jaccard_query = """
CALL gds.nodeSimilarity.stream('myGraphConnections')
YIELD node1, node2, similarity
RETURN gds.util.asNode(node1).name AS node1, gds.util.asNode(node2).name AS node2, similarity
ORDER BY similarity DESC
"""

jaccard_results = graph.run(jaccard_query).data()

for result in jaccard_results:
    print(result)


{'node1': 'Olivia Schneider', 'node2': 'Noah Fischer', 'similarity': 0.5}
{'node1': 'Noah Fischer', 'node2': 'Olivia Schneider', 'similarity': 0.5}
{'node1': 'Emma Müller', 'node2': 'Harper Krüger', 'similarity': 0.5}
{'node1': 'Ava Weber', 'node2': 'Harper Krüger', 'similarity': 0.5}
{'node1': 'Harper Krüger', 'node2': 'Ava Weber', 'similarity': 0.5}
{'node1': 'Harper Krüger', 'node2': 'Emma Müller', 'similarity': 0.5}
{'node1': 'Charlotte Bauer', 'node2': 'Alexander Neumann', 'similarity': 0.4}
{'node1': 'Alexander Neumann', 'node2': 'Charlotte Bauer', 'similarity': 0.4}
{'node1': 'Alexander Neumann', 'node2': 'Olivia Schneider', 'similarity': 0.4}
{'node1': 'Emma Müller', 'node2': 'Harper Krüger', 'similarity': 0.4}
{'node1': 'William Meyer', 'node2': 'Mia Schäfer', 'similarity': 0.4}
{'node1': 'Benjamin Koch', 'node2': 'Henry Schröder', 'similarity': 0.4}
{'node1': 'Charlotte Bauer', 'node2': 'Alexander Neumann', 'similarity': 0.4}
{'node1': 'Mia Schäfer', 'node2': 'William Meyer',

Ouput Deutung: <br>
Similarity: Der Wert liegt zwischen 0 und 1, wobei 1 eine maximale Ähnlichkeit darstellt <br>

Zwischen 'Olivia Schneider' und 'Noah Fischer' beträgt die Ähnlichkeit 0.5. Das bedeutet, dass die Nachbarschaften dieser beiden Knoten zur Hälfte ähnlich sind.

Zwischen 'Emma Müller' und 'Harper Krüger' beträgt die Ähnlichkeit ebenfalls 0.5. Das bedeutet, dass die Nachbarschaften dieser beiden Knoten zur Hälfte ähnlich sind.

Zwischen 'Charlotte Bauer' und 'Alexander Neumann' beträgt die Ähnlichkeit 0.4. Das bedeutet, dass die Nachbarschaften dieser beiden Knoten zu 40% ähnlich sind.

### Selbsterstelltes Ähnlichkeitsmuster
Prozentsatz der Transaktionen unter 300 für jede Person-Person-Kombination

In [None]:
from neo4j import GraphDatabase
import pandas as pd
import openai

class Neo4jConnection:
    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))
        
    def close(self):
        self.driver.close()

    def run_query(self, query):
        with self.driver.session() as session:
            result = session.run(query)
            return [record.data() for record in result]

class OpenAIWrapper:
    def __init__(self, api_key):
        self.api_key = api_key

    def generate_chat_response(self, prompt):
        openai.api_key = self.api_key
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "system", "content": "Analyze the following transaction patterns for potential money laundering:"},
                      {"role": "user", "content": prompt}]
        )
        return response.choices[0].message['content']

def analyze_transactions(uri, user, password, api_key, person_name=None):
    neo4j_conn = Neo4jConnection(uri, user, password)
    person_filter = f"WHERE p.name = '{person_name}'" if person_name else ""
    query_transactions_under_300 = f"""
        MATCH (p:Person)-[t:TRANSACTION]->(other:Person)
        {person_filter}
        AND t.amount < 300
        RETURN p.name AS personName, other.name AS partnerName, COUNT(t) AS transactionsUnder300
        ORDER BY p.name, other.name
    """
    query_total_transactions = f"""
        MATCH (p:Person)-[t:TRANSACTION]->(other:Person)
        {person_filter}
        RETURN p.name AS personName, other.name AS partnerName, COUNT(t) AS totalTransactions
        ORDER BY p.name, other.name
    """
    transactions_under_300 = neo4j_conn.run_query(query_transactions_under_300)
    total_transactions = neo4j_conn.run_query(query_total_transactions)

    df_under_300 = pd.DataFrame(transactions_under_300)
    df_total = pd.DataFrame(total_transactions)

    merged_df = pd.merge(df_under_300, df_total, on=['personName', 'partnerName'])
    filtered_df = merged_df[merged_df['transactionsUnder300'] >= 6]
    filtered_df['percentageUnder300'] = (filtered_df['transactionsUnder300'] / filtered_df['totalTransactions']) * 100

    openai_wrapper = OpenAIWrapper(api_key)
    insights = []

    for index, row in filtered_df.iterrows():
        analysis_request = f"""
        Analyze transactions pattern for {row['personName']} and {row['partnerName']}:
        - Transactions under 300: {row['transactionsUnder300']}
        - Total transactions: {row['totalTransactions']}
        - Percentage under 300: {row['percentageUnder300']:.2f}%
        Does this pattern indicate potential money laundering activities?
        """
        response = openai_wrapper.generate_chat_response(analysis_request)
        insights.append(response)

    neo4j_conn.close()
    return insights

# Connection GPT und Neo4j
uri = "bolt://localhost:7687"
user = "neo4j"
password = ""
api_key = ''
person_name = "Ava Weber"

results = analyze_transactions(uri, user, password, api_key, person_name)
for result in results:
    print(result)
