In [None]:
import pandas as pd
import networkx as nx
from rapidfuzz import fuzz
import matplotlib.pyplot as plt
from pyvis.network import Network

# --- Step 1: Read CSV file ---
# Load the party records into a DataFrame. Later cells read columns such as
# party_id, party_name, city and country from `df`, so the CSV must provide them.
# NOTE(review): the path is relative to the kernel's working directory — confirm
# the file is present there before running.
df = pd.read_csv('mdm_company_dataset_100.csv')

In [None]:
# --- Step 2: Define similarity scoring function ---
def _comparable_text(value):
    """Return `value` as trimmed, case-folded text, or "" when it is missing."""
    # Missing cells arrive as None/NaN; str() would turn NaN into the literal
    # text "nan", which then matches every other missing cell perfectly.
    if value is None or (not isinstance(value, str) and pd.isna(value)):
        return ""
    return str(value).strip().casefold()


def _field_score(scorer, left, right):
    """Score two raw field values with `scorer`; a missing side scores 0."""
    left_text = _comparable_text(left)
    right_text = _comparable_text(right)
    if not left_text or not right_text:
        # No evidence either way, so the field contributes nothing.
        return 0.0
    return scorer(left_text, right_text)


def similarity_score(rec1, rec2, name_weight=0.8, city_weight=0.1, country_weight=0.1):
    """Weighted fuzzy similarity between two party records.

    Parameters
    ----------
    rec1, rec2 : mapping-like
        Records indexable by "party_name", "city" and "country"
        (e.g. a DataFrame row or a plain dict).
    name_weight, city_weight, country_weight : float
        Weights applied to the name, city and country scores. The defaults
        (0.8 / 0.1 / 0.1) are the weights this notebook has always used.

    Returns
    -------
    float
        name_score * name_weight + city_score * city_weight
        + country_score * country_weight.

    Notes
    -----
    Values are compared case-insensitively and with surrounding whitespace
    removed, so "ACME CORP" and "Acme Corp" count as the same text. A field that
    is missing (None/NaN/blank) on either side scores 0 for that field.
    """
    name_score = _field_score(fuzz.token_sort_ratio, rec1["party_name"], rec2["party_name"])
    city_score = _field_score(fuzz.ratio, rec1["city"], rec2["city"])
    country_score = _field_score(fuzz.ratio, rec1["country"], rec2["country"])

    return name_score * name_weight + city_score * city_weight + country_score * country_weight

In [None]:
# --- Step 3: Build the similarity graph ---
# Start from an empty undirected graph. Later cells add one node per party_id
# and one weighted edge for each pair of records whose score reaches the threshold.
G = nx.Graph()

In [None]:
# --- Add full metadata as node attributes ---
# Columns copied verbatim from each record onto its graph node, in this order.
node_attribute_columns = (
    "party_name",
    "status",
    "address_type",
    "address_line1",
    "address_line2",
    "postcode",
    "city",
    "country",
    "businessid_type",
    "business_id_value",
    "email",
    "role",
    "phone",
    "create_date",
    "update_date",
)

# One node per record, keyed by party_id and carrying the columns above.
for _, row in df.iterrows():
    metadata = {column: row[column] for column in node_attribute_columns}
    G.add_node(row["party_id"], **metadata)

In [None]:
# --- Compute pairwise similarities ---
# Minimum similarity_score value for two records to be linked by an edge.
threshold = 81

# Drop edges left over from an earlier run of this cell, so re-running it with a
# different threshold cannot leave stale links in the graph.
G.remove_edges_from(list(G.edges()))

# Materialise every row once. Indexing df.iloc[...] inside the nested loop would
# rebuild the same row object again for every pair it takes part in.
records = df.to_dict("records")
for i, left in enumerate(records):
    # Only pairs (i, j) with j > i: each unordered pair is scored exactly once.
    for right in records[i + 1:]:
        score = similarity_score(left, right)
        if score >= threshold:
            G.add_edge(left["party_id"], right["party_id"], weight=score)

# --- Find clusters ---
# Each connected component groups records that are linked directly or through
# a chain of other records.
clusters = list(nx.connected_components(G))
print(clusters)

[{'P001'}, {'P002'}, {'P003'}, {'P004', 'P005', 'P006'}, {'P008', 'P009', 'P007'}, {'P011', 'P010'}, {'P012', 'P013'}, {'P014', 'P015'}, {'P017', 'P016'}, {'P018', 'P019'}, {'P020', 'P021'}, {'P023', 'P022'}, {'P025', 'P024'}, {'P026', 'P027'}, {'P029', 'P028'}, {'P030', 'P031'}, {'P032', 'P033'}, {'P034', 'P035'}, {'P036', 'P037'}, {'P039', 'P038'}, {'P041', 'P040'}, {'P042', 'P043'}, {'P044', 'P045'}, {'P046', 'P047'}, {'P048', 'P049'}, {'P051', 'P050'}, {'P053', 'P052'}, {'P054', 'P055'}, {'P057', 'P056'}, {'P059', 'P058'}, {'P061', 'P060'}, {'P062', 'P063'}, {'P064', 'P065'}, {'P067', 'P066'}, {'P068'}, {'P069'}, {'P071', 'P070'}, {'P073', 'P072'}, {'P074'}, {'P075'}, {'P076', 'P077'}, {'P078', 'P079'}, {'P081', 'P080'}, {'P083', 'P082'}, {'P085', 'P084'}, {'P086', 'P087'}, {'P088', 'P089'}, {'P091', 'P090'}, {'P093', 'P092'}, {'P095', 'P094'}, {'P097', 'P096'}, {'P098', 'P099'}, {'P100'}]


In [None]:
# --- Print metadata for each cluster ---
for i, cluster in enumerate(clusters, 1):
    print(f"\n🔷 Cluster {i}:")
    # A cluster is a set, whose iteration order is arbitrary; sort the ids so the
    # listing is identical from run to run. key=str keeps mixed id types sortable.
    for node in sorted(cluster, key=str):
        attrs = G.nodes[node]
        print(f" - {attrs['party_name']} ({attrs['city']}, {attrs['country']}) | VAT: {attrs['business_id_value']} | Role: {attrs['role']}")




🔷 Cluster 1:
 - Acme Corporation Ltd (New York, USA) | VAT: 12-3456789 | Role: Lessee

🔷 Cluster 2:
 - ACME CORPORATION LIMITED (New York, USA) | VAT: 123456789 | Role: Lessee

🔷 Cluster 3:
 - Acme Corp (New York, USA) | VAT: 12-3456789 | Role: Lessor

🔷 Cluster 4:
 - Global Tech Industries Inc (San Francisco, USA) | VAT: 98-7654321 | Role: Lessee
 - Global Tech Industries (San Francisco, USA) | VAT: 987654321 | Role: Lessee
 - GlobalTech Industries Inc. (San Francisco, USA) | VAT: 98-7654321 | Role: Supplier

🔷 Cluster 5:
 - Blue Ocean Logistics LLC (Los Angeles, USA) | VAT: 456789012 | Role: Lessee
 - Bleu Ocean Logistics (Los Angeles, USA) | VAT: 45-6789012 | Role: Lessor
 - Blue Ocean Logistics (Los Angeles, USA) | VAT: 45-6789012 | Role: Lessee

🔷 Cluster 6:
 - TechStart Solutions Inc (Boston, USA) | VAT: 112233445 | Role: Lessee
 - TechStart Solutions (Boston, USA) | VAT: 11-2233445 | Role: Lessee

🔷 Cluster 7:
 - Sunrise Manufacturing Co (Chicago, USA) | VA

In [None]:
# --- Create interactive Pyvis network ---
import re  # NOTE(review): not used by this cell; kept in case another cell relies on the name

net = Network(
    notebook=True,
    cdn_resources="remote",
    height="700px",
    width="100%",
    bgcolor="#222222",
    font_color="cyan",
    select_menu=True,
    filter_menu=True,
)


def _display_text(value):
    """Return `value` as display text, or "" when it is missing (None/NaN)."""
    if value is None or (not isinstance(value, str) and pd.isna(value)):
        return ""
    return str(value)


def _node_tooltip(data):
    """Build the plain-text hover tooltip for one node from its attribute dict.

    The text is assembled directly, one field per line, rather than by stripping
    tags out of an HTML template: a tag-stripping regex would also delete any
    "<...>" that appears inside a field value and would leave the template's
    indentation and blank lines in the result.
    """
    labelled_fields = (
        ("Role", "role"),
        ("VAT", "business_id_value"),
        ("City", "city"),
        ("Country", "country"),
        ("Email", "email"),
        ("Phone", "phone"),
    )
    lines = [_display_text(data.get("party_name"))]
    lines.extend(f"{label}: {_display_text(data.get(key))}" for label, key in labelled_fields)
    return "\n".join(lines)


# Load nodes with metadata into pyvis
for node, data in G.nodes(data=True):
    # Fall back to the node id when the record has no usable name.
    party_name = _display_text(data.get("party_name")) or str(node)
    net.add_node(node, label=party_name, title=_node_tooltip(data), group=data.get('role'))

# Add edges with similarity weights
for u, v, d in G.edges(data=True):
    net.add_edge(u, v, value=d["weight"], title=f"Similarity: {d['weight']:.1f}")

net.show_buttons(filter_=['physics'])

net.write_html("party_similarity_network.html")


In [None]:
# Offer the generated HTML file as a browser download when running in Google Colab.
# Outside Colab the `google.colab` package cannot be imported, so point at the
# file instead of failing the cell (which would break "Run All" locally).
try:
    from google.colab import files
except ImportError:
    print("google.colab is unavailable; open party_similarity_network.html from the working directory.")
else:
    files.download("party_similarity_network.html")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>