In [16]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from itertools import combinations
import numpy as np
from node2vec import Node2Vec
from community import community_louvain
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.metrics import precision_score, recall_score, f1_score
from collections import Counter

In [17]:
inventories = pd.read_csv('inventories.csv')
inventory_parts = pd.read_csv('inventory_parts.csv')
inventory_sets = pd.read_csv('inventory_sets.csv')
part_categories = pd.read_csv('part_categories.csv')
parts = pd.read_csv('parts.csv')
sets = pd.read_csv('sets.csv')
themes = pd.read_csv('themes.csv')
colors = pd.read_csv('colors.csv')

In [18]:
# Step 1: Data Preprocessing
# Merge datasets to get set metadata
sets_themes = sets.merge(themes, left_on='theme_id', right_on='id', suffixes=('_set', '_theme'))
sets_themes = sets_themes.rename(columns={'id': 'theme_id_actual'})

# Link inventories to sets (use latest version)
inventories = inventories.sort_values('version').drop_duplicates('set_num', keep='last')
sets_inventories = sets_themes.merge(inventories, on='set_num', how='left')

# Merge with inventory_parts to get part details
set_parts = sets_inventories.merge(inventory_parts, left_on='id', right_on='inventory_id', how='left')
set_parts = set_parts.merge(parts, on='part_num', how='left')
set_parts = set_parts.merge(part_categories, left_on='part_cat_id', right_on='id', suffixes=('_part', '_cat'))
set_parts = set_parts.merge(colors, left_on='color_id', right_on='id', suffixes=('_part', '_color'))

# Filter out sets with no part data
valid_sets = set_parts[set_parts['part_num'].notnull()]['set_num'].unique()
set_parts = set_parts[set_parts['set_num'].isin(valid_sets)]
sets_inventories = sets_inventories[sets_inventories['set_num'].isin(valid_sets)]

In [19]:
# Step 2: Build Global Part Graph
def build_part_graph(set_parts):
    G = nx.Graph()
    unique_parts = set_parts['part_num'].unique()
    for part in unique_parts:
        G.add_node(part)
    
    set_groups = set_parts.groupby('set_num')['part_num'].apply(list)
    for parts in set_groups:
        for part1, part2 in combinations(parts, 2):
            if G.has_edge(part1, part2):
                G[part1][part2]['weight'] += 1
            else:
                G.add_edge(part1, part2, weight=1)
    
    return G

In [20]:
part_graph = build_part_graph(set_parts)
print(f"\nGlobal Part Graph: {part_graph.number_of_nodes()} nodes, {part_graph.number_of_edges()} edges")



Global Part Graph: 23113 nodes, 3821261 edges


In [21]:
# Louvain Community Detection
partition = community_louvain.best_partition(part_graph, weight='weight')
part_communities = pd.DataFrame({
    'part_num': list(partition.keys()),
    'community': list(partition.values())
})
print("\nLouvain Communities:")
print(part_communities.head())


Louvain Communities:
  part_num  community
0    29c01          0
1    3001a          0
2    3002a          0
3     3003          0
4     3004          0


In [22]:
part_communities

Unnamed: 0,part_num,community
0,29c01,0
1,3001a,0
2,3002a,0
3,3003,0
4,3004,0
...,...,...
23108,973pb1122c01,0
23109,satchel4,293
23110,973pb1144c01,0
23111,973pb1145c01,0


In [23]:
part_nodes = list(part_graph.nodes)

# Spectral Clustering
n_clusters = 10
spectral = SpectralClustering(n_clusters=n_clusters, affinity='precomputed', random_state=42)
adj_matrix = nx.to_numpy_array(part_graph, weight='weight')
spectral_labels = spectral.fit_predict(adj_matrix)
spectral_communities = pd.DataFrame({
    'part_num': part_nodes,
    'spectral_cluster': spectral_labels
})
print("\nSpectral Clusters:")
print(spectral_communities.head())




Spectral Clusters:
  part_num  spectral_cluster
0    29c01                 0
1    3001a                 0
2    3002a                 0
3     3003                 0
4     3004                 0


In [24]:
# Step 4: Predict Parts for a Theme
def predict_parts_global(theme_id, set_parts, communities_df, method='louvain'):
    theme_sets = sets[sets['theme_id'] == theme_id]['set_num']
    theme_parts = set_parts[set_parts['set_num'].isin(theme_sets)]['part_num'].unique()
    
    community_col = 'community' if method == 'louvain' else 'kmeans_cluster' if method == 'kmeans' else 'spectral_cluster'
    theme_communities = communities_df[communities_df['part_num'].isin(theme_parts)][community_col]
    
    top_communities = Counter(theme_communities).most_common(3)
    top_community_ids = [comm for comm, _ in top_communities]
    
    predicted_parts = communities_df[communities_df[community_col].isin(top_community_ids)]['part_num'].tolist()
    return predicted_parts

# Step 5: Evaluate Predictions
def evaluate_predictions(theme_id, set_parts, communities_df, method='louvain'):
    theme_sets = sets[sets['theme_id'] == theme_id]['set_num']
    true_parts = set_parts[set_parts['set_num'].isin(theme_sets)]['part_num'].unique()
    predicted_parts = predict_parts_global(theme_id, set_parts, communities_df, method)
    
    all_parts = set_parts['part_num'].unique()
    y_true = [1 if part in true_parts else 0 for part in all_parts]
    y_pred = [1 if part in predicted_parts else 0 for part in all_parts]
    
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    return {'precision': precision, 'recall': recall, 'f1': f1}

# Baseline: Most frequent parts
def baseline_predict(theme_id, set_parts, top_n=100):
    theme_sets = sets[sets['theme_id'] == theme_id]['set_num']
    theme_parts = set_parts[set_parts['set_num'].isin(theme_sets)]
    top_parts = theme_parts['part_num'].value_counts().head(top_n).index.tolist()
    return top_parts

In [25]:
def grade(theme_id, global_communities):
    results = []

    # Global Part Graph
    
    for method in ['louvain','spectral']:
        if not global_communities[method].empty:
            metrics = evaluate_predictions(theme_id, set_parts, global_communities[method], method)
            results.append({
                'Approach': 'Global',
                'Method': method.capitalize(),
                'Precision': metrics['precision'],
                'Recall': metrics['recall'],
                'F1': metrics['f1']
            })

    # Baseline
    baseline_parts = baseline_predict(theme_id, set_parts)
    y_true = [1 if part in set_parts[set_parts['set_num'].isin(sets[sets['theme_id'] == theme_id]['set_num'])]['part_num'].unique() else 0 for part in set_parts['part_num'].unique()]
    y_pred_baseline = [1 if part in baseline_parts else 0 for part in set_parts['part_num'].unique()]
    baseline_metrics = {
        'precision': precision_score(y_true, y_pred_baseline, zero_division=0),
        'recall': recall_score(y_true, y_pred_baseline, zero_division=0),
        'f1': f1_score(y_true, y_pred_baseline, zero_division=0)
    }
    results.append({
        'Approach': 'Baseline',
        'Method': 'Most Frequent',
        'Precision': baseline_metrics['precision'],
        'Recall': baseline_metrics['recall'],
        'F1': baseline_metrics['f1']
    })

    # Display results
    results_df = pd.DataFrame(results)
    print("\nGlobal Approach Results:")
    print(results_df.to_string(index=False, float_format="{:.3f}".format))

In [26]:
global_communities = {
        'louvain': part_communities,
        'spectral': spectral_communities
    }

grade(1, global_communities)




Global Approach Results:
Approach        Method  Precision  Recall    F1
  Global       Louvain      0.043   0.999 0.082
  Global      Spectral      0.037   1.000 0.071
Baseline Most Frequent      1.000   0.118 0.212


In [27]:
# Spectral Clustering
n_clusters = 20
spectral = SpectralClustering(n_clusters=n_clusters, affinity='precomputed', random_state=42)
adj_matrix = nx.to_numpy_array(part_graph, weight='weight')
spectral_labels = spectral.fit_predict(adj_matrix)
spectral_communities = pd.DataFrame({
    'part_num': part_nodes,
    'spectral_cluster': spectral_labels
})
print("\nSpectral Clusters:")
print(spectral_communities.head())




Spectral Clusters:
  part_num  spectral_cluster
0    29c01                 0
1    3001a                 0
2    3002a                 0
3     3003                 0
4     3004                 0


In [28]:
partition = community_louvain.best_partition(part_graph, weight='weight', resolution=1.5)
part_communities = pd.DataFrame({
    'part_num': list(partition.keys()),
    'community': list(partition.values())
})
print("\nLouvain Communities:")
print(part_communities.head())


Louvain Communities:
  part_num  community
0    29c01         75
1    3001a         75
2    3002a         75
3     3003         75
4     3004         75


In [29]:
grade(1, global_communities)


Global Approach Results:
Approach        Method  Precision  Recall    F1
  Global       Louvain      0.043   0.999 0.082
  Global      Spectral      0.037   1.000 0.071
Baseline Most Frequent      1.000   0.118 0.212


In [30]:
for i in range(10):
    grade(i, global_communities)



Global Approach Results:
Approach        Method  Precision  Recall    F1
  Global       Louvain      0.000   0.000 0.000
  Global      Spectral      0.000   0.000 0.000
Baseline Most Frequent      0.000   0.000 0.000

Global Approach Results:
Approach        Method  Precision  Recall    F1
  Global       Louvain      0.043   0.999 0.082
  Global      Spectral      0.037   1.000 0.071
Baseline Most Frequent      1.000   0.118 0.212

Global Approach Results:
Approach        Method  Precision  Recall    F1
  Global       Louvain      0.006   1.000 0.013
  Global      Spectral      0.005   1.000 0.011
Baseline Most Frequent      1.000   0.806 0.893

Global Approach Results:
Approach        Method  Precision  Recall    F1
  Global       Louvain      0.012   1.000 0.024
  Global      Spectral      0.011   1.000 0.021
Baseline Most Frequent      1.000   0.412 0.583

Global Approach Results:
Approach        Method  Precision  Recall    F1
  Global       Louvain      0.008   1.000 0.016
  Glob