In [5]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from itertools import combinations
import numpy as np
from node2vec import Node2Vec
from community import community_louvain
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.metrics import precision_score, recall_score, f1_score
from collections import Counter

In [6]:
inventories = pd.read_csv('inventories.csv')
inventory_parts = pd.read_csv('inventory_parts.csv')
inventory_sets = pd.read_csv('inventory_sets.csv')
part_categories = pd.read_csv('part_categories.csv')
parts = pd.read_csv('parts.csv')
sets = pd.read_csv('sets.csv')
themes = pd.read_csv('themes.csv')
colors = pd.read_csv('colors.csv')

In [7]:
# Step 1: Data Preprocessing
# Merge datasets to get set metadata
sets_themes = sets.merge(themes, left_on='theme_id', right_on='id', suffixes=('_set', '_theme'))
sets_themes = sets_themes.rename(columns={'id': 'theme_id_actual'})

# Link inventories to sets (use latest version)
inventories = inventories.sort_values('version').drop_duplicates('set_num', keep='last')
sets_inventories = sets_themes.merge(inventories, on='set_num', how='left')

# Merge with inventory_parts to get part details
set_parts = sets_inventories.merge(inventory_parts, left_on='id', right_on='inventory_id', how='left')
set_parts = set_parts.merge(parts, on='part_num', how='left')
set_parts = set_parts.merge(part_categories, left_on='part_cat_id', right_on='id', suffixes=('_part', '_cat'))
set_parts = set_parts.merge(colors, left_on='color_id', right_on='id', suffixes=('_part', '_color'))

# Filter out sets with no part data
valid_sets = set_parts[set_parts['part_num'].notnull()]['set_num'].unique()
set_parts = set_parts[set_parts['set_num'].isin(valid_sets)]
sets_inventories = sets_inventories[sets_inventories['set_num'].isin(valid_sets)]

In [11]:
# Step 2: Build Global Part Graph
def build_part_graph(set_parts):
    G = nx.Graph()
    unique_parts = set_parts['part_num'].unique()
    for part in unique_parts:
        G.add_node(part)
    
    set_groups = set_parts.groupby('set_num')['part_num'].apply(list)
    for parts in set_groups:
        for part1, part2 in combinations(parts, 2):
            if G.has_edge(part1, part2):
                G[part1][part2]['weight'] += 1
            else:
                G.add_edge(part1, part2, weight=1)
    
    return G

In [12]:
part_graph = build_part_graph(set_parts)
print(f"\nGlobal Part Graph: {part_graph.number_of_nodes()} nodes, {part_graph.number_of_edges()} edges")



Global Part Graph: 23113 nodes, 3821261 edges


In [13]:
# Step 3: Global Part Graph Community Detection
def get_node2vec_embeddings(graph, dimensions=32, walk_length=30, num_walks=200, p=1, q=1):
    if graph.number_of_nodes() == 0:
        return {}
    try:
        node2vec = Node2Vec(
            graph,
            dimensions=dimensions,
            walk_length=walk_length,
            num_walks=num_walks,
            p=p,
            q=q,
            workers=1,
            quiet=True
        )
        model = node2vec.fit(window=10, min_count=1, batch_words=4)
        return {node: model.wv[str(node)] for node in graph.nodes}
    except Exception as e:
        print(f"Error generating embeddings: {e}")
        return {}

In [14]:
# Louvain Community Detection
try:
    partition = community_louvain.best_partition(part_graph, weight='weight')
    part_communities = pd.DataFrame({
        'part_num': list(partition.keys()),
        'community': list(partition.values())
    })
    print("\nLouvain Communities:")
    print(part_communities.head())
except Exception as e:
    print(f"Error in Louvain community detection: {e}")
    part_communities = pd.DataFrame({'part_num': [], 'community': []})


Louvain Communities:
  part_num  community
0    29c01          0
1    3001a          0
2    3002a          0
3     3003          0
4     3004          0


In [15]:
part_communities

Unnamed: 0,part_num,community
0,29c01,0
1,3001a,0
2,3002a,0
3,3003,0
4,3004,0
...,...,...
23108,973pb1122c01,0
23109,satchel4,202
23110,973pb1144c01,0
23111,973pb1145c01,0


In [None]:
# Node2Vec + K-means
embeddings = get_node2vec_embeddings(part_graph)
embedding_matrix = np.array([embeddings[part] for part in part_graph.nodes])
part_nodes = list(part_graph.nodes)
k = 10
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans_labels = kmeans.fit_predict(embedding_matrix)
kmeans_communities = pd.DataFrame({
    'part_num': part_nodes,
    'kmeans_cluster': kmeans_labels
})
print("\nK-means Clusters:")
print(kmeans_communities.head())

In [None]:
# Spectral Clustering
n_clusters = 10
spectral = SpectralClustering(n_clusters=n_clusters, affinity='precomputed', random_state=42)
adj_matrix = nx.to_numpy_array(part_graph, weight='weight')
spectral_labels = spectral.fit_predict(adj_matrix)
spectral_communities = pd.DataFrame({
    'part_num': part_nodes,
    'spectral_cluster': spectral_labels
})
print("\nSpectral Clusters:")
print(spectral_communities.head())

In [None]:
# Step 4: Predict Parts for a Theme
def predict_parts_global(theme_id, set_parts, communities_df, method='louvain'):
    theme_sets = sets[sets['theme_id'] == theme_id]['set_num']
    theme_parts = set_parts[set_parts['set_num'].isin(theme_sets)]['part_num'].unique()
    
    community_col = 'community' if method == 'louvain' else 'kmeans_cluster' if method == 'kmeans' else 'spectral_cluster'
    theme_communities = communities_df[communities_df['part_num'].isin(theme_parts)][community_col]
    
    top_communities = Counter(theme_communities).most_common(3)
    top_community_ids = [comm for comm, _ in top_communities]
    
    predicted_parts = communities_df[communities_df[community_col].isin(top_community_ids)]['part_num'].tolist()
    return predicted_parts

# Step 5: Evaluate Predictions
def evaluate_predictions(theme_id, set_parts, communities_df, method='louvain'):
    theme_sets = sets[sets['theme_id'] == theme_id]['set_num']
    true_parts = set_parts[set_parts['set_num'].isin(theme_sets)]['part_num'].unique()
    predicted_parts = predict_parts_global(theme_id, set_parts, communities_df, method)
    
    all_parts = set_parts['part_num'].unique()
    y_true = [1 if part in true_parts else 0 for part in all_parts]
    y_pred = [1 if part in predicted_parts else 0 for part in all_parts]
    
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    return {'precision': precision, 'recall': recall, 'f1': f1}

# Baseline: Most frequent parts
def baseline_predict(theme_id, set_parts, top_n=100):
    theme_sets = sets[sets['theme_id'] == theme_id]['set_num']
    theme_parts = set_parts[set_parts['set_num'].isin(theme_sets)]
    top_parts = theme_parts['part_num'].value_counts().head(top_n).index.tolist()
    return top_parts

In [None]:
# Step 6: Evaluate Global Approach
theme_id = 1  # Replace with valid theme_id
results = []

# Global Part Graph
global_communities = {
    'louvain': part_communities,
    'kmeans': kmeans_communities,
    'spectral': spectral_communities
}
for method in ['louvain', 'kmeans', 'spectral']:
    if not global_communities[method].empty:
        metrics = evaluate_predictions(theme_id, set_parts, global_communities[method], method)
        results.append({
            'Approach': 'Global',
            'Method': method.capitalize(),
            'Precision': metrics['precision'],
            'Recall': metrics['recall'],
            'F1': metrics['f1']
        })

# Baseline
baseline_parts = baseline_predict(theme_id, set_parts)
y_true = [1 if part in set_parts[set_parts['set_num'].isin(sets[sets['theme_id'] == theme_id]['set_num'])]['part_num'].unique() else 0 for part in set_parts['part_num'].unique()]
y_pred_baseline = [1 if part in baseline_parts else 0 for part in set_parts['part_num'].unique()]
baseline_metrics = {
    'precision': precision_score(y_true, y_pred_baseline, zero_division=0),
    'recall': recall_score(y_true, y_pred_baseline, zero_division=0),
    'f1': f1_score(y_true, y_pred_baseline, zero_division=0)
}
results.append({
    'Approach': 'Baseline',
    'Method': 'Most Frequent',
    'Precision': baseline_metrics['precision'],
    'Recall': baseline_metrics['recall'],
    'F1': baseline_metrics['f1']
})

# Display results
results_df = pd.DataFrame(results)
print("\nGlobal Approach Results:")
print(results_df.to_string(index=False, float_format="{:.3f}".format))