# Data Visualizations

In [1]:
import pandas as pd
import plotly.graph_objects as go
import pandas as pd

In [2]:
# Read the training feature table once; the cells below all work from `df`.
train_features_path = "data/train_features.csv"
df = pd.read_csv(train_features_path)

In [3]:
# Prepare nodes: one Sankey node per distinct category label.
level1_nodes = df['puor_ds_level1_new'].dropna().unique().tolist()
level2_nodes = df['puor_ds_level2_new'].dropna().unique().tolist()
# A label that occurs at both levels must become a single node. Plain list
# concatenation would list it twice, while `node_indices` can only point at one
# of the two copies, leaving the other as an unlinked entry in the node list.
# dict.fromkeys drops repeats and keeps first-seen order (level 1 labels first).
nodes = list(dict.fromkeys(level1_nodes + level2_nodes))
node_indices = {node: idx for idx, node in enumerate(nodes)}

# Create edges: every distinct (level 1, level 2) pair present in the data.
edges = df[['puor_ds_level1_new', 'puor_ds_level2_new']].dropna().drop_duplicates()
sources = [node_indices[parent] for parent in edges['puor_ds_level1_new']]
targets = [node_indices[child] for child in edges['puor_ds_level2_new']]

# Create Sankey diagram. Every link gets the same weight (1) because this
# chart only shows which level 2 categories belong to which level 1 category.
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=nodes,
        color="lightblue"
    ),
    link=dict(
        source=sources,
        target=targets,
        value=[1] * len(sources)
    ))])

fig.update_layout(title_text="Category Hierarchy (Level 1 to Level 2)", font_size=10)
fig.show()


In [5]:
import pandas as pd
import plotly.graph_objects as go

# Count PO lines for every (level 1, level 2, supplier) combination.
category_pair = ['puor_ds_level1_new', 'puor_ds_level2_new']
supplier_counts = (
    df.groupby(category_pair + ['puor_id_lfa1_supplier'])
    .size()
    .reset_index(name='count')
)

# For each category pair keep the row of the supplier with the highest count.
idx = supplier_counts.groupby(category_pair)['count'].idxmax()
top_suppliers = supplier_counts.loc[idx].reset_index(drop=True)

# Edge table with friendlier column names.
edge_data = top_suppliers.rename(
    columns={'count': 'po_lines', 'puor_id_lfa1_supplier': 'top_supplier'}
)

# Node list: all level 1 labels, followed by the level 2 labels that are not
# already present as a level 1 label.
level1_nodes = df['puor_ds_level1_new'].dropna().unique().tolist()
level2_nodes = df['puor_ds_level2_new'].dropna().unique().tolist()
nodes = [
    *level1_nodes,
    *filter(lambda name: name not in level1_nodes, level2_nodes),
]
node_indices = dict(zip(nodes, range(len(nodes))))

# Sankey sources, targets, and values (one entry per edge row).
pair_values = edge_data[category_pair].values
sources = [node_indices[parent] for parent, _ in pair_values]
targets = [node_indices[child] for _, child in pair_values]
values = edge_data['po_lines'].tolist()

# One hover text per link: category pair, PO line count and top supplier.
hover_text = "{} → {}<br>PO lines: {}<br>Top Supplier: {}"
labels = [
    hover_text.format(
        record['puor_ds_level1_new'],
        record['puor_ds_level2_new'],
        record['po_lines'],
        record['top_supplier'],
    )
    for _, record in edge_data.iterrows()
]

# Create Sankey diagram
node_style = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=nodes,
    color="lightblue",
)
link_spec = dict(
    source=sources,
    target=targets,
    value=values,
    hovertemplate=labels,
)
fig = go.Figure(data=[go.Sankey(node=node_style, link=link_spec)])

fig.update_layout(
    title_text="Category Hierarchy (L1 → L2) with PO Count and Top Supplier",
    font_size=10,
)
fig.show()


In [7]:
import pandas as pd
import plotly.graph_objects as go

# Load your data
#  df = pd.read_csv("export_chatgpt.csv")

# PO line count per (level 1, level 2, supplier) combination.
supplier_counts = (
    df.groupby(['puor_ds_level1_new', 'puor_ds_level2_new', 'puor_id_lfa1_supplier'])
    .size()
    .reset_index(name='count')
)

# Row label of the highest-count supplier within each level 1 → level 2 pair.
idx = (
    supplier_counts
    .groupby(['puor_ds_level1_new', 'puor_ds_level2_new'])['count']
    .idxmax()
)
top_suppliers = supplier_counts.loc[idx].reset_index(drop=True)

# Node labels: level 1 first, then level 2 labels not already used at level 1.
level1_nodes = df['puor_ds_level1_new'].dropna().unique().tolist()
level2_nodes = df['puor_ds_level2_new'].dropna().unique().tolist()
nodes = level1_nodes + [
    label for label in level2_nodes if label not in level1_nodes
]
node_indices = {label: position for position, label in enumerate(nodes)}

# Link endpoints, link sizes and the per-link data shown on hover.
sources = [node_indices[parent] for parent in top_suppliers['puor_ds_level1_new']]
targets = [node_indices[child] for child in top_suppliers['puor_ds_level2_new']]
values = top_suppliers['count'].tolist()
hover_columns = ['puor_ds_level1_new', 'puor_ds_level2_new', 'puor_id_lfa1_supplier', 'count']
customdata = top_suppliers[hover_columns].values.tolist()

# Hover text reads its fields from `customdata`, in `hover_columns` order.
link_hover = "<br>".join([
    "From: %{customdata[0]}",
    "To: %{customdata[1]}",
    "Top Supplier: %{customdata[2]}",
    "PO Lines: %{customdata[3]}<extra></extra>",
])

# Create Sankey diagram with custom hover info
sankey_trace = go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=nodes,
        color="lightblue",
    ),
    link=dict(
        source=sources,
        target=targets,
        value=values,
        customdata=customdata,
        hovertemplate=link_hover,
    ),
)
fig = go.Figure(data=[sankey_trace])

fig.update_layout(
    title_text="Category Hierarchy (L1 → L2) with PO Count and Top Supplier",
    font_size=10,
)
fig.show()


In [8]:
# Function to extract top supplier between two levels
def prepare_edges(df: pd.DataFrame, level_from: str, level_to: str,
                  supplier_col: str = 'puor_id_lfa1_supplier') -> pd.DataFrame:
    """Return one edge per (level_from, level_to) pair with its most frequent supplier.

    Rows of ``df`` are counted per (level_from, level_to, supplier) combination.
    For every (level_from, level_to) pair, the combination with the highest
    count is kept; ``idxmax`` picks the first such row when counts tie.
    Rows with a missing value in any of the three columns do not contribute,
    because ``groupby`` drops missing keys by default.

    Parameters
    ----------
    df : DataFrame holding the two level columns and the supplier column.
    level_from : name of the column used as the edge source.
    level_to : name of the column used as the edge target.
    supplier_col : name of the supplier column; defaults to the column this
        notebook uses, so existing three-argument calls behave as before.

    Returns
    -------
    DataFrame with columns ``source_label``, ``target_label``,
    ``top_supplier`` and ``po_lines`` (the winning supplier's row count).
    """
    pair = [level_from, level_to]
    counts = df.groupby(pair + [supplier_col]).size().reset_index(name='count')
    # Index labels (into `counts`) of the highest-count row of each pair.
    winners = counts.groupby(pair)['count'].idxmax()
    return (
        counts.loc[winners]
        .reset_index(drop=True)
        .rename(columns={
            level_from: 'source_label',
            level_to: 'target_label',
            supplier_col: 'top_supplier',
            'count': 'po_lines',
        })
    )

# Prepare all edges for transitions: L1 → L2 → L3 → L4
edges_l1_l2 = prepare_edges(df, 'puor_ds_level1_new', 'puor_ds_level2_new')
edges_l2_l3 = prepare_edges(df, 'puor_ds_level2_new', 'puor_ds_level3_new')
edges_l3_l4 = prepare_edges(df, 'puor_ds_level3_new', 'puor_ds_level4_new')
all_edges = pd.concat([edges_l1_l2, edges_l2_l3, edges_l3_l4], ignore_index=True)

# Build list of unique nodes and map to index.
# The order is spelled out (all source labels, then all target labels, each in
# row order) instead of flattening `.values` with ravel('K'): 'K' follows the
# array's memory layout, so the resulting node order would depend on how pandas
# happens to store the two columns.
nodes = pd.unique(
    pd.concat([all_edges['source_label'], all_edges['target_label']], ignore_index=True)
).tolist()
node_indices = {label: idx for idx, label in enumerate(nodes)}

# Build Sankey structure: one source/target/value/customdata entry per edge row.
sources = [node_indices[label] for label in all_edges['source_label']]
targets = [node_indices[label] for label in all_edges['target_label']]
values = all_edges['po_lines'].tolist()
customdata = all_edges[['source_label', 'target_label', 'top_supplier', 'po_lines']].values.tolist()

# Create Sankey diagram; the hover text reads its fields from `customdata`.
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=nodes,
        color="lightblue"
    ),
    link=dict(
        source=sources,
        target=targets,
        value=values,
        customdata=customdata,
        hovertemplate=(
            "From: %{customdata[0]}<br>" +
            "To: %{customdata[1]}<br>" +
            "Top Supplier: %{customdata[2]}<br>" +
            "PO Lines: %{customdata[3]}<extra></extra>"
        )
    ))])

fig.update_layout(title_text="Category Hierarchy (L1 → L4) with PO Count and Top Supplier", font_size=10)
fig.show()


In [18]:
# Annotate each edge with the number of `df` rows in which its source label
# appears at level 1, 2 or 3. The column keeps the name 'source_level';
# nothing in this cell reads it.
# The count is computed once per distinct label instead of rescanning `df`
# for every edge row.
upper_levels = df[['puor_ds_level1_new', 'puor_ds_level2_new', 'puor_ds_level3_new']]
rows_per_label = {
    label: int(upper_levels.eq(label).any(axis=1).sum())
    for label in all_edges['source_label'].unique()
}
all_edges['source_level'] = all_edges['source_label'].map(rows_per_label)

# Rebuild node list with level order: level 1 labels first, then level 2, ...
ordered_nodes = []
for level in ['puor_ds_level1_new', 'puor_ds_level2_new', 'puor_ds_level3_new', 'puor_ds_level4_new']:
    ordered_nodes.extend(df[level].dropna().unique().tolist())
nodes_ordered_unique = list(dict.fromkeys(ordered_nodes))  # preserve order

# Reindex nodes based on level-order
node_indices_ordered = {label: idx for idx, label in enumerate(nodes_ordered_unique)}
sources = [node_indices_ordered[label] for label in all_edges['source_label']]
targets = [node_indices_ordered[label] for label in all_edges['target_label']]
# Take link sizes and hover data from the same `all_edges` rows as the
# sources/targets, rather than reusing whatever `values` / `customdata` an
# earlier-run cell left in the kernel.
values = all_edges['po_lines'].tolist()
customdata = all_edges[['source_label', 'target_label', 'top_supplier', 'po_lines']].values.tolist()

# Final Sankey chart
fig = go.Figure(data=[go.Sankey(
    arrangement="fixed",  # nodes are stationary in the rendered chart
    node=dict(
        pad=25,
        thickness=30,
        line=dict(color="black", width=0.5),
        label=nodes_ordered_unique,
        color="lightblue"
    ),
    link=dict(
        source=sources,
        target=targets,
        value=values,
        customdata=customdata,
        hovertemplate=(
            "<b>From:</b> %{customdata[0]}<br>" +
            "<b>To:</b> %{customdata[1]}<br>" +
            "<b>Top Supplier:</b> %{customdata[2]}<br>" +
            "<b>PO Lines:</b> %{customdata[3]}<extra></extra>"
        )
    ))])

fig.update_layout(
    title_text="Clean Category Hierarchy (L1 → L4) with Top Supplier",
    font_size=10,
    height=900,
    margin=dict(l=20, r=20, t=60, b=20)
)
fig.show()


In [15]:
# Workflow for the Sankey chart coloured by supplier "discriminative score"
# (the number of distinct category paths a supplier appears in).

# Step 2: Build one 'L1 > L2 > L3 > L4' path string per row, skipping levels
# that are missing, and store it on a copy so `df` itself stays untouched.
category_cols = [
    'puor_ds_level1_new',
    'puor_ds_level2_new',
    'puor_ds_level3_new',
    'puor_ds_level4_new',
]
path_per_row = df[category_cols].apply(
    lambda levels: ' > '.join(levels.dropna()),
    axis=1,
)
df_new = df.copy()
df_new['full_category_path'] = path_per_row

# Step 3: Number of distinct category paths per supplier.
supplier_category_counts = (
    df_new.groupby('puor_id_lfa1_supplier')['full_category_path']
    .nunique()
    .reset_index()
    .rename(columns={
        'puor_id_lfa1_supplier': 'supplier',
        'full_category_path': 'distinct_category_paths',
    })
)

# Step 4: Attach the per-supplier count to every row (left join keeps all rows).
df_highlight = pd.merge(
    df_new,
    supplier_category_counts,
    how='left',
    left_on='puor_id_lfa1_supplier',
    right_on='supplier',
)

# Step 5: Define function to prepare edges with discriminative score
def prepare_colored_edges(df: pd.DataFrame, level_from: str, level_to: str,
                          supplier_col: str = 'puor_id_lfa1_supplier',
                          score_col: str = 'distinct_category_paths') -> pd.DataFrame:
    """Return one edge per (level_from, level_to) pair with its top supplier and score.

    Rows of ``df`` are counted per (level_from, level_to, supplier, score)
    combination. For every (level_from, level_to) pair the combination with the
    highest count is kept; ``idxmax`` picks the first such row when counts tie.
    Rows with a missing value in any grouping column do not contribute, because
    ``groupby`` drops missing keys by default.

    Parameters
    ----------
    df : DataFrame holding the level, supplier and score columns.
    level_from : name of the column used as the edge source.
    level_to : name of the column used as the edge target.
    supplier_col : name of the supplier column.
    score_col : name of the per-row score column carried through to the output.
        Both defaults are the column names this notebook uses, so existing
        three-argument calls behave as before.

    Returns
    -------
    DataFrame with columns ``source_label``, ``target_label``,
    ``top_supplier``, ``discriminative_score`` and ``po_lines``.
    """
    pair = [level_from, level_to]
    counts = (
        df.groupby(pair + [supplier_col, score_col])
        .size()
        .reset_index(name='count')
    )
    # Index labels (into `counts`) of the highest-count row of each pair.
    winners = counts.groupby(pair)['count'].idxmax()
    return (
        counts.loc[winners]
        .reset_index(drop=True)
        .rename(columns={
            level_from: 'source_label',
            level_to: 'target_label',
            supplier_col: 'top_supplier',
            score_col: 'discriminative_score',
            'count': 'po_lines',
        })
    )

# Step 6: Prepare edges
edges_l1_l2 = prepare_colored_edges(df_highlight, 'puor_ds_level1_new', 'puor_ds_level2_new')
edges_l2_l3 = prepare_colored_edges(df_highlight, 'puor_ds_level2_new', 'puor_ds_level3_new')
edges_l3_l4 = prepare_colored_edges(df_highlight, 'puor_ds_level3_new', 'puor_ds_level4_new')
all_edges = pd.concat([edges_l1_l2, edges_l2_l3, edges_l3_l4], ignore_index=True)

# Step 7: Build node index.
# The order is spelled out (all source labels, then all target labels, each in
# row order) instead of flattening `.values` with ravel('K'): 'K' follows the
# array's memory layout, so the resulting node order would depend on how pandas
# happens to store the two columns.
nodes_ordered = pd.unique(
    pd.concat([all_edges['source_label'], all_edges['target_label']], ignore_index=True)
).tolist()
node_indices = {label: idx for idx, label in enumerate(nodes_ordered)}

# Step 8: Prepare Sankey inputs (one entry per edge row)
sources = [node_indices[label] for label in all_edges['source_label']]
targets = [node_indices[label] for label in all_edges['target_label']]
values = all_edges['po_lines'].tolist()
customdata = all_edges[['source_label', 'target_label', 'top_supplier', 'po_lines', 'discriminative_score']].values.tolist()

# Normalize discriminative score for opacity: the score is divided by the
# largest score and inverted, so a lower score gives a more opaque link.
# The alpha channel is 0.2 + 0.8 * (1 - score / max_score).
max_score = all_edges['discriminative_score'].max()
normalized_scores = [1 - (s / max_score) for s in all_edges['discriminative_score']]
link_colors = [f'rgba(0, 100, 200, {0.2 + 0.8 * s})' for s in normalized_scores]

# Step 9: Create and display the Sankey diagram
fig = go.Figure(data=[go.Sankey(
    arrangement="fixed",
    node=dict(
        pad=25,
        thickness=30,
        line=dict(color="black", width=0.5),
        label=nodes_ordered,
        color="lightblue"
    ),
    link=dict(
        source=sources,
        target=targets,
        value=values,
        color=link_colors,
        customdata=customdata,
        hovertemplate=(
            "<b>From:</b> %{customdata[0]}<br>" +
            "<b>To:</b> %{customdata[1]}<br>" +
            "<b>Top Supplier:</b> %{customdata[2]}<br>" +
            "<b>PO Lines:</b> %{customdata[3]}<br>" +
            "<b>Discriminative Score:</b> %{customdata[4]}<extra></extra>"
        )
    ))])

fig.update_layout(
    title_text="Category Hierarchy (L1 → L4) Colored by Supplier Discriminative Score",
    font_size=10,
    height=900,
    margin=dict(l=20, r=20, t=60, b=20)
)
fig.show()
