In [28]:
import pandas as pd
import numpy as np
import ast
from collections import Counter


In [29]:
CLUSTER_RESULTS_FILE = 'ingredient_clusters_hdbscan.csv'
RECIPES_FILE = '../data/clean_data/recipes_cleaned.csv'
OUTPUT_CSV = 'cluster_characterization_automated_hdbscan.csv'
OUTPUT_LATEX = 'cluster_table_hdbscan.tex'

In [30]:
import pandas as pd
clusters_raw = pd.read_csv(CLUSTER_RESULTS_FILE)
print(f"  Cluster file columns: {clusters_raw.columns.tolist()}")
print(f"  First few rows:\n{clusters_raw.head()}")


  Cluster file columns: ['ingredient', 'cluster']
  First few rows:
             ingredient  cluster
0                   NaN       -1
1      a original sauce       -1
2         abalone steak       -1
3              absinthe       -1
4  absolut kurant vodka       39


In [31]:
# Group ingredients by cluster
if 'cluster' in clusters_raw.columns and 'ingredient' in clusters_raw.columns:
    clusters_df = clusters_raw.groupby('cluster')['ingredient'].apply(list).reset_index()
    clusters_df.columns = ['cluster_id', 'ingredient_list']
else:
    raise ValueError("Expected columns 'cluster' and 'ingredient' in cluster results file")

print(f" Loaded {len(clusters_df)} clusters")

 Loaded 42 clusters


In [32]:
import ast
# Load original recipes
print("\n Loading recipe data...")
df_recipes = pd.read_csv(RECIPES_FILE)
# Parse ingredients column (adjust column name if different)
ingredient_col = 'ingredients'  # or 'ingredients_y' based on your file
if ingredient_col not in df_recipes.columns:
    # Try alternative column names
    for col in ['ingredients_y', 'ingredients_x', 'ingredient']:
        if col in df_recipes.columns:
            ingredient_col = col
            break

if isinstance(df_recipes[ingredient_col].iloc[0], str):
    df_recipes[ingredient_col] = df_recipes[ingredient_col].apply(ast.literal_eval)

print(f"Loaded {len(df_recipes)} recipes")


 Loading recipe data...
Loaded 222705 recipes


In [33]:
from collections import Counter

print("\n Computing ingredient frequencies...")
all_ingredients = []
for ing_list in df_recipes[ingredient_col]:
    all_ingredients.extend(ing_list)

ingredient_freq = Counter(all_ingredients)
print(f"  Processed {len(all_ingredients)} total ingredient occurrences")
print(f"  Found {len(ingredient_freq)} unique ingredients")


 Computing ingredient frequencies...
  Processed 2080428 total ingredient occurrences
  Found 14621 unique ingredients


In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
print("\nComputing TF-IDF for distinctive terms...")

# Treat each cluster as a document, filter out non-string items
cluster_docs = [' '.join([ing for ing in ing_list if isinstance(ing, str)]) for ing_list in clusters_df['ingredient_list']]

# Compute TF-IDF
vectorizer = TfidfVectorizer(
    max_features=None,
    min_df=1,
    ngram_range=(1, 2),  # Allow bigrams
    token_pattern=r'[a-zA-Z]+'
)

tfidf_matrix = vectorizer.fit_transform(cluster_docs)
feature_names = vectorizer.get_feature_names_out()

print(f" TF-IDF computed ({len(feature_names)} terms)")


Computing TF-IDF for distinctive terms...
 TF-IDF computed (24267 terms)


In [35]:
import numpy as np

print("\n Characterizing clusters...")

results = []

for i, row in clusters_df.iterrows():
    cluster_id = row['cluster_id']
    ingredients = row['ingredient_list']
    
    # basic stats
    size = len(ingredients)

    # frequency analysis
    freqs = [ingredient_freq.get(ing, 0) for ing in ingredients]
    
    if len(freqs) == 0:
        continue  # Skip empty clusters
        
    avg_freq = np.mean(freqs)
    min_freq = np.min(freqs)
    max_freq = np.max(freqs)
    # Top 5 ingredients by frequency
    ing_with_freq = [(ing, ingredient_freq.get(ing, 0)) for ing in ingredients]
    ing_with_freq.sort(key=lambda x: x[1], reverse=True)
    top_5_by_freq = ', '.join([ing for ing, _ in ing_with_freq[:5]])
    
    # === TF-IDF DISTINCTIVE TERMS ===
    tfidf_scores = tfidf_matrix[i].toarray().flatten()
    top_tfidf_indices = tfidf_scores.argsort()[-5:][::-1]
    distinctive_terms = [
        feature_names[idx] 
        for idx in top_tfidf_indices 
        if tfidf_scores[idx] > 0
    ]
    distinctive_str = ', '.join(distinctive_terms[:5])
    
    results.append({
        'Cluster': cluster_id,
        'Size': size,
        'Distinctive_Terms': distinctive_str,
        'Top_5_Frequent': top_5_by_freq,
        'Avg_Freq': int(avg_freq),
        'Min_Freq': min_freq,
        'Max_Freq': max_freq
    })

results_df = pd.DataFrame(results)
# Sort by cluster size (descending)
results_df = results_df.sort_values('Size', ascending=False).reset_index(drop=True)

print(f"  Characterized {len(results_df)} clusters")


 Characterizing clusters...
  Characterized 42 clusters


In [36]:
print("\nTop 20 clusters by size:\n")

# Display with better formatting
display_df = results_df.head(20).copy()
display_df['Avg_Freq'] = display_df['Avg_Freq'].apply(lambda x: f"{x:,}")
print(display_df.to_string(index=False))

# Save to CSV
results_df.to_csv(OUTPUT_CSV, index=False)

# Save to LaTeX (for report)
# Create a cleaner version for LaTeX
latex_df = results_df.head(20).copy()
latex_df.columns = ['ID', 'Size', 'Distinctive Terms', 'Most Frequent', 'Avg Freq', 'Min', 'Max']
latex_df.to_latex(OUTPUT_LATEX, index=False, escape=False, column_format='lllllll')



Top 20 clusters by size:

 Cluster  Size                                Distinctive_Terms                                                                                     Top_5_Frequent Avg_Freq  Min_Freq  Max_Freq
      -1  8797               cheese, chicken, sauce, mix, dried                                                              salt, butter, sugar, onion, olive oil      137         0     86964
      39   184            liqueur, juice, yogurt, vodka, orange                                                 orange juice, lime juice, banana, orange zest, ice      109         0      4490
      15    90             pasta, pizza, sauce, italian, tomato                    parmesan cheese, mozzarella cheese, ricotta cheese, crushed tomatoes, spaghetti      454         0     15182
      34    62               tortilla, corn, beans, chile, taco                                salsa, monterey jack cheese, jalapeno pepper, jalapeno, frozen corn      279         0      2764
      30    5

In [37]:
print(f"\nTotal clusters analyzed: {len(results_df)}")

print(f"\nCluster size distribution:")
print(f"  Min: {results_df['Size'].min()}")
print(f"  25th percentile: {results_df['Size'].quantile(0.25):.0f}")
print(f"  Median: {results_df['Size'].median():.0f}")
print(f"  75th percentile: {results_df['Size'].quantile(0.75):.0f}")
print(f"  Mean: {results_df['Size'].mean():.1f}")
print(f"  Max: {results_df['Size'].max()}")

print(f"\nAverage frequency distribution:")
print(f"  Min: {results_df['Avg_Freq'].min():,}")
print(f"  Median: {results_df['Avg_Freq'].median():,.0f}")
print(f"  Mean: {results_df['Avg_Freq'].mean():,.1f}")
print(f"  Max: {results_df['Avg_Freq'].max():,}")



Total clusters analyzed: 42

Cluster size distribution:
  Min: 10
  25th percentile: 13
  Median: 17
  75th percentile: 26
  Mean: 234.9
  Max: 8797

Average frequency distribution:
  Min: 0
  Median: 280
  Mean: 451.0
  Max: 2,713


In [38]:

# Largest clusters
print("\n Top 5 LARGEST clusters:")
largest = results_df.nlargest(5, 'Size')[['Cluster', 'Size', 'Distinctive_Terms', 'Top_5_Frequent']]
print(largest.to_string(index=False))

# Most frequent clusters (common ingredients)
print("\n Top 5 clusters with MOST FREQUENT ingredients:")
most_frequent = results_df.nlargest(5, 'Avg_Freq')[['Cluster', 'Size', 'Avg_Freq', 'Top_5_Frequent']]
print(most_frequent.to_string(index=False))

# Smallest clusters (niche/rare)
print("\n Top 5 SMALLEST clusters (niche ingredients):")
smallest = results_df.nsmallest(5, 'Size')[['Cluster', 'Size', 'Avg_Freq', 'Top_5_Frequent']]
print(smallest.to_string(index=False))

# Low-frequency clusters (rare ingredients)
print("\n Top 5 clusters with RAREST ingredients:")
rarest = results_df.nsmallest(5, 'Avg_Freq')[['Cluster', 'Size', 'Avg_Freq', 'Top_5_Frequent']]
print(rarest.to_string(index=False))


 Top 5 LARGEST clusters:
 Cluster  Size                      Distinctive_Terms                                                                  Top_5_Frequent
      -1  8797     cheese, chicken, sauce, mix, dried                                           salt, butter, sugar, onion, olive oil
      39   184  liqueur, juice, yogurt, vodka, orange                              orange juice, lime juice, banana, orange zest, ice
      15    90   pasta, pizza, sauce, italian, tomato parmesan cheese, mozzarella cheese, ricotta cheese, crushed tomatoes, spaghetti
      34    62     tortilla, corn, beans, chile, taco             salsa, monterey jack cheese, jalapeno pepper, jalapeno, frozen corn
      30    53 paste, sauce, noodles, chinese, sesame           sesame oil, rice vinegar, fish sauce, hoisin sauce, rice wine vinegar

 Top 5 clusters with MOST FREQUENT ingredients:
 Cluster  Size  Avg_Freq                                                                                                 

In [39]:
# Compute intra-cluster frequency variance (cohesion indicator)
results_df['Freq_Std'] = 0
for i, row in clusters_df.iterrows():
    ingredients = row['ingredient_list']
    freqs = [ingredient_freq.get(ing, 0) for ing in ingredients]
    if len(freqs) > 1:
        freq_std = np.std(freqs)
        results_df.loc[results_df['Cluster'] == row['cluster_id'], 'Freq_Std'] = freq_std

# Clusters with low frequency variance = more cohesive
print("\n Most COHESIVE clusters (low frequency variance):")
cohesive = results_df.nsmallest(5, 'Freq_Std')[['Cluster', 'Size', 'Avg_Freq', 'Freq_Std', 'Distinctive_Terms']]
print(cohesive.to_string(index=False))



 Most COHESIVE clusters (low frequency variance):
 Cluster  Size  Avg_Freq  Freq_Std                                                     Distinctive_Terms
       2    13         0  0.421325  turkey, torito adobo, turkey slices, lightlife turkey, chunky turkey
      37    11         0  0.445362 cups, cream, shortcake, yogurtcovered pretzels, topping fruitflavored
      32    23         0  0.624030                                          wine, soda, cherry, ice, tea
      24    16         1  1.089725                     caramel, milk chocolate, candy, chocolate, cereal
       8    16         4 10.638961                          chicken, parmesan, mozzarella, potatoes, and


  results_df.loc[results_df['Cluster'] == row['cluster_id'], 'Freq_Std'] = freq_std


In [69]:
# Load the cluster results
clusters_df = pd.read_csv('cluster_characterization_automated_hdbscan.csv')

# Manual interpretation based on results
cluster_labels = {
    -1: "Universal Ingredients (Noise)",
    
    # Excellent Quality Clusters (Clear semantic meaning)
    39: "Cocktails & Fruit Beverages",
    15: "Italian Cuisine (Pasta & Pizza)", 
    30: "Asian Cuisine (Chinese & Thai)",
    34: "Mexican Cuisine (Tex-Mex)",
    22: "Bread Making (Yeast Baking)",
    25: "Healthy Baking (Whole Grains)",
    28: "Box Mixes (Convenience Baking)",
    29: "Indian Cuisine (Spices & Dal)",
    
    # Good Quality Clusters (Meaningful patterns)
    12: "Seafood Salads & Sandwiches",
    13: "BBQ & Ribs (Pork Dishes)",
    40: "Creamy Cocktails & Liqueurs",
    38: "Chocolate Baking (Cocoa)",
    26: "Dessert Add-ins (Chips & Candy)",
    20: "Ground Spices (Baking)",
    31: "Ice Cream & Sweet Syrups",
    
    # Fair Quality Clusters (Detectable patterns)
    10: "Baking Mixes (Bisquick)",
    14: "Fresh Salad Greens",
    5: "Light Baking (Reduced Fat)",
    16: "Beef Dishes (Chuck & Roast)",
    6: "Pie & Pastry Components",
    17: "Tea & Herbal Ingredients",
    24: "Candy & Cereal Treats",
    
    # Poor Quality Clusters (Weak patterns or data issues)
    32: "Novelty Beverages & Wine",
    37: "Miscellaneous Dessert Items",
    2: "Ultra-Rare/Data Quality Issues",
    8: "Prepared/Convenience Foods",
    27: "Novelty/Party Foods (Data Issues)",
    21: "Snacks & Dried Fruits",
    0: "Low-Fat Alternatives & Soups",
    3: "Summer Squash & Vegetables",
    18: "Diet/Reduced Products",
    23: "Soup Ingredients (Split Peas & Barley)",
    36: "Fish & Citrus",
    33: "Flavored Cereals & Schnapps (Data Issues)",
    19: "Specialty Powders & Creams",
    35: "Cream Cheese Desserts",
    7: "Bread & Dressing (Mixed)",
    9: "Bacon & Savory Items (Data Issues)",
    11: "Maple & Cinnamon Baking",
    4: "International Biscuits & Whiskey",
    1: "Specialty Spreads & Meats"
}

clusters_df['Category_Label'] = clusters_df['Cluster'].map(cluster_labels)

# Define interpretability score
def interpretability_score(row):
    if row['Cluster'] == -1:
        return 0  # Noise cluster
    
    # Normalize size
    size_score = min(row['Size'] / 100, 1.0)
    
    # Compute coherence as inverse of frequency spread
    if row['Avg_Freq'] > 0:
        spread = (row['Max_Freq'] - row['Min_Freq']) / (row['Avg_Freq'] + 1)
        coherence_score = 1 / (1 + spread)
    else:
        coherence_score = 0.5  # fallback for missing data
    
    return (size_score + coherence_score) / 2

# Apply and sort
clusters_df['Interpretability_Score'] = clusters_df.apply(interpretability_score, axis=1)
clusters_ranked = clusters_df.sort_values('Interpretability_Score', ascending=False)

# Show results
print("\n=== CLUSTERS RANKED BY INTERPRETABILITY ===")
print(clusters_ranked[['Cluster', 'Category_Label', 'Size', 
                        'Interpretability_Score', 'Distinctive_Terms']].head(15))

# Save to file
clusters_ranked.to_csv('clusters_with_labels_hdbscan.csv', index=False)



=== CLUSTERS RANKED BY INTERPRETABILITY ===
    Cluster                   Category_Label  Size  Interpretability_Score  \
1        39      Cocktails & Fruit Beverages   184                0.511957   
2        15  Italian Cuisine (Pasta & Pizza)    90                0.464549   
13       32         Novelty Beverages & Wine    23                0.365000   
3        34        Mexican Cuisine (Tex-Mex)    62                0.355992   
31        2   Ultra-Rare/Data Quality Issues    13                0.315000   
4        30   Asian Cuisine (Chinese & Thai)    53                0.307454   
35       37      Miscellaneous Dessert Items    11                0.305000   
8        29    Indian Cuisine (Spices & Dal)    32                0.246395   
21       24            Candy & Cereal Treats    16                0.222857   
6        40      Creamy Cocktails & Liqueurs    38                0.219570   
5        12      Seafood Salads & Sandwiches    38                0.209800   
7        13        

In [70]:
# =============================================================================
# STEP 6: Quality Tier Classification
# ============================================================================

print("\n" + "="*80)
print("CLUSTER QUALITY TIER DISTRIBUTION")
print("="*80)

# Classify all non-noise clusters
valid_clusters = clusters_df[clusters_df['Cluster'] != -1].copy()

def classify_quality_tier(row):
    score = row['Interpretability_Score']
    if score > 0.7:
        return "Excellent"
    elif score > 0.5:
        return "Good"
    elif score > 0.3:
        return "Fair"
    else:
        return "Poor"

valid_clusters['Quality_Tier'] = valid_clusters.apply(classify_quality_tier, axis=1)

# Summary by tier
quality_summary = valid_clusters.groupby('Quality_Tier').agg({
    'Cluster': 'count',
    'Size': 'sum',
    'Interpretability_Score': 'mean',
    'Avg_Freq': 'mean'
}).rename(columns={
    'Cluster': 'N_Clusters', 
    'Size': 'Total_Ingredients',
    'Interpretability_Score': 'Avg_Interpretability',
    'Avg_Freq': 'Avg_Ingredient_Freq'
})

# Order by quality
tier_order = ['Excellent', 'Good', 'Fair', 'Poor']
quality_summary = quality_summary.reindex([t for t in tier_order if t in quality_summary.index])

print(quality_summary)

# Show example clusters for each tier
print("\n" + "="*80)
print("EXAMPLE CLUSTERS BY QUALITY TIER")
print("="*80)

for tier in tier_order:
    tier_clusters = valid_clusters[valid_clusters['Quality_Tier'] == tier]
    if len(tier_clusters) > 0:
        print(f"\n{tier.upper()} ({len(tier_clusters)} clusters):")
        examples = tier_clusters.nlargest(3, 'Size')
        for _, row in examples.iterrows():
            print(f"  • Cluster {row['Cluster']:2d}: {row['Category_Label']:<35} "
                  f"(n={row['Size']:3d}, score={row['Interpretability_Score']:.3f})")

# Save
valid_clusters.to_csv('clusters_with_quality_tiers.csv', index=False)

# ============================================================================
# STEP 7: Create Summary Statistics Table
# ============================================================================

print("\n" + "="*80)
print("FINAL SUMMARY STATISTICS")
print("="*80)

summary_stats = pd.DataFrame({
    'Metric': [
        'Total Ingredients',
        'Clustered Ingredients',
        'Noise Ingredients',
        'Non-noise Clusters',
        'Clusters (Excellent)',
        'Clusters (Good)',
        'Clusters (Fair)',
        'Clusters (Poor)',
        'Median Cluster Size',
        'Mean Cluster Size',
        'Largest Cluster (non-noise)',
        'Smallest Cluster (non-noise)'
    ],
    'Value': [
        len(ingredient_clusters),
        len(ingredient_clusters[ingredient_clusters['cluster'] != -1]),
        len(ingredient_clusters[ingredient_clusters['cluster'] == -1]),
        len(valid_clusters),
        len(valid_clusters[valid_clusters['Quality_Tier'] == 'Excellent']),
        len(valid_clusters[valid_clusters['Quality_Tier'] == 'Good']),
        len(valid_clusters[valid_clusters['Quality_Tier'] == 'Fair']),
        len(valid_clusters[valid_clusters['Quality_Tier'] == 'Poor']),
        f"{valid_clusters['Size'].median():.0f}",
        f"{valid_clusters['Size'].mean():.1f}",
        valid_clusters['Size'].max(),
        valid_clusters['Size'].min()
    ]
})

print(summary_stats.to_string(index=False))
summary_stats.to_csv('clustering_summary_statistics.csv', index=False)

print("\n" + "="*80)
print(" ANALYSIS COMPLETE!")
print("="*80)
print("\nGenerated files:")
print("  1. clusters_with_labels_dbscan.csv          - All clusters with semantic labels")
print("  2. cluster_quality_detailed.csv      - Deep analysis of showcase clusters")
print("  3. clusters_with_quality_tiers.csv   - All clusters with quality classification")
print("  4. clustering_summary_statistics.csv - Overall statistics")
print("\nNext steps:")
print("  • Create visualizations with this data")
print("  • Write report sections using these insights")
print("  • Compare with HDBSCAN results")


CLUSTER QUALITY TIER DISTRIBUTION
              N_Clusters  Total_Ingredients  Avg_Interpretability  \
Quality_Tier                                                        
Good                   1                184              0.511957   
Fair                   6                252              0.352166   
Poor                  34                631              0.143701   

              Avg_Ingredient_Freq  
Quality_Tier                       
Good                   109.000000  
Fair                   169.166667  
Poor                   520.029412  

EXAMPLE CLUSTERS BY QUALITY TIER

GOOD (1 clusters):
  • Cluster 39: Cocktails & Fruit Beverages         (n=184, score=0.512)

FAIR (6 clusters):
  • Cluster 15: Italian Cuisine (Pasta & Pizza)     (n= 90, score=0.465)
  • Cluster 34: Mexican Cuisine (Tex-Mex)           (n= 62, score=0.356)
  • Cluster 30: Asian Cuisine (Chinese & Thai)      (n= 53, score=0.307)

POOR (34 clusters):
  • Cluster 12: Seafood Salads & Sandwiches         