In [4]:
import pandas as pd
import numpy as np
import ast
from collections import Counter


In [1]:
CLUSTER_RESULTS_FILE = 'ingredient_clusters.csv'
RECIPES_FILE = '../data/clean_data/recipes_cleaned.csv'
OUTPUT_CSV = 'cluster_characterization_automated.csv'
OUTPUT_LATEX = 'cluster_table.tex'

In [2]:
import pandas as pd
clusters_raw = pd.read_csv(CLUSTER_RESULTS_FILE)
print(f"  Cluster file columns: {clusters_raw.columns.tolist()}")
print(f"  First few rows:\n{clusters_raw.head()}")


  Cluster file columns: ['ingredient', 'cluster']
  First few rows:
             ingredient  cluster
0                   NaN       -1
1      a original sauce       -1
2         abalone steak       -1
3              absinthe        0
4  absolut kurant vodka        0


In [6]:
# Group ingredients by cluster
if 'cluster' in clusters_raw.columns and 'ingredient' in clusters_raw.columns:
    clusters_df = clusters_raw.groupby('cluster')['ingredient'].apply(list).reset_index()
    clusters_df.columns = ['cluster_id', 'ingredient_list']
else:
    raise ValueError("Expected columns 'cluster' and 'ingredient' in cluster results file")

print(f" Loaded {len(clusters_df)} clusters")

 Loaded 31 clusters


In [7]:
import ast
# Load original recipes
print("\n Loading recipe data...")
df_recipes = pd.read_csv(RECIPES_FILE)
# Parse ingredients column (adjust column name if different)
ingredient_col = 'ingredients'  # or 'ingredients_y' based on your file
if ingredient_col not in df_recipes.columns:
    # Try alternative column names
    for col in ['ingredients_y', 'ingredients_x', 'ingredient']:
        if col in df_recipes.columns:
            ingredient_col = col
            break

if isinstance(df_recipes[ingredient_col].iloc[0], str):
    df_recipes[ingredient_col] = df_recipes[ingredient_col].apply(ast.literal_eval)

print(f"Loaded {len(df_recipes)} recipes")


 Loading recipe data...
Loaded 222705 recipes


In [8]:
from collections import Counter

print("\n Computing ingredient frequencies...")
all_ingredients = []
for ing_list in df_recipes[ingredient_col]:
    all_ingredients.extend(ing_list)

ingredient_freq = Counter(all_ingredients)
print(f"  Processed {len(all_ingredients)} total ingredient occurrences")
print(f"  Found {len(ingredient_freq)} unique ingredients")


 Computing ingredient frequencies...
  Processed 2080428 total ingredient occurrences
  Found 14621 unique ingredients


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
print("\nComputing TF-IDF for distinctive terms...")

# Treat each cluster as a document, filter out non-string items
cluster_docs = [' '.join([ing for ing in ing_list if isinstance(ing, str)]) for ing_list in clusters_df['ingredient_list']]

# Compute TF-IDF
vectorizer = TfidfVectorizer(
    max_features=None,
    min_df=1,
    ngram_range=(1, 2),  # Allow bigrams
    token_pattern=r'[a-zA-Z]+'
)

tfidf_matrix = vectorizer.fit_transform(cluster_docs)
feature_names = vectorizer.get_feature_names_out()

print(f" TF-IDF computed ({len(feature_names)} terms)")


Computing TF-IDF for distinctive terms...
 TF-IDF computed (24276 terms)


In [10]:
import numpy as np

print("\n Characterizing clusters...")

results = []

for i, row in clusters_df.iterrows():
    cluster_id = row['cluster_id']
    ingredients = row['ingredient_list']
    
    # basic stats
    size = len(ingredients)

    # frequency analysis
    freqs = [ingredient_freq.get(ing, 0) for ing in ingredients]
    
    if len(freqs) == 0:
        continue  # Skip empty clusters
        
    avg_freq = np.mean(freqs)
    min_freq = np.min(freqs)
    max_freq = np.max(freqs)
    # Top 5 ingredients by frequency
    ing_with_freq = [(ing, ingredient_freq.get(ing, 0)) for ing in ingredients]
    ing_with_freq.sort(key=lambda x: x[1], reverse=True)
    top_5_by_freq = ', '.join([ing for ing, _ in ing_with_freq[:5]])
    
    # === TF-IDF DISTINCTIVE TERMS ===
    tfidf_scores = tfidf_matrix[i].toarray().flatten()
    top_tfidf_indices = tfidf_scores.argsort()[-5:][::-1]
    distinctive_terms = [
        feature_names[idx] 
        for idx in top_tfidf_indices 
        if tfidf_scores[idx] > 0
    ]
    distinctive_str = ', '.join(distinctive_terms[:5])
    
    results.append({
        'Cluster': cluster_id,
        'Size': size,
        'Distinctive_Terms': distinctive_str,
        'Top_5_Frequent': top_5_by_freq,
        'Avg_Freq': int(avg_freq),
        'Min_Freq': min_freq,
        'Max_Freq': max_freq
    })

results_df = pd.DataFrame(results)
# Sort by cluster size (descending)
results_df = results_df.sort_values('Size', ascending=False).reset_index(drop=True)

print(f"  Characterized {len(results_df)} clusters")


 Characterizing clusters...
  Characterized 31 clusters


In [11]:
%pip install Jinja2
print("\n" + "="*80)
print("CLUSTER CHARACTERIZATION RESULTS")
print("="*80)
print("\nTop 20 clusters by size:\n")

# Display with better formatting
display_df = results_df.head(20).copy()
display_df['Avg_Freq'] = display_df['Avg_Freq'].apply(lambda x: f"{x:,}")
print(display_df.to_string(index=False))

# Save to CSV
results_df.to_csv(OUTPUT_CSV, index=False)
print(f"\n Results saved to: {OUTPUT_CSV}")

# Save to LaTeX (for report)
# Create a cleaner version for LaTeX
latex_df = results_df.head(20).copy()
latex_df.columns = ['ID', 'Size', 'Distinctive Terms', 'Most Frequent', 'Avg Freq', 'Min', 'Max']
latex_df.to_latex(OUTPUT_LATEX, index=False, escape=False, column_format='lllllll')
print(f" LaTeX table saved to: {OUTPUT_LATEX}")

Note: you may need to restart the kernel to use updated packages.

CLUSTER CHARACTERIZATION RESULTS

Top 20 clusters by size:

 Cluster  Size                                             Distinctive_Terms                                                                                                                                                                               Top_5_Frequent Avg_Freq  Min_Freq  Max_Freq
      -1  8179                             cheese, chicken, sauce, beef, mix                                                                                                                                                        salt, onion, olive oil, flour, pepper      132         0     86964
       0   820                    liqueur, chocolate, pudding, yogurt, vodka                                                                                                                                lemon juice, cream cheese, orange juice, cilantro, lime juice      156         

In [12]:
print(f"\nTotal clusters analyzed: {len(results_df)}")

print(f"\nCluster size distribution:")
print(f"  Min: {results_df['Size'].min()}")
print(f"  25th percentile: {results_df['Size'].quantile(0.25):.0f}")
print(f"  Median: {results_df['Size'].median():.0f}")
print(f"  75th percentile: {results_df['Size'].quantile(0.75):.0f}")
print(f"  Mean: {results_df['Size'].mean():.1f}")
print(f"  Max: {results_df['Size'].max()}")

print(f"\nAverage frequency distribution:")
print(f"  Min: {results_df['Avg_Freq'].min():,}")
print(f"  Median: {results_df['Avg_Freq'].median():,.0f}")
print(f"  Mean: {results_df['Avg_Freq'].mean():,.1f}")
print(f"  Max: {results_df['Avg_Freq'].max():,}")



Total clusters analyzed: 31

Cluster size distribution:
  Min: 9
  25th percentile: 12
  Median: 19
  75th percentile: 36
  Mean: 318.2
  Max: 8179

Average frequency distribution:
  Min: 0
  Median: 260
  Mean: 439.6
  Max: 3,281


In [13]:

# Largest clusters
print("\n Top 5 LARGEST clusters:")
largest = results_df.nlargest(5, 'Size')[['Cluster', 'Size', 'Distinctive_Terms', 'Top_5_Frequent']]
print(largest.to_string(index=False))

# Most frequent clusters (common ingredients)
print("\n Top 5 clusters with MOST FREQUENT ingredients:")
most_frequent = results_df.nlargest(5, 'Avg_Freq')[['Cluster', 'Size', 'Avg_Freq', 'Top_5_Frequent']]
print(most_frequent.to_string(index=False))

# Smallest clusters (niche/rare)
print("\n Top 5 SMALLEST clusters (niche ingredients):")
smallest = results_df.nsmallest(5, 'Size')[['Cluster', 'Size', 'Avg_Freq', 'Top_5_Frequent']]
print(smallest.to_string(index=False))

# Low-frequency clusters (rare ingredients)
print("\n Top 5 clusters with RAREST ingredients:")
rarest = results_df.nsmallest(5, 'Avg_Freq')[['Cluster', 'Size', 'Avg_Freq', 'Top_5_Frequent']]
print(rarest.to_string(index=False))


 Top 5 LARGEST clusters:
 Cluster  Size                          Distinctive_Terms                                                                 Top_5_Frequent
      -1  8179          cheese, chicken, sauce, beef, mix                                          salt, onion, olive oil, flour, pepper
       0   820 liqueur, chocolate, pudding, yogurt, vodka                  lemon juice, cream cheese, orange juice, cilantro, lime juice
       2   244         paste, chili, noodles, dal, powder                      soy sauce, sesame oil, sesame seeds, turmeric, peanut oil
       8    62         syrup, tea, drink mix, cherry, ice                                 sugar, water, frozen lemonade, pectin, cachaca
       5    60  pizza, sauce, pasta, italian, pizza crust mozzarella cheese, ricotta cheese, cottage cheese, provolone cheese, pepperoni

 Top 5 clusters with MOST FREQUENT ingredients:
 Cluster  Size  Avg_Freq                                                                               

In [14]:
# Compute intra-cluster frequency variance (cohesion indicator)
results_df['Freq_Std'] = 0
for i, row in clusters_df.iterrows():
    ingredients = row['ingredient_list']
    freqs = [ingredient_freq.get(ing, 0) for ing in ingredients]
    if len(freqs) > 1:
        freq_std = np.std(freqs)
        results_df.loc[results_df['Cluster'] == row['cluster_id'], 'Freq_Std'] = freq_std

# Clusters with low frequency variance = more cohesive
print("\n Most COHESIVE clusters (low frequency variance):")
cohesive = results_df.nsmallest(5, 'Freq_Std')[['Cluster', 'Size', 'Avg_Freq', 'Freq_Std', 'Distinctive_Terms']]
print(cohesive.to_string(index=False))



 Most COHESIVE clusters (low frequency variance):
 Cluster  Size  Avg_Freq   Freq_Std                                             Distinctive_Terms
      20    10         0   0.300000 turkey, muskellunge, patty muskellunge, original chik, torito
      29     9         1   1.099944                cookies, butter, cereal, tic tac, fruit rollup
       4    13         4  11.521526                    mozzarella, parmesan, potatoes, and, cream
      17    17        25  32.385438                    glutenfree, chex, corn, cereal, oat cereal
      27    12       110 127.410181                        ham, ham hock, split peas, split, hock


  results_df.loc[results_df['Cluster'] == row['cluster_id'], 'Freq_Std'] = freq_std
