# 04_lexicon_analytics.ipynb

Explore dish descriptions to surface information for building a lexicon (taste, aroma, texture)
without committing to rules yet. This notebook produces frequency tables, collocations,
co-occurrence matrices, TF-IDF terms, and KWIC samples.

In [None]:
from pathlib import Path
import pandas as pd, numpy as np
import re
from collections import Counter, defaultdict
from itertools import islice

# Resolve the project root: when the working directory is a `notebooks`
# folder whose parent holds `data_cleaned`, step up one level.
ROOT = Path.cwd()
if ROOT.name == 'notebooks' and (ROOT.parent / 'data_cleaned').exists():
    ROOT = ROOT.parent
DATA_CLEANED = ROOT / 'data_cleaned'

# Candidate input files in priority order; the first one that exists is used.
PREFS = [
    DATA_CLEANED / 'user_orders_desc_clean.csv',
    DATA_CLEANED / 'user_orders_clean_with_description.csv',
    Path('/mnt/data/final.csv'),
]
INPUT = next((candidate for candidate in PREFS if candidate.exists()), None)
# Raise explicitly instead of asserting: asserts are stripped under `python -O`,
# which would let a missing input surface later as a confusing read error.
if INPUT is None:
    raise FileNotFoundError('Could not find an input CSV.')
print('Using input:', INPUT)

df = pd.read_csv(INPUT)
# Match the description column ignoring case and surrounding whitespace;
# str() keeps the comparison safe if a column label is not a string.
col_candidates = [c for c in df.columns if str(c).lower().strip() == 'dish_description']
if not col_candidates:
    raise ValueError('Input must contain a dish_description column.')
DESC_COL = col_candidates[0]
print('Description column =', DESC_COL, '| Rows =', len(df))

# All exported tables are written next to the cleaned data.
OUT_DIR = DATA_CLEANED
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Words dropped by tokenize(): function words, hedges/modals, and generic
# menu intensifiers/descriptors.
STOPWORDS = {
    # articles, conjunctions, prepositions, copulas
    'a', 'an', 'and', 'the', 'of', 'to', 'in', 'on', 'with', 'for', 'by',
    'at', 'from', 'as', 'is', 'are', 'be', 'or',
    # determiners and pronouns
    'that', 'this', 'those', 'these', 'it', 'its',
    # directional particles
    'into', 'over', 'under', 'out', 'up', 'down',
    # subordinators and relatives
    'if', 'then', 'than', 'while', 'when', 'where', 'which', 'who', 'whom',
    'whose',
    # frequency and hedging adverbs
    'also', 'often', 'usually', 'typically', 'commonly', 'likely',
    # modals
    'may', 'might', 'can', 'could', 'should', 'would',
    # degree words
    'such', 'just', 'very', 'more', 'less', 'most', 'least',
    # generic menu descriptors
    'light', 'mild', 'heavy', 'extra', 'hint', 'touch', 'rich', 'loaded',
    'fresh', 'classic', 'style', 'served',
}

def normalize(s: str) -> str:
    """Return *s* lowercased and reduced to ASCII letters and single spaces.

    The value is coerced with ``str()`` first. Accented characters are folded
    to their base letter (``é`` -> ``e``, ``ñ`` -> ``n``) before filtering, so
    words such as "sautéed" stay in one piece instead of being split where
    the accent was. Every remaining character that is not ``a-z`` or
    whitespace becomes a space, whitespace runs collapse to one space, and
    the result is stripped.
    """
    import unicodedata  # local import: keeps this block self-contained

    # NFKD splits an accented letter into base letter + combining mark;
    # dropping the combining marks leaves the plain base letter.
    decomposed = unicodedata.normalize('NFKD', str(s))
    folded = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    lowered = folded.lower()
    letters_only = re.sub(r"[^a-z\s]", " ", lowered)
    return re.sub(r"\s+", " ", letters_only).strip()

def tokenize(s: str):
    """Split *s* into normalized words, skipping noise.

    The text goes through ``normalize`` and is split on spaces; words of a
    single character and words listed in ``STOPWORDS`` are left out. Returns
    the remaining words as a list, in their original order.
    """
    kept = []
    for word in normalize(s).split():
        if len(word) <= 1 or word in STOPWORDS:
            continue
        kept.append(word)
    return kept

def ngrams(words, n):
    """Return every contiguous run of *n* items of *words* as a list of tuples.

    Runs are listed left to right. The result is empty when *words* has fewer
    than *n* items, and also when *n* is less than 1 (a non-positive size has
    no meaningful n-grams; without this guard the slicing would produce empty
    or arbitrary tuples).
    """
    if n < 1:
        return []
    last_start = len(words) - n
    return [tuple(words[start:start + n]) for start in range(last_start + 1)]

# Descriptions as plain strings (missing values become ''), plus their
# token lists in the same order.
desc_series = df[DESC_COL].fillna('').astype(str)
docs = desc_series.tolist()
doc_tokens = list(map(tokenize, docs))

In [None]:
# Corpus-wide counts of single tokens, bigrams and trigrams
tok_freq = Counter(tok for toks in doc_tokens for tok in toks)
bigram_freq = Counter(gram for toks in doc_tokens for gram in ngrams(toks, 2))
trigram_freq = Counter(gram for toks in doc_tokens for gram in ngrams(toks, 3))

# Keep only the most frequent entries; n-gram tuples are flattened to
# space-separated strings for the exported tables.
tokens_top = pd.DataFrame(tok_freq.most_common(500), columns=['token','count'])
bigram_rows = [(" ".join(gram), n_seen) for gram, n_seen in bigram_freq.most_common(300)]
bigrams_top = pd.DataFrame(bigram_rows, columns=['bigram','count'])
trigram_rows = [(" ".join(gram), n_seen) for gram, n_seen in trigram_freq.most_common(200)]
trigrams_top = pd.DataFrame(trigram_rows, columns=['trigram','count'])

exports = (
    (tokens_top, 'tokens_top.csv'),
    (bigrams_top, 'bigrams_top.csv'),
    (trigrams_top, 'trigrams_top.csv'),
)
for frame, filename in exports:
    frame.to_csv(OUT_DIR / filename, index=False)
tokens_top.head(10)

In [None]:
# Template-based collocations: for each pattern, count the word it captures
patterns = {
    'with_X': re.compile(r"\bwith\s+([a-z]+)\b"),
    'in_X_sauce': re.compile(r"\bin\s+([a-z]+)\s+sauce\b"),
    'fried_X': re.compile(r"\bfried\s+([a-z]+)\b"),
    'grilled_X': re.compile(r"\bgrilled\s+([a-z]+)\b"),
    'garlic_X': re.compile(r"\bgarlic\s+([a-z]+)\b"),
}

# The patterns only know lowercase letters, so match against lowercased text.
lowered_docs = [text.lower() for text in docs]
colloc = {
    label: Counter(
        hit.group(1)
        for text in lowered_docs
        for hit in rx.finditer(text)
    )
    for label, rx in patterns.items()
}

# One CSV per pattern holding its 200 most frequent captured words.
for label, counts in colloc.items():
    top_words = pd.DataFrame(counts.most_common(200), columns=['x','count'])
    top_words.to_csv(OUT_DIR / f'collocations_{label}.csv', index=False)

pd.DataFrame(colloc['with_X'].most_common(20), columns=['x','count']).head(10)

In [None]:
# Cue × Ingredient co-occurrence matrix (dataset-limited)
# Each cell counts the documents in which the cue word (row) and the
# ingredient word (column) both appear at least once.
cue_words = {
    'taste': ['sweet','spicy','sour','salty','umami','bitter','tangy'],
    'aroma': ['garlic','garlicky','buttery','smoky','grilled','smoked','bbq','aioli'],
    'texture': ['crispy','crunchy','creamy','juicy','tender','soft','fried']
}
ingredients = ['cheese','cheddar','bacon','beef','chicken','mushroom','tomato','lettuce','pickles','onion','ketchup','mustard','ranch','gravy','mayo','aioli','bbq','bun','fries']

cue_list = sorted({w for cat in cue_words.values() for w in cat})
ing_list = sorted(set(ingredients))

cooc = pd.DataFrame(0, index=cue_list, columns=ing_list, dtype=int)

# Build the lookup sets once instead of once per document.
cue_set = set(cue_list)
ing_set = set(ing_list)

# Tally (cue, ingredient) pairs in a Counter and fill the frame afterwards,
# avoiding one DataFrame write per pair per document.
# NOTE: 'aioli' and 'bbq' are in both lists, so a document containing one of
# them also counts toward that word's own diagonal cell.
pair_counts = Counter()
for toks in doc_tokens:
    ts = set(toks)  # presence per document, not number of mentions
    cues_present = ts & cue_set
    ings_present = ts & ing_set
    pair_counts.update((c, ig) for c in cues_present for ig in ings_present)

for (c, ig), n_docs in pair_counts.items():
    cooc.at[c, ig] = n_docs

cooc.to_csv(OUT_DIR / 'cooccurrence_matrix.csv')
cooc.head(10)

In [None]:
# TF-IDF key terms per document (if scikit-learn available), exported for auditing
# Best-effort step: any failure (missing scikit-learn, nothing left after
# min_df/max_df pruning, write error) is reported and the notebook moves on.
try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    vec = TfidfVectorizer(
        tokenizer=tokenize,
        # With a custom tokenizer the default token_pattern is unused;
        # None avoids scikit-learn's warning about the ignored parameter.
        token_pattern=None,
        preprocessor=lambda s: s,  # identity: tokenize() does its own normalization
        lowercase=False,
        min_df=2,
        max_df=0.9,
        ngram_range=(1, 2),
    )
    X = vec.fit_transform(df[DESC_COL].fillna('').astype(str).tolist())
    terms = np.array(vec.get_feature_names_out())
    # For each doc: take top 8 terms
    tops = []
    for i in range(X.shape[0]):
        row = X.getrow(i)
        if row.nnz == 0:
            # No term of this document survived the vocabulary filters.
            tops.append("")
            continue
        idx = row.indices
        vals = row.data
        # Positions of the (up to) 8 largest weights among the stored entries.
        order = np.argsort(-vals)[:8]
        tops.append(", ".join(terms[idx[order]]))
    tfidf_df = df.copy()
    tfidf_df['tfidf_top_terms'] = tops
    # Only the terms column is exported; rows follow the order of df.
    tfidf_df[['tfidf_top_terms']].to_csv(OUT_DIR / 'tfidf_terms_by_doc.csv', index=False)
    print('TF-IDF terms written.')
except Exception as e:
    print('Skipped TF-IDF (scikit-learn not available or failed):', e)

In [None]:
# KWIC (Key Word In Context) sampling utility
def kwic(samples, keyword, window=40, max_hits=30):
    """Collect keyword-in-context snippets for *keyword* from *samples*.

    Args:
        samples: Iterable of strings to search.
        keyword: Literal text to look for (regex-escaped), matched
            case-insensitively between word boundaries.
        window: Maximum number of characters of context kept on each side.
        max_hits: Maximum number of snippets to return; values of 0 or less
            return an empty list.

    Returns:
        A list of snippets with line breaks replaced by spaces, in the order
        found, holding at most *max_hits* entries.
    """
    if max_hits <= 0:
        return []
    out = []
    # re.S lets the context span line breaks; without it '.' stops at '\n',
    # so multi-line text lost its context and the newline replacement below
    # could never fire.
    pat = re.compile(
        r"(.{0,%d}\b%s\b.{0,%d})" % (window, re.escape(keyword), window),
        re.I | re.S,
    )
    for s in samples:
        for m in pat.finditer(s):
            out.append(m.group(1).replace('\n',' '))
            if len(out) >= max_hits:
                return out
    return out

# Generate a small KWIC file for common cues
cues_for_kwic = ['spicy','crispy','creamy','garlic','bbq','ranch','ketchup','mustard','gravy','aioli']
# Convert the description column once; it is the same for every keyword.
kwic_docs = df[DESC_COL].fillna('').astype(str).tolist()
lines = []
for k in cues_for_kwic:
    hits = kwic(kwic_docs, k, window=50, max_hits=20)
    lines.append(f'===== {k} =====')
    lines.extend(hits if hits else ['(no matches)'])
    lines.append('')  # blank line between keyword sections
# Explicit UTF-8: the platform default encoding may be unable to represent
# non-ASCII characters that appear in the descriptions.
(OUT_DIR / 'kwic_samples.txt').write_text('\n'.join(lines), encoding='utf-8')
print('KWIC written:', OUT_DIR / 'kwic_samples.txt')