# Store Layout Optimization — Configured for `Groceries data.csv`

This notebook automatically detects the dataset schema and configures the Market Basket Analysis (MBA), co-purchase network, feature engineering, and uplift-simulation pipeline specifically for your `Groceries data.csv` file. Run the cells sequentially, from top to bottom. Requirements: pandas, numpy, mlxtend, xgboost, scikit-learn, networkx, matplotlib.

In [None]:
# Imports
import json
from pathlib import Path

import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import xgboost as xgb
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.metrics import mean_squared_error, r2_score
pd.options.display.max_columns = 200
print('Imports OK')

In [None]:
# Load dataset: read the CSV from its expected upload location and preview it
file_path = Path('/mnt/data/Groceries data.csv')
if file_path.exists():
    df = pd.read_csv(file_path)
else:
    # Fail early with an actionable message instead of a raw read_csv error
    raise FileNotFoundError('Upload Groceries data.csv to /mnt/data/')
print('Loaded rows, cols:', df.shape)
display(df.head(10))
# Schema overview: one row per column with the dtype pandas inferred for it
display(pd.DataFrame({'column': df.columns, 'dtype': [str(dtype) for dtype in df.dtypes]}))

In [None]:
# Auto-detect likely column roles (heuristic)
# Each standard role lists the header spellings it accepts, in priority order:
# the first candidate that matches a column wins.
ROLE_CANDIDATES = {
    # transaction-like
    'TransactionID': ['transactionid', 'transaction_id', 'invoice', 'invoice_no', 'basketid', 'basket_id'],
    # date
    'Date': ['date', 'transactiondate', 'sale_date', 'tdate'],
    # item id / name / sku
    'ItemID': ['itemid', 'item_id', 'productid', 'product_id', 'sku', 'item', 'product'],
    'ItemName': ['itemname', 'productname', 'description', 'item_name', 'product_name'],
    # qty sold
    'QtySold': ['qty', 'qty_sold', 'quantity', 'quantity_sold', 'units', 'units_sold', 'sales'],
    # price / unit price
    'Price': ['price', 'unitprice', 'unit_price', 'mrp'],
    # store / outlet
    'StoreID': ['storeid', 'store_id', 'outlet', 'outlet_id', 'store'],
}


def detect_column_roles(columns, role_candidates=ROLE_CANDIDATES):
    """Map standard role names to the actual column labels that match them.

    Parameters
    ----------
    columns : iterable
        Column labels of the dataset (e.g. ``df.columns``).
    role_candidates : dict[str, list[str]]
        For each role, the lower-case header spellings to look for, in
        priority order.

    Returns
    -------
    dict
        ``{role: original_column_label}`` for every role with a match.
        Roles without a matching column are omitted.
    """
    labels = list(columns)
    # str() keeps non-string labels from crashing .lower(); strip() lets
    # headers with stray leading/trailing whitespace still match.
    normalized = [str(label).strip().lower() for label in labels]
    detected = {}
    for role, candidates in role_candidates.items():
        for candidate in candidates:
            if candidate in normalized:
                detected[role] = labels[normalized.index(candidate)]
                break
    return detected


cols = [str(c).strip().lower() for c in df.columns]
mappings = detect_column_roles(df.columns)
print('Detected mappings:')
print(json.dumps(mappings, indent=2))

In [None]:
# Quick schema-based decisions
# Basket analysis always groups by ItemID, so an item column is mandatory.
# On top of that we need a way to form baskets: either an explicit
# TransactionID, or Date + QtySold from which baskets can be derived.
is_transactional = 'ItemID' in mappings and (
    'TransactionID' in mappings
    or ('Date' in mappings and 'QtySold' in mappings)
)
print('Transactional-like:', is_transactional)


In [None]:
# Normalize column names into standard names used in this notebook
# `mappings` is role -> source column; rename() needs the inverse direction.
rename_map = {source_col: role for role, source_col in mappings.items()}
df = df.rename(columns=rename_map)
display(df.head(10))

In [None]:
# Basic cleaning: parse dates, ensure numeric types
# errors='coerce' turns unparseable values into NaT/NaN instead of raising.
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
for numeric_col in (name for name in ('QtySold', 'Price') if name in df.columns):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')
display(df.dtypes)

In [None]:
# Prepare basket matrix for MBA
# Result: `basket_bin`, a boolean TransactionID x ItemID matrix (True = the
# item appears in that basket), or None when the data cannot form baskets.
basket_bin = None
if is_transactional and 'ItemID' in df.columns:
    if 'TransactionID' not in df.columns:
        # attempt to create BasketID by grouping Date+StoreID if available
        if 'StoreID' in df.columns and 'Date' in df.columns:
            df['TransactionID'] = df['StoreID'].astype(str) + '|' + pd.to_datetime(df['Date']).dt.strftime('%Y-%m-%d')
        else:
            df['TransactionID'] = pd.factorize(df.index)[0]  # fallback: unique row baskets
    basket_groups = df.groupby(['TransactionID', 'ItemID'])
    if 'QtySold' in df.columns:
        basket = basket_groups['QtySold'].sum().unstack(fill_value=0)
    else:
        # No quantity column: the mere presence of a row means the item was bought.
        basket = basket_groups.size().unstack(fill_value=0)
    # Vectorised comparison instead of the deprecated element-wise applymap;
    # a boolean matrix is also the input type mlxtend's apriori prefers.
    basket_bin = basket > 0
    print('Basket matrix created:', basket_bin.shape)
else:
    print('Dataset is not transactional or lacks required columns - MBA cannot be run automatically.')

In [None]:
# Run Market Basket Analysis (apriori) if possible
MIN_SUPPORT = 0.01  # an itemset must occur in at least 1% of baskets
MIN_LIFT = 1.0      # keep rules at least as strong as independence

# `rules` is always defined after this cell; it stays empty when MBA is
# skipped or finds nothing, so later cells can simply test `rules.empty`.
rules = pd.DataFrame()
if basket_bin is not None and basket_bin.shape[0] > 0:
    frequent_itemsets = apriori(basket_bin, min_support=MIN_SUPPORT, use_colnames=True)
    if frequent_itemsets.empty:
        # association_rules raises ValueError on an empty itemset table.
        print(f'No itemsets reach min_support={MIN_SUPPORT}; lower the threshold to find rules.')
    else:
        rules = association_rules(frequent_itemsets, metric='lift', min_threshold=MIN_LIFT)
        rules = rules.sort_values(['lift', 'support'], ascending=[False, False]).reset_index(drop=True)
    print('Found rules:', len(rules))
    if not rules.empty:
        display(rules.head(30))
else:
    print('Skipping MBA - insufficient basket data.')

In [None]:
# Build co-purchase network (top pairs)
if 'rules' not in globals() or rules.empty:
    print('No rules to build network.')
else:
    # Keep only one-item -> one-item rules so each rule maps to a single edge
    single_antecedent = rules['antecedents'].apply(len) == 1
    single_consequent = rules['consequents'].apply(len) == 1
    pairs = rules[single_antecedent & single_consequent].copy()
    pairs['ant'] = pairs['antecedents'].apply(lambda items: next(iter(items)))
    pairs['cons'] = pairs['consequents'].apply(lambda items: next(iter(items)))
    top_pairs = pairs.sort_values('lift', ascending=False).head(100)

    # Undirected graph: nodes are items, edge weight is the rule's lift
    G = nx.Graph()
    for edge in top_pairs[['ant', 'cons', 'lift']].itertuples(index=False):
        G.add_edge(edge.ant, edge.cons, weight=edge.lift)

    plt.figure(figsize=(10,8))
    pos = nx.spring_layout(G, seed=42)  # fixed seed keeps the layout reproducible
    weights = [attrs['weight'] for _, _, attrs in G.edges(data=True)]
    edge_widths = [0.8 * w for w in weights]
    nx.draw(G, pos, with_labels=True, node_size=300, width=edge_widths)
    plt.title('Co-purchase network (top pairs by lift)')
    plt.show()

In [None]:
# Feature engineering for demand modeling (daily aggregation and lags)
# Produces `df_daily`: one row per (store,) item and calendar day with the
# summed QtySold plus lag and rolling-mean features, or None when the
# required columns are missing.
LAG_STEPS = [1, 7, 14]
ROLLING_WINDOWS = [7, 14, 30]

agg_cols = ['ItemID']
if 'StoreID' in df.columns:
    agg_cols = ['StoreID','ItemID']

if {'Date', 'QtySold', 'ItemID'}.issubset(df.columns):
    # Truncate timestamps to midnight so rows from the same calendar day are
    # summed together even when Date carries a time-of-day component.
    day_key = pd.to_datetime(df['Date'], errors='coerce').dt.normalize()
    df_daily = (
        df.assign(DateOnly=day_key)
          .groupby(agg_cols + ['DateOnly'])['QtySold']
          .sum()
          .reset_index()
          .sort_values(agg_cols + ['DateOnly'])
    )
    # NOTE: lags and windows count observed rows per series, not calendar
    # days; they coincide only when every series has a row for every day.
    for lag in LAG_STEPS:
        df_daily[f'lag_{lag}'] = df_daily.groupby(agg_cols)['QtySold'].shift(lag)
    for window in ROLLING_WINDOWS:
        # shift(1) first: the window must only see *previous* observations,
        # otherwise the feature contains the very QtySold it is meant to predict.
        df_daily[f'rolling_mean_{window}'] = df_daily.groupby(agg_cols)['QtySold'].transform(
            lambda qty: qty.shift(1).rolling(window, min_periods=1).mean()
        )
    display(df_daily.head())
else:
    df_daily = None
    print('Insufficient Date/QtySold for time-series features')

In [None]:
# Example model training (XGBoost) to predict QtySold — only if enough data
# Builds `df_model` (features + Placement_cat), holds out the last 7 days as a
# test set, fits an XGBRegressor and reports RMSE, R2 and feature importances.
if df_daily is not None:
    df_model = df_daily.dropna(subset=['rolling_mean_7']).copy()
    # Attach a placement/visibility feature. The raw `df` is left untouched;
    # when it carries no Placement column every row gets the 'baseline' placeholder.
    if 'Placement' in df.columns:
        # Exactly one placement per item (first non-null value seen), so the
        # left join cannot multiply the daily rows.
        placement_by_item = (
            df[['ItemID', 'Placement']]
            .dropna(subset=['Placement'])
            .drop_duplicates(subset='ItemID')
        )
        df_model = df_model.merge(placement_by_item, on='ItemID', how='left')
    else:
        df_model['Placement'] = 'baseline'
    # Category codes; items without a known placement get code -1.
    df_model['Placement_cat'] = df_model['Placement'].astype('category').cat.codes
    feature_cols = [c for c in df_model.columns if c.startswith('lag_') or c.startswith('rolling_mean_')] + ['Placement_cat']
    df_model = df_model.dropna(subset=feature_cols + ['QtySold'])
    # Time-based split: everything in the final 7 days is held out for testing.
    split_date = df_model['DateOnly'].max() - pd.Timedelta(days=7)
    train = df_model[df_model['DateOnly'] <= split_date]
    test = df_model[df_model['DateOnly'] > split_date]
    if len(train) > 50 and len(test) > 0:
        X_train = train[feature_cols]; y_train = train['QtySold']
        X_test = test[feature_cols]; y_test = test['QtySold']
        model = xgb.XGBRegressor(n_estimators=200, max_depth=6, learning_rate=0.05, random_state=42)
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        # sqrt(MSE) works on every scikit-learn version; the `squared=False`
        # keyword was deprecated in 1.4 and removed in 1.6.
        print('RMSE:', float(np.sqrt(mean_squared_error(y_test, preds))))
        print('R2:', r2_score(y_test, preds))
        fi = pd.Series(model.feature_importances_, index=feature_cols).sort_values(ascending=False)
        display(fi.head(20))
    else:
        print('Not enough rows to train model (need >50 training rows).')
else:
    print('No daily data for modeling.')

In [None]:
# Uplift simulation helper (requires trained model and df_model)
def simulate_placement_uplift(item_id, store_id=None, model=None, df_model=None, bump_placement=1):
    """Estimate the demand change from bumping an item's placement code.

    Takes the most recent row (by ``DateOnly``) for the item, optionally
    restricted to one store, and asks ``model`` for a prediction twice: once
    with the row as-is and once with ``Placement_cat`` increased by
    ``bump_placement``.

    Parameters
    ----------
    item_id : value compared against ``df_model['ItemID']``.
    store_id : optional value compared against ``df_model['StoreID']``;
        ignored when None or when ``df_model`` has no ``StoreID`` column.
    model : object exposing ``predict(array)``; it receives a (1, n_features) array.
    df_model : DataFrame holding ``ItemID``, ``DateOnly``, ``Placement_cat``
        and the ``lag_*`` / ``rolling_mean_*`` feature columns.
    bump_placement : amount added to ``Placement_cat`` for the "new" scenario.

    Returns
    -------
    dict with ``item_id``, ``store_id``, ``pred_base``, ``pred_new`` and
    ``uplift_pct`` (NaN when the baseline prediction is 0), or None when no
    rows match the item/store.

    Raises
    ------
    ValueError
        If ``model`` or ``df_model`` is not supplied.
    """
    if model is None or df_model is None:
        raise ValueError('Provide trained model and df_model')

    # Select the history of the requested item (and store, when applicable)
    row_filter = df_model['ItemID'] == item_id
    if store_id is not None and 'StoreID' in df_model.columns:
        row_filter = row_filter & (df_model['StoreID'] == store_id)
    history = df_model[row_filter]
    if history.empty:
        return None

    # Feature order: lag/rolling columns in frame order, then the placement code
    feature_names = [name for name in df_model.columns if name.startswith(('lag_', 'rolling_mean_'))]
    feature_names.append('Placement_cat')

    baseline_row = history.sort_values('DateOnly').tail(1)[feature_names]
    # increase placement_cat code by bump (simulate moving to more visible slot)
    bumped_row = baseline_row.assign(Placement_cat=baseline_row['Placement_cat'] + bump_placement)

    pred_base = model.predict(baseline_row.values.reshape(1, -1))[0]
    pred_new = model.predict(bumped_row.values.reshape(1, -1))[0]
    if pred_base != 0:
        uplift_pct = (pred_new - pred_base) / pred_base * 100
    else:
        uplift_pct = np.nan  # relative uplift is undefined for a zero baseline
    return {
        'item_id': item_id,
        'store_id': store_id,
        'pred_base': float(pred_base),
        'pred_new': float(pred_new),
        'uplift_pct': float(uplift_pct),
    }

print('Helper defined. Use simulate_placement_uplift(item_id, store_id, model, df_model)')

## Next steps

- Run this notebook end-to-end. It will auto-detect schema and run MBA if transactional baskets exist. If the dataset is aggregated, focus on demand modeling and placement simulation using provided features.
- If MBA is skipped, consider transforming POS transactions into basket-level rows (TransactionID + ItemID) as input.

**If you want, I can now run this notebook code on your dataset and produce the results (MBA rules, co-purchase network, model training).** Do you want me to execute the analysis now and return the outputs?