In [1]:
# Imports
# -----------------------------
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [2]:
# Notebook-wide matplotlib defaults: 10x5 figures at 100 dpi
plt.rcParams['figure.figsize'] = (10, 5)
plt.rcParams['figure.dpi'] = 100

In [3]:
# Config (edit these)
# -----------------------------
# Data source: point DATA_PATH at a local CSV file, or DATA_URL at a publicly reachable CSV URL.
DATA_PATH = 'food_service_data.csv'  # <-- change this to your filename if needed
DATA_URL = None  # e.g. 'https://raw.githubusercontent.com/your/repo/file.csv'

# Folder that receives every file the notebook writes; created up front if it is missing.
OUTPUT_DIR = 'eda_outputs'
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

In [7]:
# Helper functions
# -----------------------------


def load_data(path=None, url=None):
    """Load the dataset from a CSV URL or a local CSV file.

    Args:
        path: Local CSV path. Used only when ``url`` is falsy and the file exists.
        url: CSV URL. Takes precedence over ``path`` when truthy.

    Returns:
        pandas.DataFrame with the CSV contents. A column named exactly ``date``
        is converted to datetime (unparseable values become NaT); other columns
        are returned as read.

    Raises:
        FileNotFoundError: if ``url`` is falsy and ``path`` is falsy or does not exist.
    """
    if url:
        source = url
    elif path and Path(path).exists():
        source = path
    else:
        raise FileNotFoundError('Please provide a valid DATA_PATH or DATA_URL')

    # Do not pass parse_dates=['date'] to read_csv: it raises ValueError when the
    # file has no column named exactly 'date' (e.g. 'Date' or 'date '), which would
    # stop files with slightly different headers from loading at all.
    df = pd.read_csv(source)
    if 'date' in df.columns:
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
    return df




def overview(df):
    """Print a quick textual profile of ``df``.

    Four sections are printed in order, each as a header line followed by the
    value: shape, first rows (``head()``), column dtypes, and per-column
    missing-value counts. Returns None.
    """
    sections = (
        ('\n=== Dataframe shape ===', lambda frame: frame.shape),
        ('\n=== Head ===', lambda frame: frame.head()),
        ('\n=== Dtypes ===', lambda frame: frame.dtypes),
        ('\n=== Missing values per column ===', lambda frame: frame.isna().sum()),
    )
    for header, compute in sections:
        print(header)
        print(compute(df))


In [9]:
try:
    # Hand over the local path only when the file is present; otherwise only DATA_URL is used
    df = load_data(path=DATA_PATH if Path(DATA_PATH).exists() else None, url=DATA_URL)
except Exception as exc:
    print('Error loading data:', exc)
    # Fall back to an empty dataframe so subsequent cells won't crash while developing
    df = pd.DataFrame()
else:
    # A dataframe was loaded: show the overview unless it holds no data
    if not df.empty:
        overview(df)

Error loading data: Please provide a valid DATA_PATH or DATA_URL


In [None]:
# Initial cleaning & type corrections
# -----------------------------
# Standardize column names: trim surrounding whitespace from string labels.
# Case is left untouched (nothing is lowercased here). Non-string labels are
# kept as they are, because calling .strip() on them would raise AttributeError.
df.columns = [c.strip() if isinstance(c, str) else c for c in df.columns]

In [10]:
# Canonical column names the rest of the notebook looks for.
# This cell only declares the list; it does not check the dataframe or emit warnings.
expected_cols = [
    'ID',
    'date',
    'meals_served',
    'kitchen_staff',
    'temperature_C',
    'humidity_percent',
    'day_of_week',
    'special_event',
    'past_waste_kg',
    'staff_experience',
    'waste_category',
]

In [12]:
# Attempt to map similar names onto the canonical column names.
# Rules are evaluated in order against the lowercased column name; the first match wins.
_COLUMN_RULES = (
    ('ID', lambda low: low in ('id', 'identifier')),
    ('date', lambda low: 'date' in low),
    ('meals_served', lambda low: 'meal' in low),
    ('kitchen_staff', lambda low: 'kitchen' in low and 'staff' in low),
    ('temperature_C', lambda low: 'temperature' in low),
    ('humidity_percent', lambda low: 'humid' in low),
    ('day_of_week', lambda low: 'day' in low and 'week' in low),
    ('special_event', lambda low: 'special' in low and 'event' in low),
    ('past_waste_kg', lambda low: 'past' in low and 'waste' in low),
    # 'exp' also covers 'experience'; it is a loose match (it hits 'expected', 'expiry', ...)
    ('staff_experience', lambda low: 'exp' in low),
    # 'waste' + 'cat' also covers the literal 'waste_category'
    ('waste_category', lambda low: 'waste' in low and 'cat' in low),
)
_CANONICAL_NAMES = {target for target, _ in _COLUMN_RULES}

# Canonical names already present are reserved, so a fuzzy match elsewhere
# (e.g. 'updated' contains 'date') cannot create a second column with the same label.
_claimed = {c for c in df.columns if c in _CANONICAL_NAMES}

col_map = {}
for col in df.columns:
    if col in _CANONICAL_NAMES:
        continue  # already has its canonical name
    low = str(col).lower()
    target = next((name for name, matches in _COLUMN_RULES if matches(low)), None)
    if target is None:
        continue
    if target in _claimed:
        # Renaming would produce duplicate column labels, which makes df[target]
        # return a DataFrame instead of a Series in later cells.
        print(f"Column '{col}' looks like '{target}', but that name is already taken; leaving it unchanged")
        continue
    col_map[col] = target
    _claimed.add(target)

# Rename columns if any mapping exists
if col_map:
    df = df.rename(columns=col_map)


In [14]:
# Convert date (unparseable values become NaT)
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'], errors='coerce')


# Numeric conversions (values that cannot be parsed become NaN)
# NOTE(review): 'day_of_week' is coerced too, so textual day names would turn
# into NaN here. Confirm the column holds numbers in your data.
for num_col in ['meals_served','kitchen_staff','temperature_C','humidity_percent','past_waste_kg','day_of_week']:
    if num_col in df.columns:
        df[num_col] = pd.to_numeric(df[num_col], errors='coerce')


# Binary column corrections
if 'special_event' in df.columns:
    # Text variants are matched after trimming and lowercasing, so 'yes', ' Yes'
    # and 'TRUE' are all recognised instead of silently becoming 0.
    event_tokens = {'true': 1, 'false': 0, 'yes': 1, 'no': 0, 'y': 1, 'n': 0}
    raw_event = df['special_event']
    from_text = raw_event.astype(str).str.strip().str.lower().map(event_tokens)
    # Anything that is not a known token is read as a number (e.g. 1, '0', 1.0)
    from_number = pd.to_numeric(raw_event, errors='coerce')
    # Values that are neither a known token nor numeric fall back to 0
    df['special_event'] = from_text.fillna(from_number).fillna(0).astype(int)


# Strip whitespace in categorical columns
for cat in ['staff_experience','waste_category']:
    if cat in df.columns:
        # astype(str) turns NaN into the text 'nan'; the replace turns it back into NaN
        df[cat] = df[cat].astype(str).str.strip().replace({'nan': np.nan})


In [15]:
# Missing values handling strategy (logged)
# -----------------------------
# Count missing entries per column and print them before anything is dropped or imputed
missing_summary = df.isnull().sum()
print('\nMissing summary:\n', missing_summary)


Missing summary:
 Series([], dtype: float64)


In [18]:
# Drop rows missing date or meals_served. The drop is unconditional:
# no check is made on how many rows are affected.
if {'date', 'meals_served'}.issubset(df.columns):
    n_before = len(df)
    df = df.dropna(subset=['date', 'meals_served'])
    n_dropped = n_before - len(df)
    print(f'Dropped {n_dropped} rows missing date or meals_served')


In [19]:
# For numeric fields, impute the column median where values are missing,
# and record which rows were imputed in a '<column>_missing_flag' column (1 = was missing)
for c in ['temperature_C','humidity_percent','past_waste_kg','kitchen_staff']:
    if c not in df.columns:
        continue
    missing_mask = df[c].isna()
    miss = missing_mask.sum()
    if miss > 0:
        # The flag is written before the fill so it reflects the original gaps
        df[c + '_missing_flag'] = missing_mask.astype(int)
        df[c] = df[c].fillna(df[c].median())
        print(f'Imputed {miss} missing values in {c} with median')


In [20]:
# For categorical columns: label missing entries explicitly as 'Unknown'
for c in ['staff_experience','waste_category']:
    if c not in df.columns:
        continue
    df[c] = df[c].fillna('Unknown')


In [21]:
# Duplicates
# -----------------------------
# Count fully identical rows, report the count, and drop the repeats if there are any
n_dup = df.duplicated().sum()
print(f'Found {n_dup} duplicate rows')
if n_dup:
    df = df.drop_duplicates()
    print('Dropped duplicate rows')

# -----------------------------


Found 0 duplicate rows


In [22]:
# Feature engineering
# -----------------------------
# Derive calendar features (year, month, day, weekday name) from the date column
if 'date' in df.columns:
    date_parts = df['date'].dt
    for part in ('year', 'month', 'day'):
        df[part] = getattr(date_parts, part)
    df['weekday_name'] = date_parts.day_name()


In [23]:
# Create staff_level categories (low/medium/high) based on quantiles
if 'kitchen_staff' in df.columns:
    # Cut points: the 0.33 and 0.66 quantiles of kitchen_staff
    quantiles = df['kitchen_staff'].quantile([0.33, 0.66]).values

    def staff_level(x):
        """Return 'low', 'medium' or 'high' for one staff count.

        Values up to the first cut point are 'low', values up to the second
        are 'medium', and everything else is 'high'.
        """
        if x <= quantiles[0]:
            return 'low'
        if x <= quantiles[1]:
            return 'medium'
        return 'high'

    df['staff_level'] = df['kitchen_staff'].map(staff_level)


In [26]:
# Summary statistics
# -----------------------------
# Describe every numeric column (one row per column) and save the table to OUTPUT_DIR
numeric_cols = df.select_dtypes(include='number').columns.tolist()

if not numeric_cols:
    print("No numeric columns found in the dataframe.")
else:
    print('\nNumeric summary:')
    display_df = df[numeric_cols].describe().transpose()
    print(display_df)
    display_df.to_csv(os.path.join(OUTPUT_DIR, 'numeric_summary.csv'))


No numeric columns found in the dataframe.


In [27]:
# Visualizations
# -----------------------------
# Histograms (with KDE overlay) for key numerical columns, one PNG per column
plot_cols = ['meals_served','temperature_C','humidity_percent','past_waste_kg']
for c in plot_cols:
    if c not in df.columns:
        continue
    fig, ax = plt.subplots()
    sns.histplot(df[c].dropna(), kde=True, ax=ax)
    ax.set_title(f'Distribution of {c}')
    ax.set_xlabel(c)
    fig.tight_layout()
    fig.savefig(os.path.join(OUTPUT_DIR, f'hist_{c}.png'))
    # Close the figure after saving so figures do not pile up across the loop
    plt.close(fig)


In [28]:
# Boxplots to detect outliers, one PNG per column
for c in ['meals_served','past_waste_kg']:
    if c not in df.columns:
        continue
    fig, ax = plt.subplots()
    sns.boxplot(x=df[c].dropna(), ax=ax)
    ax.set_title(f'Boxplot of {c}')
    fig.tight_layout()
    fig.savefig(os.path.join(OUTPUT_DIR, f'box_{c}.png'))
    plt.close(fig)


In [29]:
# Bar plots for categorical variables, bars ordered by descending count
for c in ['staff_experience','waste_category','staff_level']:
    if c not in df.columns:
        continue
    fig, ax = plt.subplots(figsize=(10,4))
    order = df[c].value_counts().index
    sns.countplot(x=c, data=df, order=order, ax=ax)
    ax.set_title(f'Counts by {c}')
    # The new figure is the current one, so this rotates the labels on ax
    plt.xticks(rotation=45)
    fig.tight_layout()
    fig.savefig(os.path.join(OUTPUT_DIR, f'bar_{c}.png'))
    plt.close(fig)


In [31]:
# Correlation heatmap
# (sns and plt come from the notebook's import cell; they are not re-imported here)

# Keep only numeric columns that are still present in df and have at least one
# non-NaN value. numeric_cols was computed in an earlier cell, so the membership
# check guards against a column that has since been removed or renamed.
numeric_cols_nonan = [
    c for c in numeric_cols
    if c in df.columns and df[c].notna().any()
]

if numeric_cols_nonan:
    plt.figure(figsize=(8,6))
    # Pairwise correlation matrix; kept in `corr` because a later cell reads it
    corr = df[numeric_cols_nonan].corr()
    sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
    plt.title('Correlation matrix')
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, 'correlation_heatmap.png'))
    plt.close()
else:
    print("No valid numeric columns available for correlation heatmap.")


No valid numeric columns available for correlation heatmap.


In [32]:
# Scatterplots to inspect relationships (e.g., meals_served vs past_waste_kg)
if {'meals_served', 'past_waste_kg'}.issubset(df.columns):
    fig, ax = plt.subplots()
    sns.scatterplot(x='meals_served', y='past_waste_kg', data=df, ax=ax)
    ax.set_title('Meals served vs Past waste (kg)')
    fig.tight_layout()
    fig.savefig(os.path.join(OUTPUT_DIR, 'scatter_meals_vs_waste.png'))
    plt.close(fig)


In [33]:
# Correlation insights (basic)
# -----------------------------
# `corr` exists only if the heatmap cell found usable numeric columns, and even
# then past_waste_kg may not be part of it (for example when the column is not
# numeric or is entirely NaN). Check both before indexing to avoid a KeyError.
if (
    'past_waste_kg' in df.columns
    and 'corr' in globals()
    and 'past_waste_kg' in corr.columns
):
    cor_with_waste = corr['past_waste_kg'].sort_values(ascending=False)
    print('\nCorrelation of numeric variables with past_waste_kg:')
    print(cor_with_waste)


You can install missing Python libraries by running `pip install` in a code cell (replace `<library_name>` below with the actual package name):

In [None]:
!pip install <library_name>

In [34]:
# Hypothesis testing
# -----------------------------
# `results` collects one entry per test that actually ran; it is written to CSV below.
results = {}

# 1) Special events (Welch's t-test: variances are not assumed equal)
if 'special_event' in df.columns and 'past_waste_kg' in df.columns:
    ev = df.loc[df['special_event'] == 1, 'past_waste_kg'].dropna()
    non = df.loc[df['special_event'] == 0, 'past_waste_kg'].dropna()
    # The test needs data on both sides; with an empty or single-value group it
    # would only yield NaN, so it is skipped with a message instead.
    if len(ev) >= 2 and len(non) >= 2:
        tstat, pval = stats.ttest_ind(ev, non, equal_var=False)
        results['special_event_ttest'] = {
            'tstat': float(tstat),
            'pvalue': float(pval),
            'mean_event': float(ev.mean()),
            'mean_non_event': float(non.mean())
        }
        print('\nSpecial event t-test: t={:.3f}, p={:.3f}'.format(tstat, pval))
    else:
        print('\nSpecial event t-test skipped: need at least 2 observations in each group')

# 2) Kitchen staff levels (one-way ANOVA)
if 'staff_level' in df.columns and 'past_waste_kg' in df.columns:
    # Drop NaNs within each group and discard groups that end up empty
    groups = [
        level_rows['past_waste_kg'].dropna().values
        for _, level_rows in df.groupby('staff_level')
    ]
    groups = [values for values in groups if len(values) > 0]
    # f_oneway raises when given fewer than two samples (e.g. every row falls in one staff level)
    if len(groups) >= 2:
        fstat, pval = stats.f_oneway(*groups)
        results['staff_level_anova'] = {'fstat': float(fstat), 'pvalue': float(pval)}
        print('\nANOVA across staff_level: F={:.3f}, p={:.3f}'.format(fstat, pval))
    else:
        print('\nANOVA across staff_level skipped: need at least 2 non-empty groups')

# Save hypothesis results (one row per test; the file is written even if no test ran)
pd.DataFrame.from_dict(results, orient='index').to_csv(
    os.path.join(OUTPUT_DIR, 'hypothesis_results.csv')
)

# -----------------------------
# Grouped summaries for reporting
# -----------------------------
# Both summaries aggregate past_waste_kg, so that column must be present as well
# as the grouping column; otherwise the groupby selection raises KeyError.
if 'staff_level' in df.columns and 'past_waste_kg' in df.columns:
    grp = df.groupby('staff_level')['past_waste_kg'].agg(['mean', 'median', 'count']).reset_index()
    grp.to_csv(os.path.join(OUTPUT_DIR, 'waste_by_staff_level.csv'), index=False)

if 'special_event' in df.columns and 'past_waste_kg' in df.columns:
    ev_grp = df.groupby('special_event')['past_waste_kg'].agg(['mean', 'median', 'count']).reset_index()
    ev_grp.to_csv(os.path.join(OUTPUT_DIR, 'waste_by_special_event.csv'), index=False)



In [35]:
# Recommendations generator (basic)
# -----------------------------
# Each rule compares group means of past_waste_kg only; the hypothesis-test
# p-values are not consulted here.
recs = []

# Example rules
# The rule groups by 'staff_level', so that is the column that must exist
# (checking 'kitchen_staff' alone would raise KeyError if the derived column is missing).
if 'staff_level' in df.columns and 'past_waste_kg' in df.columns:
    # If higher staff levels produce lower waste on average -> suggest more staff during peaks
    means = df.groupby('staff_level')['past_waste_kg'].mean()
    if 'low' in means.index and 'high' in means.index and means['high'] < means['low']:
        recs.append(
            'Higher staff levels are associated with lower waste — consider increasing staff during busy days'
        )

if 'special_event' in df.columns and 'past_waste_kg' in df.columns:
    ev_mean = df[df['special_event'] == 1]['past_waste_kg'].mean()
    non_mean = df[df['special_event'] == 0]['past_waste_kg'].mean()
    # A NaN mean (empty group) makes this comparison False, so no recommendation is added
    if ev_mean > non_mean:
        recs.append(
            'Special events show higher waste on average — implement event-specific portion control and donation planning'
        )

# Save recommendations
# The messages contain em dashes, so the encoding is set explicitly instead of
# relying on the platform default, which may not be able to encode them.
with open(os.path.join(OUTPUT_DIR, 'recommendations.txt'), 'w', encoding='utf-8') as f:
    if recs:
        f.write('\n'.join(recs))
    else:
        f.write(
            'No automated recommendations generated — please inspect visuals and correlations for context.'
        )

print('\nEDA complete. Outputs saved in folder:', OUTPUT_DIR)
print('Key recommendations written to', os.path.join(OUTPUT_DIR, 'recommendations.txt'))

# -----------------------------
# END
# -----------------------------



EDA complete. Outputs saved in folder: eda_outputs
Key recommendations written to eda_outputs/recommendations.txt
