In [2]:
import polars as pl
import numpy as np
from datetime import datetime, timedelta

In [3]:
np.random.seed(42)
def generate_coffee_data():
    n_records = 2000  # 2000 transactions over 6 months
    #Menue:
    menu_items = ['Espresso', 'Cappuccino', 'Latte', 'Americano', 'Mocha', 'Cold Brew']
    prices = [2.50, 4.00, 4.50, 3.00, 5.00, 3.50]
    # Getting dates over 6 months
    start_date = datetime(2023, 6, 1)
    dates = [start_date + timedelta(days=np.random.randint(0, 180))
            for _ in range(n_records)]
    data = {
        'date': dates,
        'drink': np.random.choice(menu_items, n_records),
        'price': [prices[menu_items.index(drink)] for drink in
                  np.random.choice(menu_items, n_records)],
        'quantity': np.random.choice([1, 1, 1, 2, 2, 3], n_records),  # Most people buy 1
        'customer_type': np.random.choice(['Regular', 'New', 'Tourist'],
                                        n_records, p=[0.5, 0.3, 0.2]),
        'payment_method': np.random.choice(['Card', 'Cash', 'Mobile'],
                                         n_records, p=[0.6, 0.2, 0.2]),
        'rating': np.random.choice([2,3, 4, 5], n_records, p=[0.1, 0.4, 0.4, 0.1])
    }
    return data
# Coffee shop DataFrame
coffee_data = generate_coffee_data()
df = pl.DataFrame(coffee_data)

In [4]:
print("First 5 transactions:")
print(df.head())

print("\nTypes of data?")
print(df.schema)

print("\nHow big is the dataset?")
print(f"We have {df.height} transactions and {df.width} columns")

First 5 transactions:
shape: (5, 7)
┌─────────────────────┬────────────┬───────┬──────────┬───────────────┬────────────────┬────────┐
│ date                ┆ drink      ┆ price ┆ quantity ┆ customer_type ┆ payment_method ┆ rating │
│ ---                 ┆ ---        ┆ ---   ┆ ---      ┆ ---           ┆ ---            ┆ ---    │
│ datetime[μs]        ┆ str        ┆ f64   ┆ i32      ┆ str           ┆ str            ┆ i32    │
╞═════════════════════╪════════════╪═══════╪══════════╪═══════════════╪════════════════╪════════╡
│ 2023-09-11 00:00:00 ┆ Cold Brew  ┆ 5.0   ┆ 1        ┆ New           ┆ Cash           ┆ 4      │
│ 2023-11-27 00:00:00 ┆ Cappuccino ┆ 4.5   ┆ 1        ┆ New           ┆ Card           ┆ 4      │
│ 2023-09-01 00:00:00 ┆ Espresso   ┆ 4.5   ┆ 1        ┆ Regular       ┆ Card           ┆ 3      │
│ 2023-06-15 00:00:00 ┆ Cappuccino ┆ 5.0   ┆ 1        ┆ New           ┆ Card           ┆ 4      │
│ 2023-09-15 00:00:00 ┆ Mocha      ┆ 5.0   ┆ 2        ┆ Regular       ┆ Card      

In [5]:
df_enhanced = df.with_columns([
    #Revenue per transaction
    (pl.col('price') * pl.col('quantity')).alias('total_sale'),

    #Useful date components
    pl.col('date').dt.weekday().alias('day_of_week'),
    pl.col('date').dt.month().alias('month'),
    pl.col('date').dt.hour().alias('hour_of_day')
])

print("Sample of enhanced data:")
print(df_enhanced.head())


Sample of enhanced data:
shape: (5, 11)
┌─────────────┬────────────┬───────┬──────────┬───┬────────────┬─────────────┬───────┬─────────────┐
│ date        ┆ drink      ┆ price ┆ quantity ┆ … ┆ total_sale ┆ day_of_week ┆ month ┆ hour_of_day │
│ ---         ┆ ---        ┆ ---   ┆ ---      ┆   ┆ ---        ┆ ---         ┆ ---   ┆ ---         │
│ datetime[μs ┆ str        ┆ f64   ┆ i32      ┆   ┆ f64        ┆ i8          ┆ i8    ┆ i8          │
│ ]           ┆            ┆       ┆          ┆   ┆            ┆             ┆       ┆             │
╞═════════════╪════════════╪═══════╪══════════╪═══╪════════════╪═════════════╪═══════╪═════════════╡
│ 2023-09-11  ┆ Cold Brew  ┆ 5.0   ┆ 1        ┆ … ┆ 5.0        ┆ 1           ┆ 9     ┆ 0           │
│ 00:00:00    ┆            ┆       ┆          ┆   ┆            ┆             ┆       ┆             │
│ 2023-11-27  ┆ Cappuccino ┆ 4.5   ┆ 1        ┆ … ┆ 4.5        ┆ 1           ┆ 11    ┆ 0           │
│ 00:00:00    ┆            ┆       ┆          ┆   ┆

In [6]:
drink_performance = (df_enhanced
    .group_by('drink')
    .agg([
        pl.col('total_sale').sum().alias('total_revenue'),
        pl.col('quantity').sum().alias('total_sold'),
        pl.col('rating').mean().alias('avg_rating')
    ])
    .sort('total_revenue', descending=True)
)

print("Drink performance ranking:")
print(drink_performance)

Drink performance ranking:
shape: (6, 4)
┌────────────┬───────────────┬────────────┬────────────┐
│ drink      ┆ total_revenue ┆ total_sold ┆ avg_rating │
│ ---        ┆ ---           ┆ ---        ┆ ---        │
│ str        ┆ f64           ┆ i32        ┆ f64        │
╞════════════╪═══════════════╪════════════╪════════════╡
│ Americano  ┆ 2242.0        ┆ 595        ┆ 3.476454   │
│ Mocha      ┆ 2204.0        ┆ 591        ┆ 3.492711   │
│ Espresso   ┆ 2119.5        ┆ 570        ┆ 3.514793   │
│ Cold Brew  ┆ 2035.5        ┆ 556        ┆ 3.475758   │
│ Cappuccino ┆ 1962.5        ┆ 521        ┆ 3.541139   │
│ Latte      ┆ 1949.5        ┆ 514        ┆ 3.528846   │
└────────────┴───────────────┴────────────┴────────────┘


In [7]:
daily_patterns = (df_enhanced
    .group_by('day_of_week')
    .agg([
        pl.col('total_sale').sum().alias('daily_revenue'),
        pl.len().alias('number_of_transactions')
    ])
    .sort('day_of_week')
)

print("Daily business patterns:")
print(daily_patterns)


Daily business patterns:
shape: (7, 3)
┌─────────────┬───────────────┬────────────────────────┐
│ day_of_week ┆ daily_revenue ┆ number_of_transactions │
│ ---         ┆ ---           ┆ ---                    │
│ i8          ┆ f64           ┆ u32                    │
╞═════════════╪═══════════════╪════════════════════════╡
│ 1           ┆ 2061.0        ┆ 324                    │
│ 2           ┆ 1761.0        ┆ 276                    │
│ 3           ┆ 1710.0        ┆ 278                    │
│ 4           ┆ 1784.0        ┆ 288                    │
│ 5           ┆ 1651.5        ┆ 265                    │
│ 6           ┆ 1596.0        ┆ 259                    │
│ 7           ┆ 1949.5        ┆ 310                    │
└─────────────┴───────────────┴────────────────────────┘
