In [12]:
from pathlib import Path

import pandas as pd

from data_reciever import datasets

In [23]:
def check_dataset(name, display_name=None):
    """Print a short preview of one entry of the module-level ``datasets``.

    Args:
        name: Key to look up in ``datasets``.
        display_name: Heading text for the banner. When ``None``, it is
            derived from ``name`` by replacing underscores with spaces and
            title-casing. The heading is upper-cased before printing.

    Prints the banner, then either ``head()`` and ``shape`` of the stored
    object or ``"Dataset not loaded"`` when the key is absent or maps to
    ``None``, followed by a blank line. Returns nothing.
    """
    heading = display_name if display_name is not None else name.replace('_', ' ').title()
    print(f"xxx {heading.upper()} xxx")

    # A missing key and a key stored as None are reported the same way.
    frame = datasets[name] if name in datasets else None
    if frame is None:
        print("Dataset not loaded")
    else:
        print(frame.head())
        print(f"Shape: {frame.shape}")
    print()

# Preview every dataset, addressing each one by its actual key in `datasets`.
for dataset_key, heading in (
    ('menu_ingredient', 'MENU INGREDIENT'),
    ('date_seasons', 'SEASONS'),        # key is 'date_seasons', not 'seasons'
    ('sales_history', 'SALES HISTORY'),
    ('date_event', 'EVENT'),            # key is 'date_event', not 'event'
    ('date_weekend', 'WEEKEND'),        # key is 'date_weekend', not 'weekend'
):
    check_dataset(dataset_key, heading)

xxx MENU INGREDIENT xxx
   item_id                  name     category  price  \
0        1  Paneer Butter Masala  Main Course    280   
1        2       Chicken Biryani  Main Course    320   
2        3        Veg Fried Rice  Main Course    200   
3        4             Dal Tadka  Main Course    180   
4        5           Butter Naan        Bread     50   

                                         ingredients  
0  [{'name': 'Paneer', 'quantity': '150g'}, {'nam...  
1  [{'name': 'Basmati Rice', 'quantity': '150g'},...  
2  [{'name': 'Rice', 'quantity': '150g'}, {'name'...  
3  [{'name': 'Toor Dal', 'quantity': '120g'}, {'n...  
4  [{'name': 'Flour', 'quantity': '80g'}, {'name'...  
Shape: (30, 5)

xxx SEASONS xxx
         date  season
0  2025-01-01  Winter
1  2025-01-02  Winter
2  2025-01-03  Winter
3  2025-01-04  Winter
4  2025-01-05  Winter
Shape: (365, 2)

xxx SALES HISTORY xxx
  Customer_ID        Date                                      Items_Ordered  \
0    CUST1018  2024-10-01 

In [35]:
print("xxx Date vs quantity xxx")

# Bind the frames used by this cell to short local names.
sales_df, seasons_df = datasets['sales_history'], datasets['date_seasons']
event_df, weekend_df = datasets['date_event'], datasets['date_weekend']

# Function to parse items and quantities
def parse_items(items_string):
    """Parse an order string such as ``"Egg Curry x1; Fish Fry x2"``.

    Entries are separated by ``;`` (surrounding whitespace is ignored) and
    each entry is expected to end in ``" x<quantity>"``. The dish name is
    everything before the last ``" x"``.

    Args:
        items_string: The raw order text. Any non-string value (``None``,
            a float NaN from a missing cell, ...) is treated as an empty
            order.

    Returns:
        A list of ``{'dish': str, 'quantity': int}`` dicts, in input order.
        Entries without a ``" x"`` marker, or whose quantity is not an
        integer, are skipped.
    """
    # Missing cells arrive as NaN/None; there is nothing to parse.
    if not isinstance(items_string, str):
        return []

    items = []
    for raw_entry in items_string.split(';'):
        entry = raw_entry.strip()
        if ' x' not in entry:
            continue
        # rsplit keeps any earlier " x" inside the dish name itself.
        name, quantity = entry.rsplit(' x', 1)
        try:
            parsed_quantity = int(quantity)
        except ValueError:
            # e.g. "Mix xtra": the text after " x" is not a quantity.
            continue
        items.append({'dish': name.strip(), 'quantity': parsed_quantity})
    return items

# Explode every order into one record per (order, dish) line item.
# The three needed columns are walked in lockstep with zip() rather than
# DataFrame.iterrows(), which builds a Series object for every row.
item_columns = ['Date', 'Customer_ID', 'Dish', 'Quantity']
structured_data = [
    {
        'Date': order_date,
        'Customer_ID': customer_id,
        'Dish': item['dish'],
        'Quantity': item['quantity'],
    }
    for order_date, customer_id, items_ordered in zip(
        sales_df['Date'], sales_df['Customer_ID'], sales_df['Items_Ordered']
    )
    for item in parse_items(items_ordered)
]

# Passing `columns` keeps the schema stable even when no item was parsed;
# otherwise an empty frame has no 'Date' column and the lookups below fail.
date_itemquantity_df = pd.DataFrame(structured_data, columns=item_columns)

# Create sets of weekend dates and event dates for fast lookup
weekend_dates = set(weekend_df['date'].astype(str))
event_dates = set(event_df['date'].astype(str))

# Compare like with like: the lookup sets hold strings, so the order dates
# are converted the same way before the membership test.
# NOTE(review): the saved output of this cell shows Weekend = 0 for
# 2024-10-06, and the seasons preview starts at 2025-01-01 — the calendar
# tables may not cover the sales date range. TODO confirm their coverage.
order_dates = date_itemquantity_df['Date'].astype(str)

# Add Weekend column (1 if date is in weekend, else 0)
date_itemquantity_df['Weekend'] = order_dates.isin(weekend_dates).astype(int)

# Add Event column (1 if date has event, else 0)
date_itemquantity_df['Event'] = order_dates.isin(event_dates).astype(int)

print(date_itemquantity_df.head(250))
print(f"\nShape: {date_itemquantity_df.shape}")

xxx Date vs quantity xxx
           Date Customer_ID                  Dish  Quantity  Weekend  Event
0    2024-10-01    CUST1018             Egg Curry         1        0      0
1    2024-10-01    CUST1018              Fish Fry         1        0      0
2    2024-10-01    CUST1021          Caesar Salad         2        0      0
3    2024-10-01    CUST1021  Paneer Butter Masala         2        0      0
4    2024-10-01    CUST1021  Paneer Butter Masala         1        0      0
..          ...         ...                   ...       ...      ...    ...
245  2024-10-06    CUST1251                Mojito         1        0      0
246  2024-10-06    CUST1274          Caesar Salad         1        0      0
247  2024-10-06    CUST1274             Egg Curry         2        0      0
248  2024-10-06    CUST1297           Cold Coffee         1        0      0
249  2024-10-06    CUST1297         Pasta Alfredo         2        0      0

[250 rows x 6 columns]

Shape: (16856, 6)


In [38]:
print("xxx Date vs sales xxx")
sales_df = datasets['sales_history']

# Per-day revenue and order count, using named aggregation so the output
# columns get their final names directly.
date_summary_df = (
    sales_df
    .groupby('Date')
    .agg(
        Sale=('Total_Cost_INR', 'sum'),
        Number_of_Orders=('Customer_ID', 'count'),
    )
    .reset_index()
)

# Average order value = daily revenue divided by the number of orders that day
date_summary_df['Average_Order_Value'] = date_summary_df['Sale'].div(
    date_summary_df['Number_of_Orders']
)

# Keep the summary in chronological order
date_summary_df = date_summary_df.sort_values('Date')

print("Date-wise Sales Summary:")
print(date_summary_df.head(20))
print(f"\nShape: {date_summary_df.shape}")

xxx Date vs sales xxx
Date-wise Sales Summary:
          Date   Sale  Number_of_Orders  Average_Order_Value
0   2024-10-01   9118                15           607.866667
1   2024-10-02   7800                15           520.000000
2   2024-10-03   9100                16           568.750000
3   2024-10-04   7297                14           521.214286
4   2024-10-05  13227                20           661.350000
5   2024-10-06  11281                18           626.722222
6   2024-10-07  10325                15           688.333333
7   2024-10-08   8615                16           538.437500
8   2024-10-09   7623                16           476.437500
9   2024-10-10   7877                13           605.923077
10  2024-10-11   8735                13           671.923077
11  2024-10-12  14594                21           694.952381
12  2024-10-13  12868                20           643.400000
13  2024-10-14   8657                15           577.133333
14  2024-10-15   8304                1

In [40]:
# Write both derived frames as CSV (without the index) under ../ds/clean.
# The directory is created first so a fresh checkout does not fail with
# an OSError when the folder is missing.
clean_dir = Path('../ds/clean')
clean_dir.mkdir(parents=True, exist_ok=True)

date_itemquantity_df.to_csv(clean_dir / 'date_itemquantity.csv', index=False)
date_summary_df.to_csv(clean_dir / 'date_summary.csv', index=False)
print("Cleaned datasets saved to 'ds/clean/' folder")

Cleaned datasets saved to 'ds/clean/' folder
