<a href="https://colab.research.google.com/github/Noelle-Pastor/Predictive-Demand-Forecasting-for-Foodbanks/blob/main/Generating_synthetic_demand_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Generating Synthetic Food Bank Demand Data**
### Layering in multipliers for season, time of month and time of week.

##### **Fields Generated:**  Visit_Date | Client_ID | Household_Size | Zip_Code | Pounds_Distributed

In [None]:
import pandas as pd
import numpy as np
import random
import datetime
import uuid

## Set Parameters

In [None]:
# Simulation window (both endpoints inclusive, ISO dates)
START_DATE = "2020-01-01"
END_DATE = "2025-09-01"

# Average visits per day at START_DATE, before any multipliers are applied
BASE_AVG_VISITS = 80

# Annual growth in demand, as a fraction (0.05 = +5% per year)
GROWTH_RATE = 0.05

# Seed NumPy's global RNG so that Restart Kernel -> Run All regenerates the
# same visit counts, household sizes, zip codes and pounds every time.
# Client_IDs come from uuid.uuid4(), which this seed does not control.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Probability distributions for household size and zip code
# (each set of weights must sum to 1.0, as np.random.choice requires)
HOUSEHOLD_SIZE_DIST = {1 : 0.15, 2 : 0.15, 3 : 0.20, 4 : 0.25, 5 : 0.18, 6 : 0.04, 7 : 0.02, 8 : 0.01}
DALLAS_ZIP_DIST = {"75217": 0.20, "75227": 0.15, "75211": 0.12, "75243": 0.10, "75228": 0.08, "Other": 0.35}

In [None]:
#Daily calendar covering START_DATE through END_DATE (both endpoints included)
DATE_RANGE = pd.to_datetime(
    pd.date_range(start=START_DATE, end=END_DATE)
)

#Number of simulated days
NUM_DAYS = DATE_RANGE.size

## Generate Daily Visit Counts

In [None]:
#Linear demand growth: one fractional uplift per day, rising evenly from 0 on
#the first day to NUM_DAYS*(GROWTH_RATE/365) on the last day
visits_growth_trend = np.linspace(
    start=0,
    stop=NUM_DAYS*(GROWTH_RATE/365),
    num=NUM_DAYS,
)

In [None]:
#Day of year on which the holiday bump peaks (350 = Dec 16 in a non-leap year)
HOLIDAY_PEAK_DAY = 350

daily_visit_counts = []

for i, date in enumerate(DATE_RANGE):

  #Get base # of visits for the day accounting for increase in demand over time
  base = BASE_AVG_VISITS * (1 + visits_growth_trend[i])

  #Yearly seasonality: a broad summer swell plus a localized holiday bump.
  #The holiday term must NOT be a plain yearly cosine: sin(x - pi/2) == -cos(x),
  #so a "+0.12*cos(x)" term would only cancel most of the summer wave and leave
  #the year-end at the seasonal minimum. Raising a clipped cosine to the 8th
  #power gives a bump roughly +-24 days wide (at half height) around
  #HOLIDAY_PEAK_DAY and ~0 for the rest of the year.
  summer_wave = 0.17 * np.sin(2 * np.pi * date.dayofyear/365.25 - np.pi/2)
  holiday_phase = 2 * np.pi * (date.dayofyear - HOLIDAY_PEAK_DAY)/365.25
  holiday_bump = 0.12 * max(0.0, np.cos(holiday_phase)) ** 8
  seasonal_multiplier = 1 + summer_wave + holiday_bump

  #Monthly 20%-50% spike food banks see at the end of the month (day 21 onward)
  monthly_multiplier = np.random.normal(1.35, .15) if date.day > 20 else 1.0

  #Weekly spike on Fri/Sat, drops to zero on Sunday bc closed
  if date.weekday() in [4, 5]: #Friday, Saturday
    weekly_multiplier = np.random.normal(1.35, .1)
  elif date.weekday() == 6: #Sunday assuming closed
    weekly_multiplier = 0.0
  else:
    weekly_multiplier = 1.0

  #Combine multipliers
  num_visits = base * seasonal_multiplier * monthly_multiplier * weekly_multiplier

  #Random "noise" +- 8%
  num_visits *= np.random.normal(1, .08)

  #Store a plain non-negative int: the count is later passed to range(), and an
  #extreme normal draw must never produce a negative number of visits
  daily_visit_counts.append(max(0, int(round(num_visits))))


## Generate a Visit Log

In [None]:
#Smallest amount of food a single visit can receive. Without this floor,
#12.5 lb * household_size + N(0, 5) noise occasionally rounds to zero or a
#negative number for one-person households.
MIN_POUNDS_PER_VISIT = 1

all_visits = []

#Option lists and weights never change, so build them once outside the loops
household_sizes = list(HOUSEHOLD_SIZE_DIST.keys())
household_probs = list(HOUSEHOLD_SIZE_DIST.values())
zip_codes = list(DALLAS_ZIP_DIST.keys())
zip_probs = list(DALLAS_ZIP_DIST.values())

for date, visits in zip(DATE_RANGE, daily_visit_counts):

  visits = int(visits)
  if visits <= 0: #e.g. Sundays, when the food bank is assumed closed
    continue

  visit_date = date.date()

  #Draw every random field for the whole day in one call each
  #(much faster than one np.random.choice call per record)
  day_sizes = np.random.choice(household_sizes, size=visits, p=household_probs)
  day_zips = np.random.choice(zip_codes, size=visits, p=zip_probs)
  day_noise = np.random.normal(0, 5, size=visits)

  #generate all records for day; .tolist() yields plain Python ints/strs/floats
  for household_size, zip_code, pounds_noise in zip(
      day_sizes.tolist(), day_zips.tolist(), day_noise.tolist()):

    #Generate ClientID
    client_id = str(uuid.uuid4()) #not considering new or returning client. Client ID won't be used in analysis.

    #About 12.5 lb per household member, plus noise, floored at the minimum
    pounds_distributed = max(
        MIN_POUNDS_PER_VISIT,
        int(round(household_size * 12.5 + pounds_noise)))

    all_visits.append({
           "Visit_Date" : visit_date,
           "Client_ID" : client_id,
           "Household_Size" : household_size,
           "Zip_Code" : zip_code,
           "Pounds_Distributed" : pounds_distributed})

## Create DataFrame from the **all_visits** list

In [None]:
#Pin the column set and order explicitly so the frame (and the CSV written from
#it) keeps the same schema even if all_visits is empty
VISIT_COLUMNS = ["Visit_Date", "Client_ID", "Household_Size", "Zip_Code", "Pounds_Distributed"]

df_visits = pd.DataFrame(all_visits, columns=VISIT_COLUMNS)

In [None]:
#Preview the first five generated visit records
df_visits.head(n=5)

Unnamed: 0,Visit_Date,Client_ID,Household_Size,Zip_Code,Pounds_Distributed
0,2020-01-01,7ccd97f5-9b0d-4f98-8577-64ea0e5914f1,5,75227,65
1,2020-01-01,06c8d9f1-a19e-4320-a093-ceff246d8c53,3,Other,32
2,2020-01-01,2ed3b4d1-66ae-4afb-82bf-fa467f85ba31,2,75228,24
3,2020-01-01,9b5e8848-55ce-4a3f-b0ac-85a5946f4a0e,4,75217,52
4,2020-01-01,8b0e3036-4cf8-47a9-9496-70ffe8c84dbe,5,75228,64


## Save as a CSV file

In [None]:
#Write the visit log to the working directory, leaving out the row index
df_visits.to_csv(
    path_or_buf="Synthetic_Food_Bank_Visits.csv",
    index=False,
)