In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
import pandas as pd
import numpy as np
def generate_synthetic_data(n=1000):


    # Set a seed for reproducibility
    np.random.seed(42)

    # Dates ranging over a year
    dates = pd.date_range(start='2022-01-01', periods=n)

    # Product categories
    categories = ['Electronics', 'Furniture', 'Grocery', 'Clothing']
    product_categories = np.random.choice(categories, n)

    # Products and SKUs
    products = [f'Product_{i}' for i in range(1, 51)]  # 50 products
    product_list = np.random.choice(products, n)
    SKUs = [f'SKU_{i}' for i in range(1, 51)]  # 50 SKUs
    sku_list = np.random.choice(SKUs, n)

    # Prices normally distributed and positive
    prices = np.abs(np.random.normal(loc=100, scale=20, size=n))

    # Quantity sold - we assume that it's a Poisson distribution
    quantity = np.random.poisson(lam=10, size=n)

    # Competitor's price - usually it would be around the actual product price
    competitor_prices = prices + np.random.normal(loc=0, scale=10, size=n)

    # Discounts
    discounts = np.random.choice([0, 5, 10, 15, 20], size=n)  # in percentage

    # Create a dataframe
    df = pd.DataFrame({
        'date': dates,
        'product': product_list,
        'SKU': sku_list,
        'category': product_categories,
        'price': prices,
        'quantity': quantity,
        'competitor_price': competitor_prices,
        'discount': discounts
    })

    # Black Friday dates
    df['black_friday_date'] = pd.to_datetime(['2022-11-25' if date.month >= 11 else '2023-11-24' for date in df['date']])
    

    #df.to_csv('synthetic_data.csv', index=False)
    print(f"Synthetic data of size {n} generated and saved to 'synthetic_data.csv'")

    return df

# Call the function with a custom size



In [3]:
df = generate_synthetic_data(n=5000)

Synthetic data of size 5000 generated and saved to 'synthetic_data.csv'


In [5]:
df.head()


Unnamed: 0,date,product,SKU,category,price,quantity,competitor_price,discount,black_friday_date
0,2022-01-01,Product_47,SKU_37,Grocery,132.432952,12,156.235766,20,2023-11-24
1,2022-01-02,Product_47,SKU_36,Clothing,132.68282,7,133.821708,15,2023-11-24
2,2022-01-03,Product_4,SKU_36,Electronics,98.879707,8,98.53318,10,2023-11-24
3,2022-01-04,Product_7,SKU_4,Grocery,105.06017,8,105.503424,20,2023-11-24
4,2022-01-05,Product_45,SKU_36,Grocery,87.767968,10,90.016462,5,2023-11-24


In [4]:
import sys
sys.path.append('../src')

In [49]:
from data.feature_engineering import FeatureEngineeringProcess


In [50]:
#date_fe = FeatureEngineeringProcess.datetime_transform
feature_engineering = FeatureEngineeringProcess()
df = feature_engineering.datetime_transform(df, date_feature='date', 
                                            features=['month', 'day', 'day_name', 'week', 'year',
                                                       'quarter', 'season','holidays'],
                                                       )

updated ___


In [51]:
df.head()

Unnamed: 0,date,product,SKU,category,price,quantity,competitor_price,discount,black_friday_date,date_month,date_day,date_day_name,date_week,date_year,date_quarter,date_season,date_holidays
0,2022-01-01,Product_47,SKU_37,Grocery,132.432952,12,156.235766,20,2023-11-24,1,1,Saturday,52,2022,1,Winter,1
1,2022-01-02,Product_47,SKU_36,Clothing,132.68282,7,133.821708,15,2023-11-24,1,2,Sunday,52,2022,1,Winter,0
2,2022-01-03,Product_4,SKU_36,Electronics,98.879707,8,98.53318,10,2023-11-24,1,3,Monday,1,2022,1,Winter,0
3,2022-01-04,Product_7,SKU_4,Grocery,105.06017,8,105.503424,20,2023-11-24,1,4,Tuesday,1,2022,1,Winter,0
4,2022-01-05,Product_45,SKU_36,Grocery,87.767968,10,90.016462,5,2023-11-24,1,5,Wednesday,1,2022,1,Winter,0


In [52]:
df.date_holidays.value_counts()

0    4822
1     178
Name: date_holidays, dtype: int64

In [26]:
from datetime import date
import holidays

us_holidays = holidays.US()  # this is a dict
# the below is the same, but takes a string:
us_holidays = holidays.country_holidays('US')  # this is a dict



date(2015, 1, 1) in us_holidays  # True
date(2015, 1, 2) in us_holidays  # False
us_holidays.get('2014-01-01')  # "New Year's Day"

"New Year's Day"

In [46]:
holidays.CountryHoliday('PL', years=2022).keys()

dict_keys([datetime.date(2022, 1, 1), datetime.date(2022, 1, 6), datetime.date(2022, 4, 17), datetime.date(2022, 4, 18), datetime.date(2022, 5, 1), datetime.date(2022, 5, 3), datetime.date(2022, 6, 5), datetime.date(2022, 6, 16), datetime.date(2022, 8, 15), datetime.date(2022, 11, 1), datetime.date(2022, 11, 11), datetime.date(2022, 12, 25), datetime.date(2022, 12, 26)])