In [1]:
%load_ext autoreload
%autoreload 2


In [17]:
import pandas as pd
import numpy as np
def generate_synthetic_data(n=1000):


    # Set a seed for reproducibility
    np.random.seed(42)

    # Dates ranging over a year
    dates = pd.date_range(start='2022-01-01', periods=n)

    # Product categories
    categories = ['Electronics', 'Furniture', 'Grocery', 'Clothing']
    product_categories = np.random.choice(categories, n)

    # Products and SKUs
    products = [f'Product_{i}' for i in range(1, 51)]  # 50 products
    product_list = np.random.choice(products, n)
    SKUs = [f'SKU_{i}' for i in range(1, 51)]  # 50 SKUs
    sku_list = np.random.choice(SKUs, n)

    # Prices normally distributed and positive
    prices = np.abs(np.random.normal(loc=100, scale=20, size=n))

    # Quantity sold - we assume that it's a Poisson distribution
    quantity = np.random.poisson(lam=10, size=n)

    # Competitor's price - usually it would be around the actual product price
    competitor_prices = prices + np.random.normal(loc=0, scale=10, size=n)

    # Discounts
    discounts = np.random.choice([0, 5, 10, 15, 20], size=n)  # in percentage

    # Create a dataframe
    df = pd.DataFrame({
        'date': dates,
        'product': product_list,
        'SKU': sku_list,
        'category': product_categories,
        'price': prices,
        'quantity': quantity,
        'competitor_price': competitor_prices,
        'discount': discounts
    })

    # Black Friday dates
    df['black_friday_date'] = pd.to_datetime(['2022-11-25' if date.month >= 11 else '2023-11-24' for date in df['date']])

    # Additional fields for feature engineering
    df['views'] = np.random.randint(0, 100, size=n)
    df['clicks'] = np.random.randint(0, 50, size=n)
    df['favorites'] = np.random.randint(0, 20, size=n)
    df['buys'] = np.random.randint(0, 10, size=n)
    

    #df.to_csv('synthetic_data.csv', index=False)
    print(f"Synthetic data of size {n} generated and saved to 'synthetic_data.csv'")

    return df

# Call the function with a custom size



In [22]:
df = generate_synthetic_data(n=500)

Synthetic data of size 500 generated and saved to 'synthetic_data.csv'


In [4]:
df.head()


Unnamed: 0,date,product,SKU,category,price,quantity,competitor_price,discount,black_friday_date,views,clicks,favorites,buys
0,2022-01-01,Product_21,SKU_34,Grocery,127.347405,15,126.257644,0,2023-11-24,57,29,6,7
1,2022-01-02,Product_32,SKU_6,Clothing,119.165077,5,129.504709,5,2023-11-24,93,27,7,1
2,2022-01-03,Product_23,SKU_2,Electronics,104.545228,9,98.015648,20,2023-11-24,40,16,6,3
3,2022-01-04,Product_33,SKU_13,Grocery,114.959609,5,127.093423,10,2023-11-24,24,2,13,2
4,2022-01-05,Product_3,SKU_43,Grocery,120.102043,11,120.733836,10,2023-11-24,77,15,18,1


In [18]:
import sys
sys.path.append('../src')

In [39]:
from data.feature_engineering import FeatureEngineeringProcess


In [7]:
#date_fe = FeatureEngineeringProcess.datetime_transform
feature_engineering = FeatureEngineeringProcess()
df = feature_engineering.datetime_transform(df, date_feature='date', 
                                            features=['month', 'day', 'day_name', 'week', 'year',
                                                       'quarter', 'season','holidays'],
                                                       )

updated ___


In [None]:
df.head()

In [None]:
df.date_holidays.value_counts()

In [None]:
from datetime import date
import holidays

us_holidays = holidays.US()  # this is a dict
# the below is the same, but takes a string:
us_holidays = holidays.country_holidays('US')  # this is a dict



date(2015, 1, 1) in us_holidays  # True
date(2015, 1, 2) in us_holidays  # False
us_holidays.get('2014-01-01')  # "New Year's Day"

In [None]:
holidays.CountryHoliday('PL', years=2022).keys()

In [None]:
print(df.head())

In [40]:
feature_engineering = FeatureEngineeringProcess()

In [24]:
df.head()

Unnamed: 0,date,product,SKU,category,price,quantity,competitor_price,discount,black_friday_date,views,clicks,favorites,buys
0,2022-01-01,Product_21,SKU_34,Grocery,127.347405,15,126.257644,0,2023-11-24,57,29,6,7
1,2022-01-02,Product_32,SKU_6,Clothing,119.165077,5,129.504709,5,2023-11-24,93,27,7,1
2,2022-01-03,Product_23,SKU_2,Electronics,104.545228,9,98.015648,20,2023-11-24,40,16,6,3
3,2022-01-04,Product_33,SKU_13,Grocery,114.959609,5,127.093423,10,2023-11-24,24,2,13,2
4,2022-01-05,Product_3,SKU_43,Grocery,120.102043,11,120.733836,10,2023-11-24,77,15,18,1


In [37]:
df.head()

Unnamed: 0,date,product,SKU,category,price,quantity,competitor_price,discount,black_friday_date,views,clicks,favorites,buys,quantity_mean,quantity_min,quantity_max,views_mean,views_min,views_max
0,2022-01-01,Product_21,SKU_34,Grocery,127.347405,15,126.257644,0,2023-11-24,57,29,6,7,40.25,1,69,40.25,1,69
1,2022-01-02,Product_32,SKU_6,Clothing,119.165077,5,129.504709,5,2023-11-24,93,27,7,1,37.428571,4,95,37.428571,4,95
2,2022-01-03,Product_23,SKU_2,Electronics,104.545228,9,98.015648,20,2023-11-24,40,16,6,3,46.0,1,94,46.0,1,94
3,2022-01-04,Product_33,SKU_13,Grocery,114.959609,5,127.093423,10,2023-11-24,24,2,13,2,50.428571,3,95,50.428571,3,95
4,2022-01-05,Product_3,SKU_43,Grocery,120.102043,11,120.733836,10,2023-11-24,77,15,18,1,68.272727,12,99,68.272727,12,99


In [41]:
feature_engineering.grouped_feature_eng(df, group_features=['SKU'], features=['quantity', 'views'])


Unnamed: 0,date,product,SKU,category,price,quantity,competitor_price,discount,black_friday_date,views,...,quantity_max,views_mean,views_min,views_max,SKU_quantity_mean,SKU_quantity_min,SKU_quantity_max,SKU_views_mean,SKU_views_min,SKU_views_max
0,2022-01-01,Product_21,SKU_34,Grocery,127.347405,15,126.257644,0,2023-11-24,57,...,69,40.250000,1,69,11.875000,9,17,40.250000,1,69
1,2022-01-02,Product_32,SKU_6,Clothing,119.165077,5,129.504709,5,2023-11-24,93,...,95,37.428571,4,95,8.357143,3,13,37.428571,4,95
2,2022-01-03,Product_23,SKU_2,Electronics,104.545228,9,98.015648,20,2023-11-24,40,...,94,46.000000,1,94,9.818182,5,13,46.000000,1,94
3,2022-01-04,Product_33,SKU_13,Grocery,114.959609,5,127.093423,10,2023-11-24,24,...,95,50.428571,3,95,10.142857,5,15,50.428571,3,95
4,2022-01-05,Product_3,SKU_43,Grocery,120.102043,11,120.733836,10,2023-11-24,77,...,99,68.272727,12,99,10.000000,6,14,68.272727,12,99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,2023-05-11,Product_10,SKU_7,Grocery,107.896515,14,121.971779,20,2023-11-24,96,...,96,46.555556,12,96,12.444444,9,18,46.555556,12,96
496,2023-05-12,Product_17,SKU_33,Grocery,88.375297,15,84.834681,0,2023-11-24,35,...,98,48.800000,3,98,9.400000,4,15,48.800000,3,98
497,2023-05-13,Product_20,SKU_23,Clothing,82.089812,7,73.451166,5,2023-11-24,13,...,98,50.750000,6,98,9.416667,4,20,50.750000,6,98
498,2023-05-14,Product_24,SKU_21,Electronics,88.664097,8,68.512037,5,2023-11-24,77,...,81,44.333333,17,81,11.333333,4,20,44.333333,17,81
