In [None]:
%load_ext autoreload
%autoreload 2


In [None]:
import pandas as pd
import numpy as np
def generate_synthetic_data(n=1000):


    # Set a seed for reproducibility
    np.random.seed(42)

    # Dates ranging over a year
    dates = pd.date_range(start='2022-01-01', periods=n)

    # Product categories
    categories = ['Electronics', 'Furniture', 'Grocery', 'Clothing']
    product_categories = np.random.choice(categories, n)

    # Products and SKUs
    products = [f'Product_{i}' for i in range(1, 51)]  # 50 products
    product_list = np.random.choice(products, n)
    SKUs = [f'SKU_{i}' for i in range(1, 51)]  # 50 SKUs
    sku_list = np.random.choice(SKUs, n)

    # Prices normally distributed and positive
    prices = np.abs(np.random.normal(loc=100, scale=20, size=n))

    # Quantity sold - we assume that it's a Poisson distribution
    quantity = np.random.poisson(lam=10, size=n)

    # Competitor's price - usually it would be around the actual product price
    competitor_prices = prices + np.random.normal(loc=0, scale=10, size=n)

    # Discounts
    discounts = np.random.choice([0, 5, 10, 15, 20], size=n)  # in percentage

    # Create a dataframe
    df = pd.DataFrame({
        'date': dates,
        'product': product_list,
        'SKU': sku_list,
        'category': product_categories,
        'price': prices,
        'quantity': quantity,
        'competitor_price': competitor_prices,
        'discount': discounts
    })

    # Black Friday dates
    df['black_friday_date'] = pd.to_datetime(['2022-11-25' if date.month >= 11 else '2023-11-24' for date in df['date']])

    # Additional fields for feature engineering
    df['views'] = np.random.randint(0, 100, size=n)
    df['clicks'] = np.random.randint(0, 50, size=n)
    df['favorites'] = np.random.randint(0, 20, size=n)
    df['buys'] = np.random.randint(0, 10, size=n)
    

    #df.to_csv('synthetic_data.csv', index=False)
    print(f"Synthetic data of size {n} generated and saved to 'synthetic_data.csv'")

    return df

# Call the function with a custom size



In [None]:
df = generate_synthetic_data(n=500)

In [None]:
df.head()


In [None]:
import sys
sys.path.append('../src')

In [None]:
from data.feature_engineering import FeatureEngineeringProcess


In [None]:
#date_fe = FeatureEngineeringProcess.datetime_transform
feature_engineering = FeatureEngineeringProcess()
df = feature_engineering.datetime_transform(df, date_feature='date', 
                                            features=['month', 'day', 'day_name', 'week', 'year',
                                                       'quarter', 'season','holidays'],
                                                       )

In [None]:
df.head()

In [None]:
df.date_holidays.value_counts()

In [None]:
from datetime import date
import holidays

us_holidays = holidays.US()  # this is a dict
# the below is the same, but takes a string:
us_holidays = holidays.country_holidays('US')  # this is a dict



date(2015, 1, 1) in us_holidays  # True
date(2015, 1, 2) in us_holidays  # False
us_holidays.get('2014-01-01')  # "New Year's Day"

In [None]:
holidays.CountryHoliday('PL', years=2022).keys()

In [None]:
print(df.head())

In [None]:
feature_engineering = FeatureEngineeringProcess()

In [None]:
df.head()

In [None]:
df.head()

In [None]:
feature_engineering.grouped_feature_eng(df, group_features=['SKU'], features=['quantity', 'views'])


In [1]:
import pandas as pd
looker = pd.read_csv('merged_df.csv')

In [2]:
looker.head()

Unnamed: 0,sku,sale_price,date,daily_sales,status,created_at,shipped_at,delivered_at,returned_at,gender,cost,category,name,brand,retail_price,department
0,00003E3B9E5336685200AE85D21B4F5E,99.0,2022-05-14,2,Shipped,2022-05-14 21:24:59+00:00,2022-05-15 20:22:00+00:00,,,F,56.232,Pants & Capris,Anne Klein Women's Crop Pant,Anne Klein,99.0,Women
1,00003E3B9E5336685200AE85D21B4F5E,99.0,2022-12-09,1,Complete,2022-12-09 03:58:35+00:00,2022-12-10 03:49:00+00:00,2022-12-10 13:54:00+00:00,,F,56.232,Pants & Capris,Anne Klein Women's Crop Pant,Anne Klein,99.0,Women
2,00003E3B9E5336685200AE85D21B4F5E,99.0,2023-05-12,1,Complete,2023-05-12 23:08:53+00:00,2023-05-15 07:59:00+00:00,2023-05-18 01:27:00+00:00,,F,56.232,Pants & Capris,Anne Klein Women's Crop Pant,Anne Klein,99.0,Women
3,0004D0B59E19461FF126E3A08A814C33,79.949997,2020-08-31,1,Processing,2020-08-31 08:42:06+00:00,,,,F,37.656449,Fashion Hoodies & Sweatshirts,The Bradford Exchange Breast Cancer Support Wo...,Bradford Exchange,79.949997,Women
4,0004D0B59E19461FF126E3A08A814C33,79.949997,2022-10-19,1,Shipped,2022-10-19 10:38:15+00:00,2022-10-21 14:40:00+00:00,,,F,37.656449,Fashion Hoodies & Sweatshirts,The Bradford Exchange Breast Cancer Support Wo...,Bradford Exchange,79.949997,Women


## TEST DATABASE

In [4]:
import pandas as pd
import numpy as np
import random

# Set random seed for reproducibility
np.random.seed(0)

product_names = ["Anne Klein Women's Crop Pant",
 "Anne Klein Women's Crop Pant",
 "Anne Klein Women's Crop Pant",
 "The Bradford Exchange Breast Cancer Support Women's Hoodie: Celebrate Life White",
 "The Bradford Exchange Breast Cancer Support Women's Hoodie: Celebrate Life White",
 "The Bradford Exchange Breast Cancer Support Women's Hoodie: Celebrate Life White",
 "The Bradford Exchange Breast Cancer Support Women's Hoodie: Celebrate Life White",
 "The Bradford Exchange Breast Cancer Support Women's Hoodie: Celebrate Life White",
 'Paul Fredrick Twill Weave Double-Breasted Suit Jacket',
 'Paul Fredrick Twill Weave Double-Breasted Suit Jacket',
 'Paul Fredrick Twill Weave Double-Breasted Suit Jacket',
 'Paul Fredrick Twill Weave Double-Breasted Suit Jacket',
 'Paul Fredrick Twill Weave Double-Breasted Suit Jacket',
 'Paul Fredrick Twill Weave Double-Breasted Suit Jacket',
 'Paul Fredrick Twill Weave Double-Breasted Suit Jacket',
 "IZOD Men's Essential V-Neck Sweater",
 "IZOD Men's Essential V-Neck Sweater",
 "IZOD Men's Essential V-Neck Sweater",
 "IZOD Men's Essential V-Neck Sweater",
 "IZOD Men's Essential V-Neck Sweater",
 "IZOD Men's Essential V-Neck Sweater",
 "Toddland Men's Snuggle Snake Boxer Briefs",
 "Toddland Men's Snuggle Snake Boxer Briefs",
 "Toddland Men's Snuggle Snake Boxer Briefs",
 "Toddland Men's Snuggle Snake Boxer Briefs",
 "Toddland Men's Snuggle Snake Boxer Briefs",
 "Toddland Men's Snuggle Snake Boxer Briefs",
 "Toddland Men's Snuggle Snake Boxer Briefs",
 "Toddland Men's Snuggle Snake Boxer Briefs",
 "Hue Sleepwear Women's Rhine Drinks Short Sleeve Pajama Set",
 "Hue Sleepwear Women's Rhine Drinks Short Sleeve Pajama Set",
 "Hue Sleepwear Women's Rhine Drinks Short Sleeve Pajama Set",
 "True Religion Women's Cameron Boyfriend Jean",
 "True Religion Women's Cameron Boyfriend Jean",
 "True Religion Women's Cameron Boyfriend Jean",
 "True Religion Women's Cameron Boyfriend Jean",
 "True Religion Women's Cameron Boyfriend Jean",
 "True Religion Women's Cameron Boyfriend Jean",
 "True Religion Women's Cameron Boyfriend Jean",
 "True Religion Women's Cameron Boyfriend Jean",
 'Summer Zebra Patch Animal Print Full Leggings Long Stretchy Footless Brown',
 'Summer Zebra Patch Animal Print Full Leggings Long Stretchy Footless Brown',
 'Summer Zebra Patch Animal Print Full Leggings Long Stretchy Footless Brown',
 'Summer Zebra Patch Animal Print Full Leggings Long Stretchy Footless Brown',
 'Summer Zebra Patch Animal Print Full Leggings Long Stretchy Footless Brown',
 'Summer Zebra Patch Animal Print Full Leggings Long Stretchy Footless Brown',
 'Half Slip Black24-L',
 'Half Slip Black24-L',
 'Half Slip Black24-L',
 'Half Slip Black24-L']

categories = ['Pants & Capris',
 'Pants & Capris',
 'Pants & Capris',
 'Fashion Hoodies & Sweatshirts',
 'Fashion Hoodies & Sweatshirts',
 'Fashion Hoodies & Sweatshirts',
 'Fashion Hoodies & Sweatshirts',
 'Fashion Hoodies & Sweatshirts',
 'Suits & Sport Coats',
 'Suits & Sport Coats',
 'Suits & Sport Coats',
 'Suits & Sport Coats',
 'Suits & Sport Coats',
 'Suits & Sport Coats',
 'Suits & Sport Coats',
 'Sweaters',
 'Sweaters',
 'Sweaters',
 'Sweaters',
 'Sweaters',
 'Sweaters',
 'Underwear',
 'Underwear',
 'Underwear',
 'Underwear',
 'Underwear',
 'Underwear',
 'Underwear',
 'Underwear',
 'Sleep & Lounge',
 'Sleep & Lounge',
 'Sleep & Lounge',
 'Jeans',
 'Jeans',
 'Jeans',
 'Jeans',
 'Jeans',
 'Jeans',
 'Jeans',
 'Jeans',
 'Leggings',
 'Leggings',
 'Leggings',
 'Leggings',
 'Leggings',
 'Leggings',
 'Intimates',
 'Intimates',
 'Intimates',
 'Intimates']

# Number of records
num_records = 50

# SKU: Generating unique SKU codes
sku_codes = [f"SKU{i:05d}" for i in range(num_records)]


product_names = product_names

category = categories

# Category_id: Randomly assigning category IDs (assuming 20 different categories)
category_ids = np.random.choice(range(1, 21), size=num_records)

# ML Status: Randomly assigned statuses
ml_statuses = np.random.choice(["REPRICED", "MATCHED", "NOT_FOUND", "NEED_ACTION"], size=num_records)

# Price: Randomly generated prices within a realistic range ($5 to $1000)
prices = np.random.uniform(5, 1000, size=num_records).round(2)

# Discount: Randomly generated discounts (0% to 50%)
discounts = np.random.uniform(0, 0.5, size=num_records).round(2)

# Cost: Randomly generated, but always less than the selling price
costs = (prices * np.random.uniform(0.5, 0.9)).round(2)

# Daily_Sales: Randomly generated number of daily sales (1 to 100)
daily_sales = np.random.randint(1, 101, size=num_records)

# Daily_Revenue: Derived from daily sales and price
daily_revenue = (prices * daily_sales).round(2)

# Daily_Profit: Derived from daily revenue and cost
daily_profit = (daily_revenue - costs * daily_sales).round(2)
# Potential: -/+% of profit that can be achieved by repricing
potential = np.random.uniform(-0.1, 0.1, size=num_records).round(2)

# Create DataFrame
product_list = pd.DataFrame({
    'SKU': sku_codes,
    'Name': product_names,
    'Category': category,
    'Category_id': category_ids,
    'ML_Status': ml_statuses,
    'Price': prices,
    'Discount': discounts,
    'Cost': costs,
    'Daily_Sales': daily_sales,
    'Daily_Revenue': daily_revenue,
    'Daily_Profit': daily_profit,
    'Potential': potential
})

product_list.head()


Unnamed: 0,SKU,Name,Category,Category_id,ML_Status,Price,Discount,Cost,Daily_Sales,Daily_Revenue,Daily_Profit,Potential
0,SKU00000,Anne Klein Women's Crop Pant,Pants & Capris,13,REPRICED,676.38,0.49,451.84,51,34495.38,11451.54,0.09
1,SKU00001,Anne Klein Women's Crop Pant,Pants & Capris,16,REPRICED,320.62,0.06,214.18,4,1282.48,425.76,0.04
2,SKU00002,Anne Klein Women's Crop Pant,Pants & Capris,1,MATCHED,779.45,0.38,520.7,32,24942.4,8280.0,-0.08
3,SKU00003,The Bradford Exchange Breast Cancer Support Wo...,Fashion Hoodies & Sweatshirts,4,NEED_ACTION,949.82,0.21,634.51,10,9498.2,3153.1,-0.09
4,SKU00004,The Bradford Exchange Breast Cancer Support Wo...,Fashion Hoodies & Sweatshirts,4,REPRICED,664.21,0.34,443.71,11,7306.31,2425.5,-0.0


In [5]:
np.random.seed(0)

# Set forecast period
#forecast_period = 10


# Function to generate realistic synthetic demand curves
def generate_realistic_demand_curve(daily_metric, base_price, price_multipliers):
    # Generate demand predictions around the daily_metric with some variability and constraints
    demand_predictions = [min(max(daily_metric * np.random.uniform(0.9, 1.1), daily_metric * 0.8), daily_metric * 1.2) for _ in price_multipliers]
    price_points = [base_price * factor for factor in price_multipliers]
    return price_points, demand_predictions

# Generating realistic synthetic demand curves for each SKU
ml_curve_sales_realistic = []
ml_curve_profit_realistic = []

for index, row in product_list.iterrows():
    # For sales curve, use Daily_Sales as the daily metric
    _, sales_predictions = generate_realistic_demand_curve(row['Daily_Sales'], row['Price'], price_multipliers)
    ml_curve_sales_realistic.append(sales_predictions)
    
    # For profit curve, use Daily_Profit as the daily metric
    _, profit_predictions = generate_realistic_demand_curve(row['Daily_Profit'], row['Price'], price_multipliers)
    ml_curve_profit_realistic.append(profit_predictions)

# Add realistic ML Curve columns to the DataFrame
product_list['ML_Curve_Sales'] = ml_curve_sales_realistic
product_list['ML_Curve_Profit'] = ml_curve_profit_realistic


# Display sample data with realistic ML curves
#product_list[['SKU', 'Daily_Sales', 'Daily_Profit', 'ML_Curve_Sales_Realistic', 'ML_Curve_Profit_Realistic']].head()




NameError: name 'price_multipliers' is not defined

In [16]:
product_list.head()

Unnamed: 0,SKU,Name,Category_id,ML_Status,Price,Discount,Cost,Daily_Sales,Daily_Revenue,Daily_Profit,Potential,ML_Curve_Sales,ML_Curve_Profit
0,SKU00000,Product000,13,REPRICED,676.38,0.49,451.84,51,34495.38,11451.54,0.09,"[51.49789774005872, 53.19493153699869, 52.0481...","[12119.680188521033, 11517.718265869435, 11607..."
1,SKU00001,Product001,16,REPRICED,320.62,0.06,214.18,4,1282.48,425.76,0.04,"[4.382894673786211, 4.239326851373379, 3.96918...","[405.7114394819332, 449.1115471227024, 422.026..."
2,SKU00002,Product002,1,MATCHED,779.45,0.38,520.7,32,24942.4,8280.0,-0.08,"[31.100850563672232, 31.597004504315787, 33.26...","[8396.24585181201, 8178.324106293603, 9088.747..."
3,SKU00003,Product003,4,NEED_ACTION,949.82,0.21,634.51,10,9498.2,3153.1,-0.09,"[9.317939167291039, 9.220750282328611, 10.3126...","[3453.5648678267476, 3133.3308207830737, 3453...."
4,SKU00004,Product004,4,REPRICED,664.21,0.34,443.71,11,7306.31,2425.5,-0.0,"[10.599562994666748, 10.811378587932275, 10.04...","[2337.4877988341373, 2506.710775320382, 2246.8..."


In [20]:
# Step 3: Calculate Forecast_Sales, Forecast_Profit, and Forecast_Revenue

# Set the forecast period (for example, 10 days)
forecast_period = 10

# Function to calculate forecasts based on ML curves
def calculate_forecasts(ml_curve, daily_metric, forecast_period):
    # Using the median of ML curve for the forecast
    forecast = np.median(ml_curve) * forecast_period
    # Applying constraints to ensure forecast is within a reasonable range of the daily metric
    forecast = min(max(forecast, daily_metric * forecast_period * 0.7), daily_metric * forecast_period * 0.8)
    return forecast

# Calculate Forecast_Sales, Forecast_Profit, and Forecast_Revenue
forecast_sales = []
forecast_profit = []
forecast_revenue = []

for index, row in product_list.iterrows():
    # Calculate forecasts based on the adjusted ML curves
    forecast_sales.append(calculate_forecasts(row['ML_Curve_Sales'], row['Daily_Sales'], forecast_period))
    forecast_profit.append(calculate_forecasts(row['ML_Curve_Profit'], row['Daily_Profit'], forecast_period))
    
    # Calculate forecast revenue based on forecast sales and price
    forecast_revenue.append(forecast_sales[-1] * row['Price'])

# Add Forecast columns to the DataFrame
product_list['Forecast_Sales'] = forecast_sales
product_list['Forecast_Profit'] = forecast_profit
product_list['Forecast_Revenue'] = forecast_revenue

# Now, you can proceed to analyze or save the DataFrame


In [23]:
product_list.tail()

Unnamed: 0,SKU,Name,Category_id,ML_Status,Price,Discount,Cost,Daily_Sales,Daily_Revenue,Daily_Profit,Potential,ML_Curve_Sales,ML_Curve_Profit,Forecast_Sales,Forecast_Profit,Forecast_Revenue
45,SKU00045,Product045,1,NEED_ACTION,896.56,0.21,598.93,3,2689.68,892.89,0.05,"[3.1865031690773713, 2.908915165647912, 2.8268...","[964.2974032314594, 813.7580271836592, 978.694...",24.0,7143.12,21517.44
46,SKU00046,Product046,1,NOT_FOUND,640.73,0.07,428.03,70,44851.1,14889.0,0.03,"[72.20246483263999, 70.24256516922513, 69.7895...","[16089.49901839461, 13527.91965479498, 14391.9...",560.0,119112.0,358808.8
47,SKU00047,Product047,5,NEED_ACTION,892.1,0.02,595.95,13,11597.3,3849.95,-0.04,"[12.064998290278302, 12.291142367601505, 11.87...","[3632.5532128340237, 4214.785658384439, 3589.9...",104.0,30799.6,92778.4
48,SKU00048,Product048,6,NEED_ACTION,681.66,0.49,455.37,45,30674.7,10183.05,-0.07,"[42.48093474702816, 46.106046289671774, 41.503...","[10661.274081550346, 9524.20542040535, 11077.4...",360.0,81464.4,245397.6
49,SKU00049,Product049,7,MATCHED,451.95,0.19,301.92,67,30280.65,10052.01,0.04,"[72.65103549606917, 71.09053566476095, 70.0008...","[9071.277918261041, 9695.82614794575, 9508.331...",536.0,80416.08,242245.2


In [22]:
from datetime import datetime, timedelta

# Set random seed for reproducibility
np.random.seed(0)

# Number of price changes per SKU
num_price_changes_per_sku = 5

# SKU: Use SKUs from the Product_List dataset
sku_codes_history = np.repeat(sku_codes, num_price_changes_per_sku)

# Last_Price: Randomly select a past price for each SKU (let's assume between 80% and 120% of current price)
last_prices = [price * np.random.uniform(0.8, 1.2) for price in product_list['Price']] * num_price_changes_per_sku

# New_Price: Generate a new price based on the last price (let's assume between 95% and 105% of last price)
new_prices = [last_price * np.random.uniform(0.95, 1.05) for last_price in last_prices]

# Date: Generate random dates within a range (let's assume within the last year)
start_date = datetime.now() - timedelta(days=365)
date_range = [start_date + timedelta(days=np.random.randint(0, 365)) for _ in sku_codes_history]

# Create DataFrame
price_changes_history = pd.DataFrame({
    'SKU': sku_codes_history,
    'Last_Price': last_prices,
    'New_Price': new_prices,
    'Date': date_range
})

# Sort DataFrame by SKU and Date
price_changes_history.sort_values(by=['SKU', 'Date'], inplace=True)
price_changes_history.reset_index(drop=True, inplace=True)

# Display sample data
price_changes_history.head(10)


Unnamed: 0,SKU,Last_Price,New_Price,Date
0,SKU00000,811.489565,851.120593,2022-11-21 17:19:35.585636
1,SKU00000,966.872378,928.39519,2022-12-11 17:19:35.585636
2,SKU00000,643.926302,625.18011,2023-07-01 17:19:35.585636
3,SKU00000,348.217606,346.079602,2023-09-21 17:19:35.585636
4,SKU00000,689.586591,694.427266,2023-10-04 17:19:35.585636
5,SKU00001,19.579616,18.916473,2023-01-08 17:19:35.585636
6,SKU00001,609.133543,618.459885,2023-01-28 17:19:35.585636
7,SKU00001,837.827363,816.414639,2023-07-10 17:19:35.585636
8,SKU00001,1152.378773,1148.496498,2023-09-06 17:19:35.585636
9,SKU00001,781.114156,761.843414,2023-09-13 17:19:35.585636


In [29]:
price_changes_history.head()

Unnamed: 0,SKU,Last_Price,New_Price,Date
0,SKU00000,811.489565,851.120593,2022-11-21 17:19:35.585636
1,SKU00000,966.872378,928.39519,2022-12-11 17:19:35.585636
2,SKU00000,643.926302,625.18011,2023-07-01 17:19:35.585636
3,SKU00000,348.217606,346.079602,2023-09-21 17:19:35.585636
4,SKU00000,689.586591,694.427266,2023-10-04 17:19:35.585636


In [30]:
product_list.to_csv('product_list.csv', index=False)
price_changes_history.to_csv('price_changes_history.csv', index=False)

In [20]:
product_list = pd.read_csv('product_list.csv')
price_changes_history = pd.read_csv('price_changes_history.csv')

In [21]:
product_list['Name'] = product_names 
product_list['Category'] = categories

In [22]:
price_changes_history.head()

Unnamed: 0,SKU,Last_Price,New_Price,Date
0,SKU00000,811.489565,851.120593,2022-11-21 17:19:35.585636
1,SKU00000,966.872378,928.39519,2022-12-11 17:19:35.585636
2,SKU00000,643.926302,625.18011,2023-07-01 17:19:35.585636
3,SKU00000,348.217606,346.079602,2023-09-21 17:19:35.585636
4,SKU00000,689.586591,694.427266,2023-10-04 17:19:35.585636


In [27]:
import sqlite3

conn_pc = sqlite3.connect('price_changes_history.db')


In [28]:
price_changes_history.to_sql('price_changes_history', conn_pc, if_exists='replace', index=False)


250

In [30]:
query = pd.read_sql_query("SELECT * FROM price_changes_history", conn_pc)
print(query.head())


        SKU  Last_Price   New_Price                        Date
0  SKU00000  811.489565  851.120593  2022-11-21 17:19:35.585636
1  SKU00000  966.872378  928.395190  2022-12-11 17:19:35.585636
2  SKU00000  643.926302  625.180110  2023-07-01 17:19:35.585636
3  SKU00000  348.217606  346.079602  2023-09-21 17:19:35.585636
4  SKU00000  689.586591  694.427266  2023-10-04 17:19:35.585636


In [31]:
conn_pc.close()


In [None]:
# download product_list.db

import pandas as pd
import sqlite3




