In [56]:
pip install findspark pandas awswrangler numpy matplotlib seaborn pyspark


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
import findspark
import pandas as pd
import awswrangler as wr
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import matplotlib.ticker as ticker
import boto3
import statistics
import sys
import argparse
import os

In [None]:
# Configuration of the synthetic dataset generator. Every option has a
# default, so the cell also runs when no options are supplied.
parser = argparse.ArgumentParser()
parser.add_argument('--actualQuantity', '-a', help="Actual Quantity, default = 700", type=int, default=700)
# The option string must not contain whitespace: argparse derives the
# attribute name from it, and `args.numberOfWeeks` is read below.
parser.add_argument('--numberOfWeeks', '-w', help="Number of Weeks, default = 40", type=int, default=40)
parser.add_argument('--numberOfModels', '-nm', help="Number of Models, default = 5", type=int, default=5)
parser.add_argument('--numberOfParts', '-p', help="Number of parts per model, default = 300", type=int, default=300)
parser.add_argument('--icsPenalty', '-ics', help="Installation Conditions penalty, default = 1", type=int, default=1)
parser.add_argument('--colorPenalty', '-c', help="Color Penalty, default = 1", type=int, default=1)
parser.add_argument('--indexPenalty', '-idx', help="Index penalty, default = 1", type=int, default=1)

# Skip argv[0] (the program path) and tolerate arguments that do not belong
# to this parser (e.g. a notebook kernel's own launch flags) instead of
# aborting with "unrecognized arguments".
args, _unrecognized_args = parser.parse_known_args(sys.argv[1:])
actual_quantity = args.actualQuantity
number_of_weeks = args.numberOfWeeks
number_of_models = args.numberOfModels
number_of_parts = args.numberOfParts

ics_penalty = args.icsPenalty
# Standard deviation used for the installation-condition noise, keyed by the
# installation-condition count (as a string) and scaled by ics_penalty.
penalty_per_ic = {
    "0": 1 * ics_penalty,
    "1": 2 * ics_penalty,
    "2": 3 * ics_penalty,
    "3": 4 * ics_penalty,
    "5": 5 * ics_penalty,
    "8": 6 * ics_penalty,
    "13": 7 * ics_penalty,
}
# The installation-condition values themselves, in declaration order.
ics = list(penalty_per_ic)
color_penalty = args.colorPenalty
index_penalty = args.indexPenalty

# Keep drawing random 3-character identifiers until number_of_models distinct
# ones have been collected; every accepted ID gets an empty forecast dict.
models = []
model_forecasts = {}
while len(models) < number_of_models:
    model_id = ''.join(
        np.random.choice(list('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890'), 3)
    )
    if model_id in models:
        # Collision with an ID that was already issued: draw again.
        continue
    models.append(model_id)
    model_forecasts[model_id] = {}

# For every model, build the forecast issued in forecast week 1: one row per
# target week, with dfQuantity drawn around actual_quantity (std dev 200).
for model_id in models:
    first_forecast_rows = []
    for target_week in range(1, number_of_weeks + 1):
        # A single scalar draw per row; sampling a whole vector only to keep
        # its first element would waste random draws.
        forecast_quantity = np.random.normal(actual_quantity, 200)
        first_forecast_rows.append({
            "partCodeId": "part" + model_id,
            "vehicleModelId": model_id,
            "numberOfInstallationConditions": None,
            "year": 2025,
            "week": target_week,
            "dfQuantity": forecast_quantity,
            "actualQuantity": actual_quantity,
            "forecastWeek": 1,
            "forecastDistance": target_week - 1,
        })
    model_forecasts[model_id][1] = first_forecast_rows

# For each later forecast week, re-forecast every target week that has not
# passed yet, starting from the rows issued one forecast week earlier. The
# standard deviation is 5 * (target week - forecast week + 1).
for forecast_week in range(2, number_of_weeks + 1):
    for model_id in models:
        refreshed_rows = []
        model_forecasts[model_id][forecast_week] = refreshed_rows
        for earlier_row in model_forecasts[model_id][forecast_week - 1]:
            target_week = earlier_row["week"]
            if target_week >= forecast_week:
                spread = 5 * (target_week - forecast_week + 1)
                sample = np.random.normal(earlier_row["actualQuantity"], spread, 1)
                refreshed_rows.append({
                    "partCodeId": earlier_row["partCodeId"],
                    "vehicleModelId": model_id,
                    "numberOfInstallationConditions": None,
                    "year": earlier_row["year"],
                    "week": target_week,
                    "dfQuantity": sample[0],
                    "actualQuantity": earlier_row["actualQuantity"],
                    "forecastWeek": forecast_week,
                    "forecastDistance": target_week - forecast_week,
                })


# generate number_of_parts parts per model; each part is expanded into one
# row per installation-condition value in `ics`
part_id_alphabet = list('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890')
parts_per_model = {}
for model in models:
    parts_per_model[model] = []
    # Base IDs already issued for this model. Uniqueness has to be checked
    # against these strings: parts_per_model[model] holds row dicts, so a
    # membership test of a string against that list can never succeed.
    used_part_ids = set()
    for _ in range(number_of_parts):
        part_id = ''.join(np.random.choice(part_id_alphabet, 10))
        while part_id in used_part_ids:
            part_id = ''.join(np.random.choice(part_id_alphabet, 10))
        used_part_ids.add(part_id)
        for installation_conditions in ics:
            # partCodeId layout: 10-char base ID, 1-char index (' ' when
            # absent), then an optional color suffix.
            [color] = np.random.choice(['', '', '', 'RED', 'GREEN', 'BLUE', 'GRAY'], 1)
            [index] = np.random.choice([' ', ' ', ' ', '1', '2', '3'], 1)
            parts_per_model[model].append({
                "partCodeId": f"{part_id}{index}{color}",
                "vehicleModelId": model,
                "year": 2025,
                "week": 1,
                # The model-level quantity is split evenly over the
                # installation-condition variants.
                "actualQuantity": actual_quantity / len(ics),
                "dfQuantity": None,
                "numberOfInstallationConditions": installation_conditions,
            })

# Build the part-level rows of forecast week 1. Each model-level row is
# split evenly across the installation-condition variants, then every part
# gets the average of three noisy draws around that share (color, index and
# installation-condition noise).
first_forecast_parts = []
all_parts_with_forecast = {1: first_forecast_parts}
for model in models:
    for model_row in model_forecasts[model][1]:
        target_week = model_row["week"]
        share_per_variant = model_row["dfQuantity"] / len(ics)

        for template in parts_per_model[model]:
            code = template["partCodeId"]
            # More than 11 characters means a color suffix is present;
            # position 10 carries the index character. When the attribute is
            # absent the standard deviation falls back to a fixed 3.
            if len(code) > 11:
                color_sigma = color_penalty
            else:
                color_sigma = 3
            if code[10] in ['1', '2', '3']:
                index_sigma = index_penalty
            else:
                index_sigma = 3
            ic_sigma = penalty_per_ic[str(template["numberOfInstallationConditions"])]

            # Draw order (color, index, installation conditions) is kept.
            color_draw = np.random.normal(share_per_variant, color_sigma, 1)
            index_draw = np.random.normal(share_per_variant, index_sigma, 1)
            ic_draw = np.random.normal(share_per_variant, ic_sigma, 1)
            blended = (color_draw + index_draw + ic_draw) / 3

            first_forecast_parts.append({
                "partCodeId": code,
                "vehicleModelId": template["vehicleModelId"],
                "year": 2025,
                "week": target_week,
                "actualQuantity": template["actualQuantity"],
                "dfQuantity": blended[0],
                "numberOfInstallationConditions": template["numberOfInstallationConditions"],
                "forecastWeek": 1,
                "forecastDistance": target_week - 1,
            })

# create subsequent forecasts for the rest of the weeks, taking the previous into consideration
for forecast_week in range(2, number_of_weeks + 1):
    all_parts_with_forecast[forecast_week] = []
    # Per-model lookup {target week -> model-level row} for this forecast
    # week. It is built lazily on first use so each part does a dict lookup
    # instead of rescanning the model's whole forecast list.
    week_index_by_model = {}
    for part in all_parts_with_forecast[forecast_week - 1]:
        # Target weeks that already passed are no longer forecast.
        if part["week"] < forecast_week:
            continue

        model_id = part["vehicleModelId"]
        week_index = week_index_by_model.get(model_id)
        if week_index is None:
            week_index = {}
            for candidate in model_forecasts[model_id][forecast_week]:
                # setdefault keeps the first row per week, matching a
                # first-match linear search.
                week_index.setdefault(candidate["week"], candidate)
            week_index_by_model[model_id] = week_index

        model_forecast = week_index.get(part["week"])
        if model_forecast is None:
            # No model-level forecast for this target week: emit nothing.
            continue

        part_factor_per_subpart = model_forecast["dfQuantity"] / len(ics)

        # Three noisy draws around the per-variant share; the standard
        # deviation falls back to 3 when the part has no color / index.
        color_factor = np.random.normal(part_factor_per_subpart, color_penalty if len(part["partCodeId"]) > 11 else 3, 1)
        index_factor = np.random.normal(part_factor_per_subpart, index_penalty if part["partCodeId"][10] != ' ' else 3, 1)
        ic_factor = np.random.normal(part_factor_per_subpart, penalty_per_ic[str(part["numberOfInstallationConditions"])], 1)

        new_df = (color_factor + index_factor + ic_factor) / 3

        all_parts_with_forecast[forecast_week].append({
            "partCodeId": part["partCodeId"],
            "vehicleModelId": part["vehicleModelId"],
            "year": 2025,
            "week": part["week"],
            "actualQuantity": part["actualQuantity"],
            "dfQuantity": new_df[0],
            "numberOfInstallationConditions": part["numberOfInstallationConditions"],
            "forecastWeek": forecast_week,
            "forecastDistance": part["week"] - forecast_week,
        })
        



# Flatten everything into one list of records: model-level rows first
# (grouped by model, then forecast week), followed by part-level rows in
# forecast-week order.
all_rows = []
for model in models:
    for forecast_week in range(1, number_of_weeks + 1):
        all_rows.extend(model_forecasts[model][forecast_week])
for forecast_week in range(1, number_of_weeks + 1):
    all_rows.extend(all_parts_with_forecast[forecast_week])

df = pd.DataFrame(all_rows, columns=["partCodeId", "vehicleModelId", "numberOfInstallationConditions", "year", "week", "dfQuantity", "actualQuantity", "forecastWeek", "forecastDistance"])

# Forecast week f covers target weeks f..number_of_weeks, so every part (and
# every model) contributes 1 + 2 + ... + number_of_weeks rows. The product
# n * (n + 1) is always even, so integer division is exact.
records_per_part = (number_of_weeks * (number_of_weeks + 1)) // 2
expected_rows = (records_per_part * number_of_parts * number_of_models * len(ics)) + (records_per_part * number_of_models)
# Verify the count before reporting success rather than only printing the
# expected figure.
if len(df) != expected_rows:
    raise RuntimeError(f"Generated {len(df)} rows but expected {expected_rows}")

#delete ./artificialDS.parquet if it exists in my computer
if os.path.exists("./artificialDS.parquet"):
    os.remove("./artificialDS.parquet")
df.to_parquet("./artificialDS.parquet", index=False)
print("Success, number of rows should be", expected_rows)

Success, number of rows should be 8614100.0
