In [1]:
import pyspark.sql.functions as F
from pyspark.sql import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
import pandas as pd

from feature_engineering.engineering import engineerFeatures
from utils.utils import gapfilling, serialize
from modelling.model_utils import splitData, prepare_data, train_model, MLFlow_train_model
import mlflow

In [2]:
# Create the Spark session used by every cell below, or reuse one that is
# already running in this kernel.
spark = (
    SparkSession.builder
    .appName("test-app")
    .getOrCreate()
)

# -1 turns off automatic broadcast joins (see the Spark SQL tuning guide).
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

/usr/local/lib/python3.10/site-packages/pyspark/bin/load-spark-env.sh: line 68: ps: command not found
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/05 22:42:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# TODO: move these configuration dictionaries to a JSON config file.

# Feature settings handed to engineerFeatures, splitData and prepare_data.
# Each top-level key is mapped to a dictionary of options (empty when there
# is nothing to configure).
features_config = {
    "discount_rate": {},
    "promoted_percent": {
        "promoted_hierarchy": "sku",
        "group_key": "subclass",
    },
    "week_of_year": {},
    "promo_category": {},
}

# Model settings handed to splitData, prepare_data and the training helpers.
# NOTE(review): inference_startDate (2019-11-01) falls before train_endDate
# (2020-01-01), so the two windows overlap - confirm this is intended.
model_config = {
    "model": "xgboost",
    "params": {},
    "hierarchy_columns": [
        "sku",
        "subclass",
        "store_id",
        "region_id",
    ],
    "target": "units",
    "train_startDate": "2018-01-01",
    "train_endDate": "2020-01-01",
    "inference_startDate": "2019-11-01",
    "inference_endDate": "2020-12-21",
}

# Directory prefix prepended to every intermediate parquet file name.
path = "data/"

In [None]:
# Union transactional data: every file matching data/transactions_*.csv is
# read into a single DataFrame.

# The files are read with header=False, so column names and types come from
# this explicit schema (all fields nullable).
transaction_fields = [
    ("customer_id", StringType()),
    ("week_index", StringType()),
    ("sku", StringType()),
    ("promo_cat", StringType()),
    ("discount", FloatType()),
    ("store_id", StringType()),
]
schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in transaction_fields]
)

transactions = spark.read.csv(
    "data/transactions_*.csv",
    schema=schema,
    header=False,
)

In [None]:
# Reading data: the four reference tables are loaded with the same CSV
# options (header row present, column types inferred).
csv_options = {"header": "true", "inferSchema": "true"}

customers = spark.read.csv("data/customers.csv", **csv_options)
calendar = spark.read.csv("data/calendar.csv", **csv_options)
products = spark.read.csv("data/products.csv", **csv_options)
stores = spark.read.csv("data/stores.csv", **csv_options)

In [None]:
# Adding location hierarchy for customers
#
# NOTE: this cell overwrites the raw `customers` and `stores` frames, so it
# cannot be re-run on its own - re-run the reading cell first.

# Customers: keep the id and the preferred store (renamed to store_id),
# both as strings.
customer_columns = [
    F.col("customer_id").cast("string"),
    F.col("store_pref").cast("string").alias("store_id"),
]
customers = customers.select(*customer_columns)

# Stores: keep the id and its region (renamed to region_id) as strings and
# drop repeated rows.
store_columns = [
    F.col("store_id").cast("string"),
    F.col("store_region").cast("string").alias("region_id"),
]
stores = stores.select(*store_columns).dropDuplicates()

In [None]:
# Products: rename the raw prod_* columns to the hierarchy names used in the
# rest of the notebook, cast them, and drop repeated rows.
# NOTE: overwrites the raw `products` frame, so re-run the reading cell
# before re-running this one.
product_column_map = [
    # (raw column, target type, new name)
    ("prod_id", "string", "sku"),
    ("prod_subclass", "string", "subclass"),
    ("prod_class", "string", "class"),
    ("prod_dept", "string", "dept"),
    ("prod_base_price", "float", "base_price"),
]
products = products.select(
    *[
        F.col(raw_name).cast(target_type).alias(new_name)
        for raw_name, target_type, new_name in product_column_map
    ]
).dropDuplicates()

products.show(5)

In [None]:
# daily calendar -> weekly calendar
#
# Keep the calendar rows whose day_of_week equals "0", parse calendar_day
# (format MM-dd-yy) into a date, de-duplicate, and number the resulting
# dates 0, 1, 2, ... in chronological order.
#
# The index is built with row_number() over a date-ordered window instead of
# monotonically_increasing_id(). That function only guarantees increasing,
# unique ids, not consecutive ones: it encodes the partition id in the upper
# bits, so once the sorted data spans more than one partition the ids jump
# and no longer line up with transactions.week_index in the join below.
# A window without partitionBy pulls all rows into a single partition; this
# cell assumes the weekly calendar is small enough for that.
# `Window` is available through `from pyspark.sql import *`.
week_dates = calendar.where(
    F.col("day_of_week") == "0"
).select(
    F.to_date(F.col("calendar_day"), "MM-dd-yy").alias("date")
).distinct()

# row_number() starts at 1; subtract 1 to keep the zero-based numbering the
# previous implementation produced for its first rows.
chronological = Window.orderBy(F.col("date").asc())

weekly_calendar = week_dates.withColumn(
    "week_index", F.row_number().over(chronological) - 1
).select(
    F.col("week_index").cast("string"),
    F.col("date")
)
weekly_calendar.show(5)

In [None]:
# Weekly demand: one row per (sku, store_id, week_index).
#   units     - number of transaction rows in the group
#   promo_cat - first promo_cat value encountered in the group
#   discount  - largest discount in the group
demand_aggregations = [
    F.count("*").alias("units"),
    F.first("promo_cat").alias("promo_cat"),
    F.max("discount").alias("discount"),
]
weekly_demand = transactions.groupby(
    "sku", "store_id", "week_index"
).agg(*demand_aggregations)

# Replace week_index by its calendar date. The join is an inner join, so
# groups whose week_index is absent from weekly_calendar are dropped.
demand_data = weekly_demand.join(
    weekly_calendar, on=["week_index"], how="inner"
).drop("week_index")

# Persist the result and preview what serialize returns.
serialize(spark, demand_data, path + "demand_data.parquet").show(5)


In [None]:
# Reload the weekly demand written by the previous cell.
# NOTE(review): header/inferSchema look like CSV reader options; presumably
# they have no effect on a parquet read - confirm and drop if so.
reader_options = {"header": "true", "inferSchema": "true"}
demand_data = spark.read.parquet(path + "demand_data.parquet", **reader_options)

# Gap-fill the demand along the date / product / location columns.
sales_filled_data = gapfilling(
    demand_data,
    date_column="date",
    product_column="sku",
    location_column="store_id",
)

serialize(spark, sales_filled_data, path + "sales_filled_data.parquet").show(5)


In [None]:
# Adding product and location hierarchies to demand data
demand_data = spark.read.parquet(path + "sales_filled_data.parquet", header="true", inferSchema="true")

sales_data = demand_data.join(
    stores, on="store_id", how="inner"
).join(
    products, on="sku", how="inner"
)

serialize(spark, sales_data, path + "sales_data.parquet").show(5)

## Feature Engineering

In [None]:
# Reload the hierarchy-enriched sales data written by the previous cell.
# NOTE(review): header/inferSchema look like CSV reader options; presumably
# they have no effect on a parquet read - confirm and drop if so.
reader_options = {"header": "true", "inferSchema": "true"}
sales_data = spark.read.parquet(path + "sales_data.parquet", **reader_options)

# Build the features listed in features_config.
engineered_data = engineerFeatures(data=sales_data, config=features_config)

serialize(spark, engineered_data, path + "engineered_data.parquet").show(5)

## Modelling

In [None]:
# Reload the engineered features written by the previous cell.
# NOTE(review): header/inferSchema look like CSV reader options; presumably
# they have no effect on a parquet read - confirm and drop if so.
reader_options = {"header": "true", "inferSchema": "true"}
engineered_data = spark.read.parquet(path + "engineered_data.parquet", **reader_options)

# Just to reduce the size of the data for less memory consumption.
# NOTE(review): sku was cast to string upstream, so this comparison with an
# integer relies on Spark's implicit type coercion - confirm it is intended.
sku_upper_bound = 400
engineered_data = engineered_data.filter(F.col("sku") < sku_upper_bound)

# Split into the training and test frames using the settings in
# model_config and features_config.
train_data, test_data = splitData(
    data=engineered_data,
    model_config=model_config,
    features_config=features_config,
)

for split_frame, file_name in (
    (train_data, "train_data.parquet"),
    (test_data, "test_data.parquet"),
):
    serialize(spark, split_frame, path + file_name).show(5)

In [4]:
# Load both splits into pandas through the pyarrow engine.
# NOTE: `train_data` / `test_data` are rebound from Spark to pandas
# DataFrames here.
train_data = pd.read_parquet("data/train_data.parquet", engine="pyarrow")
test_data = pd.read_parquet("data/test_data.parquet", engine="pyarrow")

# Preparing the training data (the logged output shows one-hot encoding
# followed by conversion to a CSR matrix).
X_train_ohe_sparse, y_train = prepare_data(
    train_data, model_config, features_config, prefix="train"
)

# Preparing the inferencing data the same way.
X_test_ohe_sparse, y_test = prepare_data(
    test_data, model_config, features_config, prefix="test"
)

[2023-01-05 22:43:12.060061] One-hot encoding the train dataframe
[2023-01-05 22:53:07.483322] Transforming the train one-hot encoded data into a CSR matrix
[2023-01-05 22:54:36.375337] One-hot encoding the test dataframe
[2023-01-05 22:54:50.824702] Transforming the test one-hot encoded data into a CSR matrix


In [6]:
# Normal run: train on the prepared matrices and evaluate on the test split.
# The call is the cell's last expression, so its return value is displayed.
train_model(
    X_train_ohe_sparse,
    X_test_ohe_sparse,
    y_train,
    y_test,
    model_config,
)

[2023-01-05 22:55:02.872796] Loading the lgbm model
[2023-01-05 22:55:02.873179] Fitting the lgbm model
[2023-01-05 22:55:15.723334] Generating predictions
[2023-01-05 22:55:16.966375] mean_squared_error_ =0.12101342410404359


array([-0.00042651, -0.00042651, -0.00042651, ..., -0.00040733,
       -0.00040733, -0.00040733])

In [6]:
# MLFlow run

# Candidate hyper-parameter values; entries at the same position belong
# together.
max_depth_list = [3, 4]
learning_rate_list = [0.1, 0.001]
n_estimators_list = [20, 25]

# zip pairs the lists position by position, so this launches one run per
# position (two runs here), not one run per combination of values.
paired_settings = zip(max_depth_list, learning_rate_list, n_estimators_list)

for max_depth, learning_rate, n_estimators in paired_settings:
    model_params = dict(
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
    )
    MLFlow_train_model(
        X_train_ohe_sparse,
        X_test_ohe_sparse,
        y_train,
        y_test,
        model_config,
        model_params,
    )

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



[2023-01-01 13:14:17.924195] Logged data and model in run 9c508d0dc958443683498caaf3851e93
[2023-01-01 13:14:17.924471] [Run 9c508d0dc958443683498caaf3851e93]: Loading the xgboost model
[2023-01-01 13:14:17.924578] [Run 9c508d0dc958443683498caaf3851e93]: Fitting the xgboost model
[2023-01-01 13:14:25.180253] [Run 9c508d0dc958443683498caaf3851e93]: Generating predictions
[2023-01-01 13:14:25.704440] [Run 9c508d0dc958443683498caaf3851e93]: mean_squared_error_=0.13993235249196243




[2023-01-01 13:14:31.143475] Logged data and model in run 3fbc42fcaf704286ab8858a25823d2b6
[2023-01-01 13:14:31.143645] [Run 3fbc42fcaf704286ab8858a25823d2b6]: Loading the xgboost model
[2023-01-01 13:14:31.143723] [Run 3fbc42fcaf704286ab8858a25823d2b6]: Fitting the xgboost model
[2023-01-01 13:14:39.110172] [Run 3fbc42fcaf704286ab8858a25823d2b6]: Generating predictions
[2023-01-01 13:14:39.655306] [Run 3fbc42fcaf704286ab8858a25823d2b6]: mean_squared_error_=0.47039567296478074


In [8]:
# Score with the model stored under an existing MLflow run instead of
# fitting a new one. The run id is hard-coded and is the first run logged by
# the MLflow cell's recorded output; replace it after re-running that cell.
train_model(
    X_train_ohe_sparse,
    X_test_ohe_sparse,
    y_train,
    y_test,
    model_config,
    run_id="9c508d0dc958443683498caaf3851e93",
)

[2023-01-01 13:23:52.822766] Loading the pretrained xgboost model in run 9c508d0dc958443683498caaf3851e93
[2023-01-01 13:23:53.398545] Generating predictions
[2023-01-01 13:23:53.846542] mean_squared_error_ =0.13993235249196243


array([0.06079819, 0.06079819, 0.06079819, ..., 0.06079819, 0.06079819,
       0.06079819], dtype=float32)

23/01/01 14:25:45 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 3695768 ms exceeds timeout 120000 ms
23/01/01 14:25:46 WARN SparkContext: Killing executors is not supported by current scheduler.


In [8]:
# Start the MLflow UI from the notebook (shell escape). The server runs in
# the foreground, so this cell blocks until it is interrupted.
! mlflow ui

[2022-12-31 21:12:59 +0000] [11766] [INFO] Starting gunicorn 20.1.0
[2022-12-31 21:12:59 +0000] [11766] [INFO] Listening at: http://127.0.0.1:5000 (11766)
[2022-12-31 21:12:59 +0000] [11766] [INFO] Using worker: sync
[2022-12-31 21:12:59 +0000] [11767] [INFO] Booting worker with pid: 11767
[2022-12-31 21:12:59 +0000] [11768] [INFO] Booting worker with pid: 11768
[2022-12-31 21:12:59 +0000] [11769] [INFO] Booting worker with pid: 11769
[2022-12-31 21:12:59 +0000] [11770] [INFO] Booting worker with pid: 11770
^C
[2022-12-31 21:13:02 +0000] [11766] [INFO] Handling signal: int
[2022-12-31 21:13:02 +0000] [11769] [INFO] Worker exiting (pid: 11769)
[2022-12-31 21:13:02 +0000] [11768] [INFO] Worker exiting (pid: 11768)
[2022-12-31 21:13:02 +0000] [11767] [INFO] Worker exiting (pid: 11767)
[2022-12-31 21:13:02 +0000] [11770] [INFO] Worker exiting (pid: 11770)
