<div style="background-color: darkgreen; padding: 10px; color: #F7FF80; border-bottom: 10px solid #BA5E02; border-radius: 10px;">
    <h2 style="margin: 0; padding: 0;">TABLE OF CONTENTS</h2>
</div>

* [<span style="color: darkgreen;">IMPORTS</span>](#1)
* [<span style="color: darkgreen;">INTRODUCTION</span>](#2)
* [<span style="color: darkgreen;">DATA PROCESSING</span>](#3)
* [<span style="color: darkgreen;">PREDICT MEAN QUANTITY</span>](#4)
* [<span style="color: darkgreen;">PREDICT QUANTITY FOR STORES</span>](#5)
* [<span style="color: darkgreen;">CONCLUSION</span>](#6)

<a id="1"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: darkgreen; border-bottom: 10px solid #099A9A; border-radius: 10px;"> Package Imports<br></div>

In [None]:
import pandas as pd
import numpy as np
import shap

import matplotlib.pyplot as plt
import seaborn as sns

import warnings

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

from keras import Sequential, layers

In [None]:
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")

pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)

<a id="2"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: darkgreen; border-bottom: 10px solid #099A9A; border-radius: 10px;"> Introduction<br></div>

<div class="alert alert-block alert-info" style = "font-family: Cambria Math;font-size: 115%; color: black; background-color: #e6f9ff; border: dashed black 1.0px; padding: 3.5px" >
<p>&emsp; 
    This notebook is for predicting sales quantity in our Dynamic Gridworld challenge. Participants aim to predict product sales in a dynamic 2D world comprising cities and stores.<br>
    The evaluation metric used is the <b>Root Mean Squared Error (RMSE)</b>.</p>

<p>The approach involves creating two models:</p>
<ul>
   <li>The first model predicts, for each city, the mean quantity across its stores.</li>
   <li>The second model predicts the quantity for a store relative to the mean quantity in the city.</li>
</ul>
</div>

<a id="3"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: darkgreen; border-bottom: 10px solid #099A9A; border-radius: 10px;"> Data Processing<br></div>

In [None]:
# Kaggle input directory that holds the competition CSV files.
data_path = "/kaggle/input/predicting-sales-quantity-in-our-dynamic-gridworld"

# Supplemental per-city table; it is joined onto both splits through `city_id`.
sup = pd.read_csv(f"{data_path}/supplemental_cities.csv")

train = pd.read_csv(f"{data_path}/train.csv").merge(sup, on="city_id")
test = pd.read_csv(f"{data_path}/test.csv").merge(sup, on="city_id")

In [None]:
# Tag every row with its origin (0 = train, 1 = test) so the stacked frame
# can be split again after the shared feature engineering.
for frame, origin_flag in ((train, 0), (test, 1)):
    frame["type"] = origin_flag

all_data = pd.concat((train, test), axis=0)

In [None]:
# Distribution of price for each advertising level.
sns.boxplot(data=all_data, x="ad_level", y="price")
plt.show()

# Distribution of advertising level for each education level.
sns.boxplot(data=all_data, x="education_level", y="ad_level")
plt.show()

In [None]:
# Impute missing store-level values with fixed constants.
# NOTE(review): 6.0 and 3.0 look like values read off the box plots in the
# previous cell — confirm where they come from.
# The result is assigned back to the column: calling `fillna(..., inplace=True)`
# on a column selection is chained assignment, which pandas 2.x warns about and
# which no longer updates the frame once Copy-on-Write is enabled.
all_data["price"] = all_data["price"].fillna(6.0)
all_data["ad_level"] = all_data["ad_level"].fillna(3.0)

# City-level gaps are filled with the mean computed on the training rows only.
all_data["population"] = all_data["population"].fillna(train["population"].mean())
all_data["median_income"] = all_data["median_income"].fillna(train["median_income"].mean())

# Shift every advertising level up by one.
# NOTE(review): presumably so the lowest level is not 0 in the products and
# ratios built from it later — confirm.
all_data["ad_level"] += 1

In [None]:
# Number of rows (stores) sharing each city_id.
all_data["no_stores"] = all_data.groupby("city_id").transform("size")

# city_id is split on "/" once: parts 0 and 1 become the (1-based) x / y values,
# part 2 becomes the city number.
id_parts = all_data["city_id"].str.split("/", expand=True)
all_data["x"] = id_parts[0].astype(int) + 1
all_data["y"] = id_parts[1].astype(int) + 1
all_data["length"] = (all_data["x"] ** 2 + all_data["y"] ** 2) ** 0.5
all_data["cos"] = all_data["x"] / all_data["length"]
all_data["city_nr"] = id_parts[2].astype(int)

all_data["education_to_population_ratio"] = all_data["education_level"] / all_data["population"]
all_data["population_education_product"] = all_data["population"] * all_data["education_level"]

# price x <other column> interaction features, keyed by the label used in the new column name.
price_partners = {
    "ad": "ad_level",
    "population": "population",
    "education": "education_level",
    "income": "median_income",
}
for label, partner in price_partners.items():
    all_data[f"price_{label}_interaction"] = all_data["price"] * all_data[partner]

# Per-city means of the interaction features and of the raw store-level columns.
# `quantity` is included, so `mean_quantity` is built from the quantity column.
per_city_sources = [f"price_{label}_interaction" for label in price_partners]
per_city_sources += ["price", "ad_level", "quantity"]
for source in per_city_sources:
    all_data[f"mean_{source}"] = all_data.groupby("city_id")[source].transform("mean")

<a id="4"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: darkgreen; border-bottom: 10px solid #099A9A; border-radius: 10px;"> Predict Mean Quantity<br></div>

In [None]:
# One row per city: keep the first row seen for every city_id.
# The explicit .copy() calls give both frames their own data, so the column
# assignments made on them in later cells do not raise SettingWithCopyWarning.
cities_df = all_data.drop_duplicates("city_id").copy()

# city_id of every test city, in the same row order as the test rows of cities_df.
test_cities = cities_df.loc[cities_df["type"] == 1, ["city_id"]].copy()

In [None]:
# Identifiers and store-level columns are removed from the city table;
# their per-city means (mean_*) stay.
drop_list = ["id", "city_id", "store_id", "ad_level", "price", "quantity"]
drop_list += [f"price_{label}_interaction" for label in ("ad", "population", "education", "income")]

cities_df = cities_df.drop(columns=drop_list)

In [None]:
# Preview the first six rows of the city table.
cities_df.iloc[:6]

In [None]:
# Pairwise correlation of the numeric city-level columns.
city_corr = cities_df.corr(numeric_only=True)
ax = sns.heatmap(city_corr, robust=True)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Scale every feature column to [0, 1] with a scaler of its own.
scaler_list = cities_df.columns.drop("mean_quantity")
feature_scaler = MinMaxScaler((0, 1))
cities_df[scaler_list] = feature_scaler.fit_transform(cities_df[scaler_list])

# `scaler` is fitted on the target only; the following cells use it to
# inverse-transform predictions back to quantity units.
scaler = MinMaxScaler((0, 1))
cities_df["mean_quantity"] = scaler.fit_transform(cities_df[["mean_quantity"]])

In [None]:
# Training cities (type == 0) keep the target column.
train = (
    cities_df[cities_df["type"] == 0]
    .drop(columns=["type"])
    .reset_index(drop=True)
)

# Test cities (type == 1) lose both the flag and the target column.
test = (
    cities_df[cities_df["type"] == 1]
    .drop(columns=["type", "mean_quantity"])
    .reset_index(drop=True)
)

In [None]:
# Features / target for the city-level model (the target stays a one-column frame).
target_column = "mean_quantity"
X = train.drop(columns=target_column)
y = train[[target_column]]

# 10% of the training cities are held out for a quick validation score.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.1)

In [None]:
def get_mean_quantity_predict_model(input_dim: "int | None" = None) -> Sequential:
    """Build and compile the dense network that predicts a city's mean quantity.

    Architecture: Dense(32, relu) -> Dense(24, relu) -> Dense(6, relu) -> Dense(1, linear),
    compiled with the Adam optimizer and mean-squared-error loss.

    Args:
        input_dim: Number of input features. When omitted, the column count of the
            notebook-level feature frame ``X`` is used (the original behaviour).

    Returns:
        A freshly initialised, compiled ``Sequential`` model.
    """
    if input_dim is None:
        input_dim = X.shape[-1]

    # An explicit Input layer replaces the `input_shape=` kwarg on the first Dense layer.
    m = Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(32, activation="relu"),
        layers.Dense(24, activation="relu"),
        layers.Dense(6, activation="relu"),
        layers.Dense(1, activation="linear"),
    ])

    m.compile(optimizer="adam", loss="mean_squared_error")
    return m


# Training hyper-parameters used by the fit calls in the following cells.
epochs = 160
batch_size = 54

In [None]:
model = get_mean_quantity_predict_model()
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=2, validation_split=0.15)

# Undo the target scaling on both predictions and truth before scoring.
y_test_predict = scaler.inverse_transform(model.predict(X_test))
y_test_true = scaler.inverse_transform(y_test)

# The square root of the MSE is taken, so the reported number is the RMSE.
rmse = np.sqrt(mean_squared_error(y_test_true, y_test_predict))
print(f"Root Mean Squared Error: {rmse}")

In [None]:
# Refit a fresh model on all training cities (no hold-out, silent training).
fit_options = dict(epochs=epochs, batch_size=batch_size, verbose=0)
final_model = get_mean_quantity_predict_model()
final_model.fit(X, y, **fit_options)

In [None]:
# Predict the scaled mean quantity of every test city and map it back to quantity units.
test_predict = scaler.inverse_transform(final_model.predict(test))

# Attach the predictions to the test city ids; rows are matched by position.
test_cities = test_cities.assign(mean_quantity=test_predict.ravel())

In [None]:
# Both frames carry a `mean_quantity` column, so the merge yields
# `mean_quantity_x` (from all_data) and `mean_quantity_y` (predicted, test cities only).
all_data = all_data.merge(test_cities, how="left", on="city_id")

# Fill the missing entries of mean_quantity_x with the predicted values.
all_data["mean_quantity_x"] = all_data["mean_quantity_x"].fillna(all_data["mean_quantity_y"])
all_data = all_data.drop(columns="mean_quantity_y")
all_data.head(6)

<a id="5"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: darkgreen; border-bottom: 10px solid #099A9A; border-radius: 10px;"> Predict Quantity for Stores<br></div>

In [None]:
# Columns carried into the store-level model.
columns = [
    "id", "city_id", "store_id",
    "price", "ad_level", "quantity",
    "type", "no_stores", "city_nr", "mean_quantity_x",
]
stores_data = all_data.loc[:, columns].copy()

In [None]:
# Preview the first six store rows before normalisation.
stores_data.iloc[:6]

In [None]:
# Columns are replaced by plain assignment rather than `.loc[:, col] /= ...`:
# the in-place form writes float results into the existing column, which for the
# integer columns (no_stores, city_nr) is an incompatible-dtype set that recent
# pandas versions warn about.

# Express price and ad_level relative to the largest value within the same city.
for col in ("price", "ad_level"):
    stores_data[col] = stores_data[col] / stores_data.groupby("city_id")[col].transform("max")

# Express quantity relative to the city's mean quantity.
stores_data["quantity"] = stores_data["quantity"] / stores_data["mean_quantity_x"]

# Scale the remaining columns by their global maximum.
for col in ("no_stores", "city_nr"):
    stores_data[col] = stores_data[col] / stores_data[col].max()

In [None]:
# Preview the first six store rows after normalisation.
stores_data.iloc[:6]

<div class="alert alert-block alert-info" style = "font-family: Cambria Math;font-size: 115%; color: black; background-color: #e6f9ff; border: dashed black 1.0px; padding: 3.5px" >
The main idea is to restructure and encode store-related data for each city. This restructuring involves pivoting the data, so each city has columns for ad_level, price, quantity, and id, for each store. After this transformation, duplicate city entries are dropped to retain only unique city records.<br>
</div>

In [None]:
# Spread each store-level column into one column per store_id and merge the
# result back onto every row of the same city.
# The pivoted columns are named by the bare store_id, so the renaming relies on
# merge's `suffixes`, which only apply when column names collide:
#   1. ad_level : no collision -> columns named by store_id
#   2. price    : collision    -> existing ad_level columns get "_a", new price columns get "_p"
#   3. quantity : no collision -> columns named by store_id
#   4. id       : collision    -> existing quantity columns get "_q", new id columns get "_id"
# The order of the four iterations therefore must not change.
for c, suff in zip(["ad_level", "price", "quantity", "id"], [("", ""), ("_a", "_p"), ("", ""), ("_q", "_id")]):
    # One row per city_id, one column per store_id, holding the values of column `c`.
    pivot_df = stores_data.pivot(index="city_id", columns="store_id", values=[c])
    if c != "id":
        # City/store combinations that do not exist become 0; for `id` they stay NaN
        # (the submission cell later drops rows with NaN).
        pivot_df.fillna(0, inplace=True)
    stores_data = stores_data.merge(pivot_df[c], on="city_id", how="left", suffixes=suff)

# Keep the (id, mean quantity) pairs of every store row before the frame is
# reduced to one row per city.
mean_quantity_df = stores_data.loc[:, ["id", "mean_quantity_x"]]
stores_data = stores_data.drop_duplicates("city_id")

# Names of the 30 per-store id columns produced by the pivot loop.
ids = [f"{store}_id" for store in range(30)]
test_ids = stores_data.loc[stores_data["type"] == 1, ids].values

# Identifiers and the original store-level columns are not model inputs.
per_row_columns = ["id", "city_id", "store_id", "ad_level", "price", "quantity"]
stores_data = stores_data.drop(columns=[*per_row_columns, *ids])

In [None]:
# Show the six rows with the largest no_stores value.
stores_data.sort_values(by="no_stores", ascending=False).iloc[:6]

In [None]:
# Pairwise correlation of the numeric columns of the pivoted store table.
store_corr = stores_data.corr(numeric_only=True)
ax = sns.heatmap(store_corr, robust=True)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Split the pivoted table back into train (type == 0) and test (type == 1) rows,
# dropping the flag and renumbering the rows from zero.
train = (
    stores_data[stores_data["type"] == 0]
    .drop(columns=["type"])
    .reset_index(drop=True)
)
test = (
    stores_data[stores_data["type"] == 1]
    .drop(columns=["type"])
    .reset_index(drop=True)
)

In [None]:
# Preview the first six test rows.
test.iloc[:6]

In [None]:
# The 30 per-store quantity columns are the multi-output target.
quantity_columns = [f"{store}_q" for store in range(30)]

# Test rows keep only the input features.
test = test.drop(columns=quantity_columns + ["mean_quantity_x"])

X = train.drop(columns=quantity_columns)
y = train[quantity_columns]

In [None]:
# Positional split: the last 75% of the rows are used for fitting, the rows
# before them are held out. The boundary is derived from the actual number of
# rows instead of a hard-coded row count.
test_len = -int(len(X) * 0.75)
assert test_len < 0, "not enough rows to build a 75/25 split"

# Held-out rows still carry the city's mean quantity; it is needed to turn
# relative quantities back into absolute ones when scoring.
holdout = X.iloc[:test_len]
mean_quantity = holdout["mean_quantity_x"].values

# mean_quantity_x is not a model input, so it is removed from every feature frame.
X = X.drop(columns="mean_quantity_x")
X_train = X.iloc[test_len:]
X_test = holdout.drop(columns="mean_quantity_x")

y_train = y.iloc[test_len:]
# Absolute quantities of the held-out rows (relative quantity * city mean).
y_test = y.iloc[:test_len].mul(mean_quantity, axis=0).values

# Score only the non-zero entries (missing city/store slots were filled with 0).
test_mask = np.where(y_test != 0)
y_test = y_test[test_mask]

In [None]:
def get_model(input_dim: "int | None" = None, n_outputs: int = 30) -> Sequential:
    """Build and compile the network that predicts relative quantities per store slot.

    Architecture: Dense(64, relu) -> Dense(n_outputs, linear), compiled with the
    Adam optimizer and mean-squared-error loss.

    Args:
        input_dim: Number of input features. When omitted, the column count of the
            notebook-level frame ``X_train`` is used (the original behaviour).
        n_outputs: Width of the output layer; defaults to 30, matching the number
            of per-store quantity columns built in this notebook.

    Returns:
        A freshly initialised, compiled ``Sequential`` model.
    """
    if input_dim is None:
        input_dim = X_train.shape[-1]

    # An explicit Input layer replaces the `input_shape=` kwarg on the first Dense layer.
    m = Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(64, activation="relu"),
        layers.Dense(n_outputs, activation="linear"),
    ])

    m.compile(optimizer="adam", loss="mean_squared_error")
    return m


# Training hyper-parameters used by the fit calls in the following cells.
epochs = 320
batch_size = 84

In [None]:
# Fit on the training part of the split; 15% of it is used by Keras for validation.
model = get_model()
model.fit(
    X_train,
    y_train,
    validation_split=0.15,
    epochs=epochs,
    batch_size=batch_size,
    verbose=0,
)
# ~ 3 min

In [None]:
# Predicted relative quantities -> absolute quantities: every row is multiplied
# by its city's mean quantity (broadcast over the output columns).
y_pred = model.predict(X_test) * mean_quantity.reshape(-1, 1)

# The square root of the MSE is taken, so the reported number is the RMSE.
# Only the entries selected by test_mask (non-zero true quantities) are scored.
rmse = np.sqrt(mean_squared_error(y_test, y_pred[test_mask]))
print(f"Root Mean Squared Error: {rmse}")

<div class="alert alert-block alert-info" style = "font-family: Cambria Math;font-size: 115%; color: black; background-color: #e6f9ff; border: dashed black 1.0px; padding: 3.5px" >
&emsp; As you can see, the score is quite impressive. However, this result relies on the fact that the training data contains the true mean quantity values.<br> 
&emsp; On the test dataset, with the predicted mean quantity column, the results may not be as favorable.<br>
</div>

In [None]:
# Refit a fresh model on all training rows (no hold-out, silent training).
fit_options = dict(epochs=epochs, batch_size=batch_size, verbose=0)
final_model = get_model()
final_model.fit(X, y, **fit_options)  # ~ 3 min

In [None]:
test_predict = final_model.predict(test)

# Pair every predicted relative quantity with the store id in the same
# (city row, store slot) position; slots whose id is NaN are dropped.
result_df = pd.DataFrame({"id": test_ids.flatten(), "quantity": test_predict.flatten()}).dropna()
result_df = result_df.merge(mean_quantity_df, how="left", on="id")

# Relative -> absolute quantity, with a lower bound of 200.
# NOTE(review): the origin of the 200 floor is not visible in this notebook — confirm.
# The column is replaced by assignment rather than `.loc[:, "quantity"] *= ...`,
# so the float64 product is not written in place into the existing prediction column.
MIN_QUANTITY = 200
result_df["quantity"] = (result_df["quantity"] * result_df["mean_quantity_x"]).clip(lower=MIN_QUANTITY)

# Casting to int32 truncates the fractional part of the predicted quantities.
result_df = result_df.drop(columns="mean_quantity_x").astype(np.int32)
result_df.to_csv("sample_submission.csv", index=False)

# ~ 290 score

<a id="6"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: darkgreen; border-bottom: 10px solid #099A9A; border-radius: 10px;"> CONCLUSION<br></div>

<div class="alert alert-block alert-info" style = "font-family: Cambria Math;font-size: 115%; color: black; background-color: #e6f9ff; border: dashed black 1.0px; padding: 3.5px" >
<b>Next steps</b> <br>
1. Improve the mean quantity prediction (i.e. lower its RMSE).<br>
2. There are some cities with just one store. These cities can be processed separately.<br>
3. Experiment with alternative models for prediction.<br>
</div>