In [1]:
import glob
import logging
import numpy as np
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_distances

currentDirectory = os.getcwd()

if not os.path.exists('data'):
    while currentDirectory != os.path.dirname(currentDirectory):
        parentDirectory = os.path.dirname(currentDirectory)

        if os.path.exists(os.path.join(parentDirectory, 'data')):
            os.chdir(parentDirectory)
            break

        currentDirectory = parentDirectory

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
filePattern = 'data/store_1_*.csv'
receivingStoreInventory = pd.concat([pd.read_csv(f) for f in glob.glob(filePattern)], ignore_index=True)
receivingStoreInventory['Qty.'] = pd.to_numeric(receivingStoreInventory['Qty.'].fillna(0), errors='coerce')
receivingStoreInventory['Price'] = pd.to_numeric(receivingStoreInventory['Price'].fillna(0), errors='coerce')

sendingStoreInventory = pd.read_csv('data/store_2.csv')
sendingStoreInventory['Qty.'] = pd.to_numeric(sendingStoreInventory['Qty.'].fillna(0), errors='coerce')
sendingStoreInventory = sendingStoreInventory.loc[sendingStoreInventory['Qty.'] > 0]

  sendingStoreInventory = pd.read_csv('data/store_2.csv')


# Data cleaning

In [3]:
def checkDuplicatesColumnDifferences(df: pd.DataFrame, columnToCheck: str, columnDuplicated: str = 'Item') -> tuple[int, list[str]]:
    sortedDuplicates = df.loc[df.duplicated(subset=[columnDuplicated], keep=False)].sort_values('Item')
    differentPrices = []

    for duplicate in sortedDuplicates[columnDuplicated].unique():
        duplicateData = sortedDuplicates.loc[sortedDuplicates[columnDuplicated] == duplicate]
        uniquePrices = duplicateData[columnToCheck].nunique()

        if uniquePrices > 1:
            differentPrices.append(duplicate)
            
    logging.info(f'Duplicates with different "{columnToCheck}" #{len(differentPrices)}')
    return len(differentPrices), differentPrices


def mergeDuplicates(df: pd.DataFrame, columnsToMerge: dict[str, str], columnDuplicated: list[str] = ['Item']) -> pd.DataFrame:
    logging.info(f'Duplicate item #{df.duplicated(subset=columnDuplicated).sum()}')
    mergedDuplicates = df.groupby(columnDuplicated).agg(columnsToMerge).reset_index()
    df = pd.merge(df.drop_duplicates(subset=columnDuplicated, keep = 'first').drop(columns=list(columnsToMerge.keys())), mergedDuplicates,
                  on=columnDuplicated,
                  how='left')
    
    logging.info(f'Duplicate item #{df.duplicated(subset=columnDuplicated).sum()}')
    return df

In [4]:
# Sending store
numberOfDuplicatesPrices, differentPrices = checkDuplicatesColumnDifferences(receivingStoreInventory, 'Price')
numberOfDuplicatesUPC, differentUPC = checkDuplicatesColumnDifferences(receivingStoreInventory, 'UPC')
receivingStoreInventory = mergeDuplicates(receivingStoreInventory, {'Qty.': 'sum', 'Price': 'mean', 'UPC': 'first'})
receivingStoreInventory.to_csv('data/receivingStoreInventory.csv', index=False)

# Receiving store
sendingStoreInventory = mergeDuplicates(sendingStoreInventory, {'Qty.': 'sum'})
receivingStoreInventory.to_csv('data/sendingStoreInventory.csv', index=False)

2025-07-10 00:42:26 - INFO - Duplicates with different "Price" #0
2025-07-10 00:42:26 - INFO - Duplicates with different "UPC" #70
2025-07-10 00:42:26 - INFO - Duplicate item #653
2025-07-10 00:42:26 - INFO - Duplicate item #0
2025-07-10 00:42:27 - INFO - Duplicate item #8
2025-07-10 00:42:27 - INFO - Duplicate item #0


# Matching algorithm

In [5]:
receivingStoreInventory = pd.read_csv('data/receivingStoreInventory.csv')
sendingStoreInventory = pd.read_csv('data/sendingStoreInventory.csv')

  receivingStoreInventory = pd.read_csv('data/receivingStoreInventory.csv')
  sendingStoreInventory = pd.read_csv('data/sendingStoreInventory.csv')


## Embeddings creation

In [6]:
def createEmbeddings(itemNames: list[str], modelNameHF: str = 'all-MiniLM-L6-v2') -> list[list[float]]:
    model = SentenceTransformer(modelNameHF)
    embeddings = model.encode(itemNames)
    
    return embeddings.tolist()

In [7]:
# Receiving store
receivingStoreInventory = receivingStoreInventory.loc[(receivingStoreInventory['Item'].notna()) &
                                                      (receivingStoreInventory['Item'].str.strip() != '')].reset_index(drop=True)
receivingStoreInventoryEmbeddingsDF = pd.DataFrame({
    'Item': receivingStoreInventory['Item'].tolist(),
    'Embeddings': createEmbeddings(receivingStoreInventory['Item'].tolist())
})
receivingStoreInventory = pd.merge(receivingStoreInventory, receivingStoreInventoryEmbeddingsDF,
                                   on=['Item'],
                                   how='left')
receivingStoreInventory.to_csv('data/receivingStoreInventory.csv', index=False)

# Sending store
sendingStoreInventory = sendingStoreInventory.loc[(sendingStoreInventory['Item'].notna()) &
                                                  (sendingStoreInventory['Item'].str.strip() != '')].reset_index(drop=True)
sendingStoreInventoryEmbeddingsDF = pd.DataFrame({
    'Item': sendingStoreInventory['Item'].tolist(),
    'Embeddings': createEmbeddings(sendingStoreInventory['Item'].tolist())
})
sendingStoreInventory = pd.merge(sendingStoreInventory, sendingStoreInventoryEmbeddingsDF,
                                 on=['Item'],
                                 how='left')
receivingStoreInventory.to_csv('data/sendingStoreInventory.csv', index=False)

2025-07-10 00:42:28 - INFO - Use pytorch device_name: mps
2025-07-10 00:42:28 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Batches: 100%|██████████| 4123/4123 [01:05<00:00, 63.01it/s]
2025-07-10 00:44:06 - INFO - Use pytorch device_name: mps
2025-07-10 00:44:06 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Batches: 100%|██████████| 4123/4123 [01:08<00:00, 60.57it/s]


## Distance matrix

In [8]:
receivingStoreInventory = pd.read_csv('data/receivingStoreInventory.csv')
sendingStoreInventory = pd.read_csv('data/sendingStoreInventory.csv')

  receivingStoreInventory = pd.read_csv('data/receivingStoreInventory.csv')
  sendingStoreInventory = pd.read_csv('data/sendingStoreInventory.csv')


In [None]:
receivingStoreEmbeddings = np.vstack(receivingStoreInventoryEmbeddingsDF['Embeddings'].values)
sendingStoreEmbeddings = np.vstack(sendingStoreInventoryEmbeddingsDF['Embeddings'].values)
distanceMatrix = cosine_distances(receivingStoreEmbeddings, sendingStoreEmbeddings)
np.save('data/distanceMatrix.npy', distanceMatrix)

## Matrix analysis

In [None]:
receivingStoreInventory = pd.read_csv('data/receivingStoreInventory.csv')
sendingStoreInventory = pd.read_csv('data/sendingStoreInventory.csv')
distanceMatrix = np.load('data/distanceMatrix.npy')