In [None]:
import glob
import json
import logging
import os

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity


# Notebook bootstrap: all later cells use paths relative to the directory that
# holds the `data` folder, so move the working directory there if needed.
currentDirectory = os.getcwd()

if not os.path.exists('data'):
    # Climb one level at a time; at the filesystem root a directory is its own
    # parent, which ends the search without changing the working directory.
    parentDirectory = os.path.dirname(currentDirectory)

    while parentDirectory != currentDirectory:
        if os.path.exists(os.path.join(parentDirectory, 'data')):
            os.chdir(parentDirectory)
            break

        currentDirectory, parentDirectory = parentDirectory, os.path.dirname(parentDirectory)

# Timestamped INFO-level logging for every cell below.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO
)

In [None]:
# Receiving store: every CSV matching the pattern is stacked into one frame.
filePattern = 'data/store_1_*.csv'
# glob returns files in arbitrary order; sorting keeps the row order (and therefore
# every later "first occurrence" choice) reproducible between runs.
receivingStoreFiles = sorted(glob.glob(filePattern))

if not receivingStoreFiles:
    # pd.concat on an empty list fails with an unhelpful message; fail early and clearly.
    raise FileNotFoundError(f'No file matches "{filePattern}" (working directory: {os.getcwd()})')

receivingStoreInventory = pd.concat([pd.read_csv(f) for f in receivingStoreFiles], ignore_index=True)
# Coerce first, fill second: values that cannot be parsed become NaN during coercion,
# so filling before coercing would leave those NaN in the column.
receivingStoreInventory['Qty.'] = pd.to_numeric(receivingStoreInventory['Qty.'], errors='coerce').fillna(0)
receivingStoreInventory['Price'] = pd.to_numeric(receivingStoreInventory['Price'], errors='coerce').fillna(0)

# Sending store: a single file; only rows with a positive quantity are kept.
sendingStoreInventory = pd.read_csv('data/store_2.csv')
sendingStoreInventory['Qty.'] = pd.to_numeric(sendingStoreInventory['Qty.'], errors='coerce').fillna(0)
sendingStoreInventory = sendingStoreInventory.loc[sendingStoreInventory['Qty.'] > 0]

# Data cleaning

In [None]:
def checkDuplicatesColumnDifferences(df: pd.DataFrame, columnToCheck: str, columnDuplicated: str = 'Item') -> tuple[int, list[str]]:
    '''
    Find duplicated keys whose rows disagree on another column.

    Args:
        df (pd.DataFrame): Frame to inspect. Must contain both columns.
        columnToCheck (str): Column whose values are compared within each group of duplicates.
        columnDuplicated (str, optional): Column identifying duplicates. Defaults to 'Item'.

    Returns:
        tuple[int, list[str]]: Number of duplicated keys with more than one distinct
        `columnToCheck` value, and those keys in sorted order.
    '''
    # Only rows whose key appears more than once can disagree with each other.
    duplicatedRows = df.loc[df.duplicated(subset=[columnDuplicated], keep=False)]
    # Distinct non-null values per key; rows with a missing key are ignored by groupby.
    distinctValuesPerKey = duplicatedRows.groupby(columnDuplicated)[columnToCheck].nunique()
    differentPrices = distinctValuesPerKey.index[distinctValuesPerKey > 1].tolist()

    logging.info(f'Duplicates with different "{columnToCheck}" #{len(differentPrices)}')
    return len(differentPrices), differentPrices


def mergeDuplicates(df: pd.DataFrame, columnsToMerge: dict[str, str], columnDuplicated: list[str] = ['Item']) -> pd.DataFrame:
    '''
    Collapse rows sharing the same key into one row.

    The first occurrence of each key supplies every column that is not aggregated;
    the columns listed in `columnsToMerge` are aggregated over all rows of the key
    and end up as the last columns of the result.

    Args:
        df (pd.DataFrame): Frame containing possible duplicates.
        columnsToMerge (dict[str, str]): Mapping of column name to aggregation (e.g. {'Qty.': 'sum'}).
        columnDuplicated (list[str], optional): Key column(s) identifying duplicates. A single
            column name is accepted too. Defaults to ['Item'].

    Returns:
        pd.DataFrame: One row per key, in order of first appearance.
    '''
    # Work on a private list: accepts a bare column name and never exposes the shared default.
    keyColumns = [columnDuplicated] if isinstance(columnDuplicated, str) else list(columnDuplicated)

    logging.info(f'Duplicate item #{df.duplicated(subset=keyColumns).sum()}')
    # dropna=False keeps the group of rows with a missing key; without it the row kept by
    # drop_duplicates below would find no aggregate and its merged columns would turn into NaN.
    mergedDuplicates = df.groupby(keyColumns, dropna=False).agg(columnsToMerge).reset_index()
    firstOccurrences = df.drop_duplicates(subset=keyColumns, keep='first').drop(columns=list(columnsToMerge.keys()))
    df = pd.merge(firstOccurrences, mergedDuplicates, on=keyColumns, how='left')

    logging.info(f'Duplicate item #{df.duplicated(subset=keyColumns).sum()}')
    return df

In [None]:
# Receiving store: report duplicated items that disagree on price / UPC, then collapse them
# (quantities summed, prices averaged, first UPC kept) and persist the result.
numberOfDuplicatesPrices, differentPrices = checkDuplicatesColumnDifferences(receivingStoreInventory, 'Price')
numberOfDuplicatesUPC, differentUPC = checkDuplicatesColumnDifferences(receivingStoreInventory, 'UPC')
receivingStoreInventory = mergeDuplicates(receivingStoreInventory, {'Qty.': 'sum', 'Price': 'mean', 'UPC': 'first'})
receivingStoreInventory.to_csv('data/receivingStoreInventory.csv', index=False)

# Sending store: only quantities are aggregated.
sendingStoreInventory = mergeDuplicates(sendingStoreInventory, {'Qty.': 'sum'})
# Write the sending frame here: saving the receiving frame under this name would make
# every later cell match the receiving store against itself.
sendingStoreInventory.to_csv('data/sendingStoreInventory.csv', index=False)

# Matching algorithm

In [None]:
# Start the matching stage from the de-duplicated inventories saved on disk.
receivingStoreInventory, sendingStoreInventory = map(
    pd.read_csv,
    ('data/receivingStoreInventory.csv', 'data/sendingStoreInventory.csv'),
)

## Embeddings creation

In [None]:
def createEmbeddings(itemNames: list[str], modelNameHF: str = 'all-MiniLM-L6-v2') -> list[list[float]]:
    '''
    Encode item names with a SentenceTransformer model.

    Args:
        itemNames (list[str]): Texts to encode.
        modelNameHF (str, optional): Name of the model passed to SentenceTransformer.
            Defaults to 'all-MiniLM-L6-v2'.

    Returns:
        list[list[float]]: One embedding per item name, in the same order, as plain lists.
    '''
    # Nothing to encode: skip loading the model altogether.
    if len(itemNames) == 0:
        return []

    model = SentenceTransformer(modelNameHF)
    embeddings = model.encode(itemNames)

    return embeddings.tolist()

In [None]:
# Receiving store: drop rows whose item name is missing or blank, embed the remaining
# names and attach the vectors to the inventory.
receivingStoreInventory = receivingStoreInventory.loc[(receivingStoreInventory['Item'].notna()) &
                                                      (receivingStoreInventory['Item'].str.strip() != '')].reset_index(drop=True)
receivingStoreInventoryEmbeddingsDF = pd.DataFrame({
    'Item': receivingStoreInventory['Item'].tolist(),
    'Embeddings': createEmbeddings(receivingStoreInventory['Item'].tolist())
})
receivingStoreInventory = pd.merge(receivingStoreInventory, receivingStoreInventoryEmbeddingsDF,
                                   on=['Item'],
                                   how='left')
receivingStoreInventory.to_csv('data/receivingStoreInventory.csv', index=False)

# Sending store: same steps.
sendingStoreInventory = sendingStoreInventory.loc[(sendingStoreInventory['Item'].notna()) &
                                                  (sendingStoreInventory['Item'].str.strip() != '')].reset_index(drop=True)
sendingStoreInventoryEmbeddingsDF = pd.DataFrame({
    'Item': sendingStoreInventory['Item'].tolist(),
    'Embeddings': createEmbeddings(sendingStoreInventory['Item'].tolist())
})
sendingStoreInventory = pd.merge(sendingStoreInventory, sendingStoreInventoryEmbeddingsDF,
                                 on=['Item'],
                                 how='left')
# Persist the sending frame under the sending file name; writing the receiving frame here
# would overwrite the sending store with a copy of the receiving store.
sendingStoreInventory.to_csv('data/sendingStoreInventory.csv', index=False)

## Distance matrix

In [None]:
# Reload both inventories (now including the 'Embeddings' column) from disk.
# NOTE(review): no converter is given, so 'Embeddings' comes back as text rather than lists.
receivingStoreInventory, sendingStoreInventory = (
    pd.read_csv(csvPath)
    for csvPath in ('data/receivingStoreInventory.csv', 'data/sendingStoreInventory.csv')
)

### Reduction of items to compare (TF-IDF)

In [None]:
def getTopCandidatesTFIDF(sendingInventory: pd.DataFrame, receivingInventory: pd.DataFrame, topK: int = 50) -> dict:
    '''
    For each item in sending inventory, find the topK most similar items in receiving inventory using TF-IDF.

    Args:
        sendingInventory (pd.DataFrame): DataFrame containing items to be matched. Must have 'Item' column.
        receivingInventory (pd.DataFrame): DataFrame containing potential matches. Must have 'Item' column.
        topK (int, optional): Number of top similar items to return for each sending item. Defaults to 50.

    Returns:
        dict: Dictionary mapping sending item indices to lists of receiving item indices,
        most similar first. Every list is empty when either inventory is empty or topK <= 0.
    '''
    # Nothing to rank (or nothing to rank against): answer directly instead of letting the
    # vectorizer / similarity computation fail on empty input.
    if sendingInventory.empty or receivingInventory.empty or topK <= 0:
        return {sendingInventoryIndex: [] for sendingInventoryIndex in sendingInventory.index}

    # After a CSV round trip item names may come back as numbers or NaN; the vectorizer
    # only accepts strings.
    sendingInventoryItems = sendingInventory['Item'].fillna('').astype(str).tolist()
    receivingInventoryItems = receivingInventory['Item'].fillna('').astype(str).tolist()

    logging.info('Creating TF-IDF matrix for all items...')
    vectorizer = TfidfVectorizer(
        lowercase=True,
        stop_words='english',
        ngram_range=(1, 2),
        min_df=1,
        max_df=0.9
    )
    # Fit on both inventories together so they share one vocabulary, then split the rows back.
    allItems = sendingInventoryItems + receivingInventoryItems
    matrixTFIDF = vectorizer.fit_transform(allItems)
    sendingInventoryTFIDF = matrixTFIDF[:len(sendingInventoryItems)]
    receivingInventoryTFIDF = matrixTFIDF[len(sendingInventoryItems):]

    logging.info('Calculating cosine similarity between sending and receiving inventory...')
    similarityMatrixTFIDF = cosine_similarity(sendingInventoryTFIDF, receivingInventoryTFIDF)
    candidatesSimilarity = {}

    for rowPosition, sendingInventoryIndex in enumerate(sendingInventory.index):
        # Positions (not labels) of the topK highest similarities, best first.
        topCandidates = np.argsort(similarityMatrixTFIDF[rowPosition])[::-1][:topK]
        receivingInventoryItemsIndices = receivingInventory.index[topCandidates].tolist()
        candidatesSimilarity[sendingInventoryIndex] = receivingInventoryItemsIndices

        # Names are looked up by position so the log stays correct for any index labels.
        logging.info(f'"{sendingInventoryItems[rowPosition]}" -> best match: "{receivingInventoryItems[topCandidates[0]]}" and {len(receivingInventoryItemsIndices) - 1} others')

    return candidatesSimilarity


# Shortlist, for every sending item, the 50 receiving items with the closest TF-IDF profile.
matchingSendingInventoryCandidates = getTopCandidatesTFIDF(
    sendingInventory=sendingStoreInventory,
    receivingInventory=receivingStoreInventory,
    topK=50,
)

### Candidates' embeddings distances

In [None]:
def _parseEmbedding(rawEmbedding) -> np.ndarray:
    """Return an embedding as a 1-D float array, decoding the text form left by a CSV round trip."""
    if isinstance(rawEmbedding, str):
        rawEmbedding = json.loads(rawEmbedding)

    return np.asarray(rawEmbedding, dtype=float)


def calculateEmbeddingDistancesForCandidates(sendingInventory: pd.DataFrame, receivingInventory: pd.DataFrame, candidatesDict: dict) -> dict:
    """
    Calculate embedding distances between sending items and their top TF-IDF candidates.

    Embeddings may be stored either as sequences of floats or as their text
    representation (which is what reading the inventories back from CSV yields).

    Args:
        sendingInventory (pd.DataFrame): DataFrame containing sending items with 'Item' and 'Embeddings' columns
        receivingInventory (pd.DataFrame): DataFrame containing receiving items with 'Item' and 'Embeddings' columns
        candidatesDict (dict): Dictionary mapping sending item indices to lists of receiving item indices

    Returns:
        dict: Dictionary mapping sending item indices to (receiving_indices, distances) tuples.
        An item without candidates maps to its empty list and an empty distance array.
    """
    results = {}

    for sendingIdx, receivingCandidates in candidatesDict.items():
        # No candidates: nothing to stack or compare, keep the entry with an empty distance array.
        if len(receivingCandidates) == 0:
            results[sendingIdx] = (receivingCandidates, np.array([], dtype=float))
            continue

        sendingItem = sendingInventory.loc[sendingIdx]
        sendingEmbedding = _parseEmbedding(sendingItem['Embeddings'])
        receivingCandidatesDF = receivingInventory.loc[receivingCandidates]
        # One row per candidate, in the same order as receivingCandidates.
        receivingEmbeddings = np.vstack([_parseEmbedding(embedding) for embedding in receivingCandidatesDF['Embeddings']])
        distances = cosine_distances([sendingEmbedding], receivingEmbeddings)[0]
        results[sendingIdx] = (receivingCandidates, distances)
        bestMatchIdx = np.argmin(distances)
        bestMatchItem = receivingCandidatesDF.iloc[bestMatchIdx]['Item']
        bestDistance = distances[bestMatchIdx]

        logging.info(f'"{sendingItem["Item"]}" -> best embedding match: "{bestMatchItem}" (distance: {bestDistance:.3f})')

    return results


# Re-score every TF-IDF shortlist with cosine distances between sentence embeddings.
embeddingDistances = calculateEmbeddingDistancesForCandidates(
    sendingInventory=sendingStoreInventory,
    receivingInventory=receivingStoreInventory,
    candidatesDict=matchingSendingInventoryCandidates,
)

### Matching items

In [None]:
def assignOptimalMatches(embeddingDistances: dict, sendingInventory: pd.DataFrame, receivingInventory: pd.DataFrame) -> dict:
    """
    Assign one-to-one matches between sending and receiving items, handling conflicts by choosing the closest match.

    All (sending, receiving) candidate pairs are visited from the smallest distance to the
    largest; a pair is accepted when neither of its items has been matched yet. The result
    therefore does not depend on the order of the input dictionary, and an item is never
    bound to a distant candidate while a closer one is still free.

    Args:
        embeddingDistances (dict): Dictionary from calculateEmbeddingDistancesForCandidates
        sendingInventory (pd.DataFrame): Sending items DataFrame. Must have 'Item' column.
        receivingInventory (pd.DataFrame): Receiving items DataFrame. Must have 'Item' column.

    Returns:
        dict: Dictionary mapping sending item indices to their matched receiving item index.
        Sending items whose candidates were all taken by closer pairs are absent.
    """
    candidatePairs = [
        (distance, sendingIdx, receivingIdx)
        for sendingIdx, (receivingIndices, distances) in embeddingDistances.items()
        for receivingIdx, distance in zip(receivingIndices, distances)
    ]
    # Sort on the distance only: the sort is stable, and index labels need not be comparable.
    candidatePairs.sort(key=lambda pair: pair[0])

    finalMatches = {}
    assignedReceiving = set()

    for distance, sendingIdx, receivingIdx in candidatePairs:
        if sendingIdx in finalMatches or receivingIdx in assignedReceiving:
            continue

        finalMatches[sendingIdx] = receivingIdx
        assignedReceiving.add(receivingIdx)
        sendingItem = sendingInventory.loc[sendingIdx, 'Item']
        receivingItem = receivingInventory.loc[receivingIdx, 'Item']

        logging.info(f'MATCH: "{sendingItem}" -> "{receivingItem}" (distance: {distance:.3f})')

    return finalMatches


# Resolve the scored shortlists into the final sending -> receiving pairing.
optimalMatches = assignOptimalMatches(
    embeddingDistances=embeddingDistances,
    sendingInventory=sendingStoreInventory,
    receivingInventory=receivingStoreInventory,
)