In [1]:
# libraries
import os
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image
import urllib
from random import choices
from itertools import chain
# Levenshtein Distance in Python
import textdistance
import re
import unicodedata
from rank_bm25 import BM25Okapi, BM25L
# https://github.com/seatgeek/thefuzz
from thefuzz import fuzz, process

# Matplotlib configuration: one bold 16pt font applied to every figure in the notebook
font = { 'family': 'DejaVu Sans', 'weight': 'bold', 'size': 16 }
plt.rc('font', **font)

# Pandas config: setting this option to None silences the chained-assignment
# (SettingWithCopy) warning for the whole notebook, so such bugs will not be reported.
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# Seed NumPy's global random generator so NumPy-based sampling is repeatable.
# Note: this does not seed Python's own `random` module (`choices` is imported from it).
np.random.seed(seed=42)

### Offers training

In [3]:
# Read the training and test offer tables from parquet files located in the
# current working directory (paths are relative).
offers_training_df = pd.read_parquet('offers_training.parquet')
offers_test_df = pd.read_parquet('offers_test.parquet')

## Brand analysis

### Brand text processing
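- Lowercase the text
- Strip accents from letters (`è, é, ... -> e`)
- Replace every non-alphanumeric character with a space, then collapse repeated spaces and trim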

In [4]:
class TextTransformer:
    """Normalise a piece of text so that it can be compared or tokenised.

    The normalised form is lower case, ASCII only, contains nothing but
    letters, digits and single spaces, and has no leading/trailing space.
    """

    def __init__(self, text):
        # Raw text; it is only transformed when processed_text() is called.
        self.text = text

    def processed_text(self):
        """Return the normalised version of ``self.text``."""
        # Lower-case first, then drop accents / non-ASCII characters.
        folded = self.simplify(self.text.lower())
        # Anything that is neither alphanumeric nor a space becomes a space.
        kept = []
        for ch in folded:
            kept.append(ch if (ch.isalnum() or ch == ' ') else ' ')
        spaced = ''.join(kept)
        # Squeeze runs of spaces and trim both ends.
        return re.sub(' +', ' ', spaced).strip()

    def simplify(self, text):
        """Return ``text`` with accents removed and non-ASCII characters dropped."""
        try:
            # Python 2 only: decode byte strings. On Python 3 the name does
            # not exist, so the NameError branch keeps the text unchanged.
            decoded = unicode(text, 'utf-8')
        except NameError:
            decoded = text
        # NFD splits accented letters into base letter + combining mark;
        # encoding to ASCII with 'ignore' then discards the marks.
        decomposed = unicodedata.normalize('NFD', decoded)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        return str(ascii_bytes.decode("utf-8"))

### Class for brand management

In [5]:
# Distinct values of the 'brand' column in the training and test offers.
brands_training = offers_training_df['brand'].unique()
brands_test = offers_test_df['brand'].unique()

In [6]:
def similarity(str_1, str_2):
    """Return the normalised Levenshtein similarity between two strings.

    Thin wrapper around ``textdistance.levenshtein.normalized_similarity``.
    """
    metric = textdistance.levenshtein
    return metric.normalized_similarity(str_1, str_2)

In [7]:
class Brand:
    """A brand name with optional links to a parent brand and child brands.

    Parameters
    ----------
    name : str
        Raw brand name; stored lower-cased and then title-cased.
    parent : Brand or None
        Brand this one is considered a sub-brand of, if any.
    child : list or None
        Child brands. ``None`` (the default) gives every instance its own
        new empty list.
    """

    def __init__(self, name, parent=None, child=None):
        self.name = name.lower().title()
        self.parent = parent
        # A `child=list()` default would be evaluated once and shared by all
        # instances, so appending to one brand's children would change every
        # other brand. Build the list per instance instead.
        self.child = [] if child is None else child

    def processed_text(self):
        """Return the normalised form of the brand name (see TextTransformer)."""
        return TextTransformer(self.name).processed_text()

In [8]:
class BrandCollection:
    """Index of brands keyed by normalised name, with inferred brand families.

    Attributes
    ----------
    brands : dict
        Maps a brand's normalised name to its ``Brand`` object.
    brand_family : dict
        Maps a ``Brand`` object to the list of brands attached to it as
        children.

    Brands are inserted in sorted order of their raw names. Each new brand is
    attached to the already-inserted brand with which it shares the longest
    run of identical leading words (if any).
    """

    def __init__(self, brand_list):
        """Build the collection from an iterable of raw brand names."""
        self.brands = {}
        self.brand_family = {}
        # sorted() works on a copy, so the caller's list is not reordered.
        for el in sorted(brand_list):
            self.process_brand(Brand(el))

    def process_brand(self, brand):
        """Insert ``brand`` unless a brand with the same normalised name exists.

        The brand's ``parent`` is set to the best-scoring existing brand, and
        the brand is appended to that parent's family. Ties keep the brand
        that was inserted first.
        """
        key = brand.processed_text()
        if key in self.brands:
            return

        best_score = 0
        likely_parent = None
        for comp in self.brands.values():
            score = self.parent_likelyhood(self.listify(comp, brand),
                                           self.listify(brand, comp))
            # Strict '>' so a score of 0 never creates a parent link.
            if score > best_score:
                best_score = score
                likely_parent = comp

        self.brands[key] = brand
        self.brand_family[brand] = []
        if likely_parent is not None:
            brand.parent = likely_parent
            self.brand_family[likely_parent].append(brand)

    def listify(self, brand, to_compare):
        """Return the words of ``brand``, padded with '' to the length of the
        longer of the two brands' word lists."""
        l_brand = brand.processed_text().split()
        l_comp = to_compare.processed_text().split()
        missing = max(len(l_comp) - len(l_brand), 0)
        return l_brand + [''] * missing

    def parent_likelyhood(self, l_comp, l_brand):
        """Return how many leading words of the two lists are identical.

        Only the common prefix is counted; matches after the first
        difference are ignored. Lists of different lengths are compared up
        to the shorter one.
        """
        likelyhood = 0
        for comp_word, brand_word in zip(l_comp, l_brand):
            if comp_word != brand_word:
                break
            likelyhood += 1
        return likelyhood

    def similarity(self, str_1, str_2):
        """Return the normalised Levenshtein similarity of two strings."""
        return textdistance.levenshtein.normalized_similarity(str_1, str_2)

    def get_match(self, brand_query):
        """Return ``(family, relevance)`` for a raw brand name.

        An exact match on the normalised name returns that brand's family
        with relevance 1. Otherwise the family of the most similar known
        brand is returned with its similarity score. When no brand has a
        similarity above 0 (including an empty collection) the result is
        ``([], 0)``.
        """
        query = Brand(brand_query).processed_text()
        if query in self.brands:
            return self.get_brand_family(query), 1

        relevance = 0
        most_relevant = None
        for key in self.brands:
            sim = self.similarity(query, key)
            if sim > relevance:
                relevance = sim
                most_relevant = key
        if most_relevant is None:
            # Nothing comparable: report "no match" instead of looking up a
            # placeholder key that is not in the index.
            return [], relevance
        return self.get_brand_family(most_relevant), relevance

    def get_brand_family(self, brand_name):
        """Return ``brand_name`` followed by all its descendants, breadth first.

        ``brand_name`` must be a normalised name present in ``self.brands``;
        a ``KeyError`` is raised otherwise.
        """
        family = []
        seen = set()
        pending = [brand_name]
        position = 0
        # Walk the queue with an index instead of removing from the front.
        while position < len(pending):
            current = pending[position]
            position += 1
            children = self.brand_family[self.brands[current]]
            if current in seen:
                continue
            seen.add(current)
            family.append(current)
            pending.extend(b.processed_text() for b in children)
        return family

In [9]:
# Build the brand index from the test-set brands; list() turns the collection of
# unique brand values into the plain list the constructor expects.
bc = BrandCollection(list(brands_test))

In [10]:
# Smoke run: look up every test brand in the index built from those same brands.
# The results are only kept in `bcr` / `r` for the last brand; nothing is checked.
for el in list(list(brands_test)):
    bcr, r = bc.get_match(el)
    #print(el, bcr, r)
# Cell output: the brand family and relevance returned for one example query.
bc.get_match('michael michael kors')

(['michael michael kors', 'michael kors', 'michael kors plus'], 1)

### Product name matching VSM - BM25

In [11]:
class TextRelevance:
    """A text paired with its relevance score.

    Equality and hashing use ``text`` only, so two results with the same text
    but different scores are treated as duplicates (e.g. by
    ``dict.fromkeys``).
    """

    def __init__(self, text, relevance):
        self.text = text
        self.relevance = relevance

    def __repr__(self):
        return f'{self.text} {self.relevance}'

    def __eq__(self, other):
        # Returning NotImplemented lets Python fall back to its default
        # comparison instead of raising AttributeError on foreign objects.
        if not isinstance(other, TextRelevance):
            return NotImplemented
        return self.text == other.text

    def __hash__(self):
        return hash(self.text)

In [18]:
class BM25Z(BM25L):
    """BM25L ranker over titles normalised with ``TextTransformer``.

    The corpus is a list of raw title strings. Each title is normalised and
    split on whitespace before being handed to ``BM25L``; queries get the
    same treatment.
    """

    def __init__(self, corpus):
        # Normalise and tokenise once, then reuse the result for both the
        # BM25 statistics and the stored corpus.
        tokenized = self.process_list(corpus)
        super().__init__(tokenized)
        self.corpus = tokenized

    def process_list(self, titles_list):
        """Return one list of normalised tokens per title."""
        return [self.processed_text(text).split() for text in titles_list]

    def processed_text(self, text):
        """Return the normalised form of ``text``."""
        return TextTransformer(text).processed_text()

    def get_corpus_str(self):
        """Return the stored corpus as normalised strings, one per document."""
        return [' '.join(el) for el in self.corpus]

    def get_corpus_scores(self, query):
        """Return a ``TextRelevance`` per corpus document, in corpus order."""
        query = self.processed_text(query).split()
        return [TextRelevance(x, y) for x, y in zip(self.get_corpus_str(), self.get_scores(query))]

    def get_relevant_results(self, query):
        """Return documents scoring above 0 for ``query``, best first.

        Duplicate titles are not removed here.
        """
        hits = (el for el in self.get_corpus_scores(query) if el.relevance > 0)
        # sorted() is stable, so equally scored documents keep corpus order.
        return sorted(hits, key=lambda x: x.relevance, reverse=True)

In [30]:
# Split the training offers by shop. Both frames keep the index labels of
# offers_training_df, so their labels are generally not contiguous.
zalando_prod_training = offers_training_df.loc[offers_training_df['shop'] == 'zalando']
aboutyou_prod_training = offers_training_df.loc[offers_training_df['shop'] == 'aboutyou']
# Columns of interest when inspecting offers (only used by the commented-out line below).
fields = ['shop','offer_id', 'brand', 'title']
#aboutyou_prod_training[fields]
# Brand index built from the zalando brands only.
brand_collection = BrandCollection(list(zalando_prod_training['brand'].unique()))
#brand_collection.get_match('pieces')

def product_likely_matches(n):
    """
    Find the aboutyou offers that are likely to match one zalando offer.

    n is the index label of the offer in ``zalando_prod_training`` (looked up
    with ``.loc``), not a positional row number and not an aboutyou row.

    Steps:
      1. resolve the zalando brand to a brand family via ``brand_collection``;
      2. keep the aboutyou offers whose normalised brand is in that family;
      3. rank their titles against the zalando title with BM25 and keep every
         offer whose title scores above 0.

    Returns a dict ``{'zalando_id': ..., 'aboutyou_ids': [...]}``.
    ``zalando_id`` is None when no zalando offer has label ``n``.
    """
    try:
        prod_to_match = zalando_prod_training.loc[n]
        id_prod = prod_to_match['offer_id']
    except (KeyError, IndexError):
        # Only a failed lookup means "no such offer"; any other error is a
        # real problem and is allowed to propagate.
        return {'zalando_id' : None, 'aboutyou_ids' : []}

    # get subset of possible brands to investigate
    match_brands, _ = brand_collection.get_match(prod_to_match['brand'])
    aboutyou_brands = aboutyou_prod_training['brand']\
                      .apply(lambda x: TextTransformer(x).processed_text())
    aboutyou_prod_brand_match = aboutyou_prod_training.loc[aboutyou_brands.isin(match_brands)]

    if aboutyou_prod_brand_match.empty:
        return {'zalando_id' : id_prod, 'aboutyou_ids' : []}

    bm25matcher = BM25Z(list(aboutyou_prod_brand_match['title']))
    # Normalised titles with a BM25 score above 0; a set also removes duplicates.
    match_titles = {hit.text for hit in bm25matcher.get_relevant_results(prod_to_match['title'])}

    # Map the matching normalised titles back to the aboutyou rows.
    candidate_titles = aboutyou_prod_brand_match['title']\
                       .apply(lambda x: TextTransformer(x).processed_text())
    aboutyou_prod_title_match = aboutyou_prod_brand_match.loc[candidate_titles.isin(match_titles)]
    return {'zalando_id' : id_prod, 'aboutyou_ids' : list(aboutyou_prod_title_match['offer_id'])}

def get_n_product_matches(n=10):
    """
    Collect likely aboutyou matches for the first ``n`` zalando offers.

    Index labels 0, 1, 2, ... are tried in order; labels that are not zalando
    offers are skipped. The scan stops once ``n`` results are collected or the
    largest label in ``zalando_prod_training`` has been tried, so the result
    may hold fewer than ``n`` entries. Assumes integer index labels.

    Returns a list of dicts as produced by ``product_likely_matches``.
    """
    likely_matches = []
    labels = zalando_prod_training.index
    # Upper bound for the scan: without it the loop never ends when fewer
    # than n zalando offers exist.
    last_label = labels.max() if len(labels) else -1
    i = 0
    while len(likely_matches) < n and i <= last_label:
        m = product_likely_matches(i)
        if m['zalando_id'] is not None:
            # Reuse the result instead of running the matcher a second time.
            likely_matches.append(m)
        i += 1
    return likely_matches

# Cell output: likely aboutyou matches for the first 10 zalando offers (default n).
get_n_product_matches()


[{'zalando_id': '02df5ca3-8adc-48fa-bf42-91b41c3ea5a9',
  'aboutyou_ids': ['5eb467d8-6141-4c19-9df4-c6a281469f7a',
   '13402054-b713-46ab-9a19-9688f9f824a1',
   'af7760eb-9494-4a83-8a55-48c9b0c8d21c',
   'cd9ab6ae-04a8-4378-ac6d-286fd5df85ed',
   'f3985e4e-876c-49c7-95e3-fc844802ddb5',
   '01999550-b2b9-4e1b-97c9-b2ad3f8fc61b',
   '3a95f076-d24c-49d9-b0dd-5701d1fa7d8e',
   'd2a0a64c-7498-4fe5-b3bc-f6af227e8590']},
 {'zalando_id': '08c47691-4160-41df-81c5-ea108f2ae539',
  'aboutyou_ids': ['50053f97-20b8-48df-ab3a-d10669f4b998',
   'e74ef2b0-1e7a-45ea-ae05-b04a2c1fdb57',
   '5e5819b9-6a1d-4da2-95be-7243e98e8a77',
   '58e014ba-406e-44ca-82af-49ed8a31d6c5',
   '41ed05a3-7d0e-402e-99b6-c4fa8327eb44',
   '9472e50f-96e5-4fbb-a3e2-fc3ae0a17e74',
   'eea5ec80-ea2f-423b-9e06-492b481b496e',
   'a697aa34-0a7f-4a42-bba1-a2444789c6fc',
   '7bc3bfe2-767d-4991-a5a1-f5b7710696cf',
   'b384f45a-c633-486f-834a-4d419ab5ce76',
   '8deeac4c-c28c-45ad-a660-0dcd963516b6',
   'd37743e5-6de6-4ecd-9b4d-ff1aad7df