In [24]:
# libraries
import os
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image
import urllib
from random import choices
from itertools import chain
# Levenshtein Distance in Python
import textdistance
import re
import unicodedata
# https://github.com/seatgeek/thefuzz
from thefuzz import fuzz, process

# Matplotlib configuration
font = { 'family': 'DejaVu Sans', 'weight': 'bold', 'size': 16 }
plt.rc('font', **font)

# Pandas config
pd.options.mode.chained_assignment = None  # default='warn'

In [4]:
# set random seed
np.random.seed(seed=42)

### Offers training

In [5]:
offers_training_df = pd.read_parquet('offers_training.parquet')

## Brand analysis

### Brand text processing
- Lower case text
- accented vowels removal `è, é, ... -> e`

In [87]:
def similarity(str_1, str_2):
    return textdistance.levenshtein.normalized_similarity(str_1, str_2)

In [27]:
brands_training = offers_training_df['brand'].unique()

#remove accents and lower text
brands = [simplify(b.lower()) for b in list(brands_training)]

#remove special characters
brands = [''.join(c if c.isalnum() or c == ' ' else ' ' for c in string) for string in brands]

#remove unnecessary double spaces
brands = [re.sub(' +', ' ', t) for t in brands]

#create a set
brands = set(brands)

#divide brand names in list of words
brands = [t.split() for t in brands]

brands

[['ted', 'baker'],
 ['nuance'],
 ['cras'],
 ['fritzi', 'aus', 'preuen'],
 ['vero', 'moda', 'tall'],
 ['hot', 'potatoes'],
 ['bruuns', 'bazaar'],
 ['free', 'people'],
 ['saucony'],
 ['davida', 'cashmere'],
 ['petite', 'fleur'],
 ['ellesse'],
 ['camano'],
 ['zarkoperfume'],
 ['swarovski'],
 ['pieces', 'maternity'],
 ['jacky', 'baby'],
 ['mennace'],
 ['vero', 'moda', 'petite'],
 ['little', 'pieces'],
 ['more', 'more'],
 ['huf'],
 ['jako'],
 ['vero', 'moda', 'curve'],
 ['selected', 'femme', 'petite'],
 ['happy', 'socks'],
 ['underprotection'],
 ['liu', 'jo'],
 ['garment', 'project'],
 ['jolana', 'fenena'],
 ['luhta'],
 ['lascana'],
 ['selected', 'femme', 'tall'],
 ['fuchs', 'schmitt'],
 ['guess'],
 ['bullboxer'],
 ['local', 'heroes'],
 ['liu', 'jo', 'jeans'],
 ['didriksons'],
 ['esme', 'studios'],
 ['denim', 'project'],
 ['envie', 'de', 'fraise'],
 ['see', 'by', 'chloe'],
 ['kaffe'],
 ['polaroid'],
 ['etam'],
 ['mamalicious'],
 ['vero', 'moda', 'maternity'],
 ['mother'],
 ['quiksilver'],
 

### Class for brand management

In [83]:
class Brand:
    def __init__(self, name, parent=None):
        self.name = name.lower().title()
        self.parent = parent
    
    def simplify(self, text):
        try:
            text = unicode(text, 'utf-8')
        except NameError:
            pass
        text = unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode("utf-8")
        return str(text)
    
    def processed_text(self):
        #lower
        processed = self.name.lower()
        #remove accents
        processed = self.simplify(processed)
        #remove special characters
        processed = ''.join(c if c.isalnum() or c == ' ' else ' ' for c in processed)
        #remove unnecessary double spaces
        processed = re.sub(' +', ' ', processed)
        return processed
    
    def __str__(self):
        brand_str = 'name: ' + self.name
        if self.parent is not None:
            brand_str += ' -> parent: ' + str(self.parent)
        return brand_str

In [84]:
class BrandCollection:
    def __init__(self, brand_list):
        self.brands = {}
        for el in brand_list:
            self.process_brand(Brand(el))
    
    def process_brand(self, brand):
        if brand.processed_text() not in self.brands:
            parent_likelyhood = 0
            for k in self.brands.keys():
                comp = self.brands[k]
                l_comp = self.listify(comp, brand)
                l_brand = self.listify(brand, comp)
                calc_likelyhood = self.parent_likelyhood(l_comp, l_brand)
                if calc_likelyhood > parent_likelyhood:
                    parent_likelyhood = calc_likelyhood
                    brand.parent = comp
            self.brands[brand.processed_text()] = brand
    
    def listify(self, brand, to_compare):
        l_brand = brand.processed_text().split()
        l_comp = to_compare.processed_text().split()
        max_len = len(l_brand) if len(l_brand) > len(l_comp) else len(l_comp)
        if len(l_brand) == max_len:
            return l_brand
        for i in range(max_len - len(l_brand)):
            l_brand += ['']
        return l_brand
    
    def parent_likelyhood(self, l_comp, l_brand):
        likelyhood = 0
        #does not check for combinations
        for i in range(len(l_comp)):
            if l_comp[i] == l_brand[i]:
                likelyhood += 1
            else:
                return likelyhood
        return likelyhood

In [85]:
bc = BrandCollection(list(brands_training))

In [86]:
for k in bc.brands.keys():
    print(bc.brands[k])

name: Pieces
name: Lascana
name: Mamalicious
name: Rosemunde
name: Guess
name: Ellesse
name: Free People
name: Bullboxer
name: Selected
name: Kaffe
name: Zizzi
name: Jette
name: Vero Moda
name: Quiksilver
name: Herrlicher
name: Envie De Fraise
name: Saucony
name: Ag Jeans
name: Etam
name: More & More
name: Selected Homme -> parent: name: Selected
name: Jako
name: Panama Jack
name: Rich & Royal
name: Fritzi Aus Preußen
name: Gestuz
name: Petite Fleur
name: Liu Jo
name: Pieces Maternity -> parent: name: Pieces
name: Jolana & Fenena
name: Peak Performance
name: Colmar
name: Farah
name: Swarovski
name: Selected Femme -> parent: name: Selected
name: Denim Project
name: Mennace
name: Garment Project
name: Vivance
name: Verbenas
name: Selected Femme Petite -> parent: name: Selected Femme -> parent: name: Selected
name: Cras
name: Bree
name: Mother
name: Flip*Flop
name: Dockers
name: Libertine-Libertine
name: Vero Moda Curve -> parent: name: Vero Moda
name: Vero Moda Aware -> parent: name: Ver

In [90]:
similarity('a', 'bbbbbbbbbbbbbbbbbbbbba')

0.045454545454545414