### Imports

In [1]:
# libraries
import pandas as pd
import unicodedata
import re
from rank_bm25 import BM25L

# Pandas config
pd.options.mode.chained_assignment = None  # default='warn'

### Load data

In [2]:
title_not_matched = pd.read_parquet('data/title_not_matched.parquet')
color_not_matched = pd.read_parquet('data/color_not_matched.parquet')

offers_training_df = pd.read_parquet('../data/offers_training.parquet')
offers_test_df = pd.read_parquet('../data/offers_test.parquet')
matches_training_df = pd.read_parquet('../data/matches_training.parquet')

### Process text for data cleaning

In [3]:
stop_word_list = []
with open('./text_processing/german_stopwords.txt', "r") as f:
    stop_word_list = f.read().split()
class TextTransformer:
    def processed_text(self, text):
        if text is None:
            return ''
        #lower
        processed = text.lower()
        #remove accents
        processed = self.simplify(processed)
        #remove special characters
        processed = ''.join(c if c.isalnum() or c == ' ' else ' ' for c in processed)
        #remove unnecessary double spaces
        processed = re.sub(' +', ' ', processed)
        #strip
        processed = processed.strip()
        #remove stopwords
        processed_list = [word for word in processed.split() if word not in stop_word_list]
        return ' '.join(processed_list)
    
    def simplify(self, text):
        try:
            text = unicode(text, 'utf-8')
        except NameError:
            pass
        text = unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode("utf-8")
        return str(text)
with open('color_processing/kfix_de.txt') as f:
    color_kfix_list = f.read().splitlines() 

with open('color_processing/primary_colors_de.txt') as f:
    primary_colors = f.read().splitlines() 
color_matches = {}
with open('color_processing/color_matches_de.txt') as f:
    for line in f:
        (key, val) = line.split(':')
        val = val.strip()
        color_matches[key] = val
with open('title_processing/kfix_de.txt') as f:
    clothes_kfix_list = f.read().splitlines() 

with open('title_processing/primary_clothes_de.txt') as f:
    primary_clothes = f.read().splitlines() 
clothes_matches = {}
with open('title_processing/clothes_matches_de.txt') as f:
    for line in f:
        (key, val) = line.split(':')
        val = val.strip()
        clothes_matches[key] = val
class SpecificTrasformer(TextTransformer):
    def __init__(self,
                 kfix_list, 
                 matches, 
                 primary):
        self.kfix_list = kfix_list
        self.matches = matches
        self.primary = primary
    def separe_word(self, text, word):
        i = text.find(word)
        text = text[:i] + ' ' + text[i:] if i != -1 else text
        return text[:i+len(word)+1] + ' ' + text[i+len(word)+1:] if i != -1 else text
    def separe_words(self, text):
        for color in self.primary:
            text = self.separe_word(text, color)
        return text
    def replace_words(self, text):
        for k, v in self.matches.items():
            text = text.replace(k, v)
        return text
    def remove_kfix(self, text):
        for suffix in self.kfix_list:
            text = text.replace(suffix, '')
        return text
    
    def processed_text(self, text):
        splitted = super().processed_text(text).split()
        #1 transform matches
        splitted = [self.replace_words(text) for text in splitted]
        #2 suffix removal
        splitted = [self.remove_kfix(text) for text in splitted]
        #separate primary colors
        splitted = [self.separe_words(term) for term in splitted]
        return re.sub(' +', ' ', ' '.join(splitted).strip())

### Instance of text transformers

In [4]:
tt = TextTransformer()
ct = SpecificTrasformer(color_kfix_list, color_matches, primary_colors)
titlet = SpecificTrasformer(clothes_kfix_list, clothes_matches, primary_clothes)

### Clean DFs

In [5]:
zalando_prod_training = offers_training_df.loc[offers_training_df['shop'] == 'zalando']
zalando_prod_training = zalando_prod_training\
                    .loc[zalando_prod_training['offer_id'].isin(matches_training_df['zalando'])]
zalando_prod_training['brand'] = zalando_prod_training['brand']\
                    .apply(lambda x: tt.processed_text(x))
zalando_prod_training['title'] = zalando_prod_training['title']\
                    .apply(lambda x: titlet.processed_text(x))
zalando_prod_training['color'] = zalando_prod_training['color']\
                    .apply(lambda x: ct.processed_text(x))

aboutyou_prod_training = offers_training_df.loc[offers_training_df['shop'] == 'aboutyou']
aboutyou_prod_training['brand'] = aboutyou_prod_training['brand']\
                    .apply(lambda x: tt.processed_text(x))
aboutyou_prod_training['title'] = aboutyou_prod_training['title']\
                    .apply(lambda x: titlet.processed_text(x))
aboutyou_prod_training['color'] = aboutyou_prod_training['color']\
                    .apply(lambda x: ct.processed_text(x))

### BM25 setup

In [6]:
class TextRelevance:
    def __init__(self, text, relevance):
        self.text = text
        self.relevance = relevance
    def __repr__(self):
        return self.text + ' ' + str(self.relevance)
    def __eq__(self, other):
        return self.text == other.text
    def __hash__(self):
        return hash(self.text)

In [7]:
class BM25Z(BM25L):
    def __init__(self, corpus):
        super().__init__(self.process_list(corpus))
        self.corpus = self.process_list(corpus)
    def process_list(self, titles_list):
        return [text.split() for text in titles_list if text is not None]
    def processed_text(self, text):
        return tt.processed_text(text)
    def get_corpus_str(self):
        return [' '.join(el) for el in self.corpus]
    def get_corpus_scores(self, query):
        query = self.processed_text(query).split()
        return [TextRelevance(x,y) for x, y in zip(self.get_corpus_str(), self.get_scores(query))]
    def get_relevant_results(self, query, threshold=0):
        res = [el for el in self.get_corpus_scores(query) if el.relevance > threshold]
        res = sorted(res, key=lambda x: x.relevance, reverse=True)
        return res

### Getting all non matching titles

In [8]:
title_not_matching_offers = aboutyou_prod_training.loc[
    aboutyou_prod_training['offer_id'].isin(title_not_matched['aboutyou'])
]
bm25matcher_title = BM25Z(list(title_not_matching_offers['title']))

### Getting all non matching colors

In [9]:
color_not_matching_offers = aboutyou_prod_training.loc[
    aboutyou_prod_training['offer_id'].isin(color_not_matched['aboutyou'])
]
bm25matcher_color = BM25Z(list(color_not_matching_offers['color']))

### Identify common substrings

In [10]:
def longest_substring(str_1, str_2):
    to_split = str_1 if len(str_1) < len(str_2) else str_2
    to_comp = str_1 if len(str_1) > len(str_2) else str_2
    if len(to_split) < 3 or len(to_comp) < 3:
        return None
    sub = all_substr(to_split)
    for el in sub:
        if el in to_comp:
            return el
    return None
        
def next_sub_substr(to_split):
    l = len(to_split)
    return to_split[:l-1] 

def next_sum_substr(to_split):
    l = len(to_split)
    return to_split[1:] 

def all_sub_substr(to_split):
    l = [to_split]
    for i in range(len(to_split) - 3):
        to_split = next_sub_substr(to_split)
        l += [to_split]
    return l

def all_substr(to_split):
    l = all_sub_substr(to_split)
    size = len(to_split) - 3
    for i in range(size):
        to_split = next_sum_substr(to_split)
        l += all_sub_substr(to_split)
    return sorted(l, key=len, reverse=True)

### Mine common substrings
e.g. Canvasgürtel, Ledergürtel -> gürtel

In [11]:
def mine_common_terms(not_match_df, matcher, attribute):
    common_terms = []
    for i in range(len(not_match_df)):
        z_id = not_match_df.loc[i]['zalando']
        a_id = not_match_df.loc[i]['aboutyou']
        z_ti = zalando_prod_training.loc[zalando_prod_training['offer_id'] == z_id][attribute].values[0]
        a_ti = aboutyou_prod_training.loc[aboutyou_prod_training['offer_id'] == a_id][attribute].values[0]
        rel_titles = [el.text for el in matcher.get_relevant_results(z_ti)]
        if a_ti not in rel_titles:
            com = longest_substring(z_ti, a_ti)
            common_terms += [com] if com is not None else []
    common_terms = [el for el in common_terms if ' ' not in el]
    filter_common_terms = []
    for el in common_terms:
        for term in common_terms:
            if el in term and len(el) < len(term):
                filter_common_terms += [el]
    to_rem = set(filter_common_terms)
    common_terms = [el for el in common_terms if el not in to_rem]
    return set(common_terms)

In [12]:
title_common_updated = mine_common_terms(title_not_matched, bm25matcher_title, 'title')
color_common_updated = mine_common_terms(color_not_matched, bm25matcher_color, 'color')

### These terms will be added to a list to read from
primary_clothes_de.txt

In [13]:
for term in title_common_updated:
    print(term)

ack
ron
nte
rist
lad
tanga
lei
analoguhr
and
rte
ony
aurora
anzug
sch


### These terms will be added to a list to read from
primary_colors_de.txt

In [14]:
for term in color_common_updated:
    print(term)

ros
ran
aubergine
rau
sch


In [15]:
def print_attributes_values(z_id, a_id, attribute):
    z_a = zalando_prod_training\
                        .loc[zalando_prod_training['offer_id'] == z_id][attribute].values[0]
    a_a = aboutyou_prod_training\
                        .loc[aboutyou_prod_training['offer_id'] == a_id][attribute].values[0]
    print(z_a, '-', a_a)

In [29]:
for i in color_not_matched.index:
    z_id = color_not_matched.loc[i]['zalando']
    a_id = color_not_matched.loc[i]['aboutyou']
    print_attributes_values(z_id, a_id, 'color')
#matches_training_df.loc[0]['zalando']

grun - braun braun
braun - gelb gelb
grun - senf gelb
schwarz - dunkel grun grun
braun - wei wei
dunkel blau - schwarz schwarz
braun - wei wei
grau meliert - braun wei
dunkel blau - wei schwarz schwarz
schwarz - dunkel grun grun
braun - wei wei
grun - braun schwarz braun
schwarz - braun gold braun
rot - schwarz schwarz
gold - creme braun
dunkel grau - grun grun
braun - hell grau grau
schwarz - grau grau
wei - creme braun hell braun schwarz braun
grun - brokat braun
braun - hell grau grau
grau - schwarz schwarz
grun - aqua blau
schwarz blau - dunkel grau grau
braun - rosa rosa
definiert - alt rosa rosa
blau - misch misch
orange - rost braun braun
misch - blau blau cranberry blau
braun - schwarz wei schwarz wei
wei - braun braun
dunkel blau - misch misch
hell braun - taupe grau
orange - schwarz braun rot gelb braun
dunkel blau - schwarz schwarz
grun - braun braun
wei - transparent gold gold
grun - dunkel grau grau
rot - orange orange
definiert - wei
orange - rost braun braun
schwarz - mi

braun - melone rot
schwarz - basalt grau grau
misch - rosa wei hell blau blau wei rosa
misch - grau meliert dunkel grau jade hell blau grau
grun - greige grau
rosa - braun braun
wei - silber silber
braun - wei wei
braun - rosa rosa
rosa - kirsch rot rot
grun meliert - schwarz braun braun
wei - silber transparent silber
dunkel gelb - orange orange
grun - wei schwarz wei
braun - silber silber
braun - wei wei
wei - silber silber
rot - aubergine rosa
taupe - braun braun
rosa - hummer orange
braun - creme wei
senf - honig gelb
hell braun - silber grau schwarz grau
grun - aqua blau
misch - silber grun blau silber
grun - braun braun
braun - wei wei
rosa - braun braun
misch - creme rot grau grun wei
braun - rosa
senf - braun braun
misch - schwarz grau schwarz grau
braun - wei wei
braun - schwarz rosa grun misch
dunkel blau - grau grau
braun - wei wei
grun - braun braun
schwarz - grau meliert grau
braun ne - blau grun
definiert - transparent
braun - grun grun
braun - wei schwarz kitt wei
rot - 

schwarz - dunkel braun braun
misch - blau senf wei blau schwarz blau
senf - safran gelb
grun - taupe grau
wei - creme schwarz braun
rosa - hell blau blau
braun - wei wei
braun - schwarz blau schwarz
schwarz - dunkel grau misch braun
rot metallic - mokka braun
schwarz - dunkel grau grau grau
grun - creme rauch blau rot braun blau
blau - braun braun
rosa - grun grun
schwarz - wei braun braun
misch - schwarz grun curry apricot schwarz
grau - dunkel grun wei grun
wei - gold creme braun
hell grau meliert - wei wei
braun - hell gelb gelb
braun - schwarz wei wei
rosa - rot rot
grau - braun braun
misch - woll wei hell blau grun wei
braun - woll wei wei
braun - woll wei wei
blau - misch misch
rot - braun wei braun
definiert - transparent
braun - wei schwarz wei
dunkel grun - misch misch
misch - braun creme braun braun braun
misch - wei hell grun orange orange
rot - dunkel orange orange
rot - rosa rosa
grau - wei wei
dunkel grau - schwarz schwarz
braun - wei hell rosa hell blau wei
schwarz - gra

grun - dunkel blau blau
misch - wei blau blau
orange - braun braun
dunkel grau - schwarz blau schwarz
braun - orange rosa blau rosa
grun - hell braun braun
rosa - hell blau blau
rot - dunkel rosa rosa
grau - blau blau
stein - wei wei
misch - wei schwarz gelb rosa wei
braun - creme misch wei
misch - rosa blau rosa
grun - schwarz schwarz
schwarz - grau grau
hell braun - alt rosa wei rosa
grun - azur wei blau
rosa - misch misch
misch - braun braun braun
braun - hell grau schwarz rosa alt rosa misch
rosa - nacht blau wei rot blau
grau - nacht blau orange blau
dark blau blau - schwarz schwarz
grau metallic - schwarz wei taupe schwarz
definiert - wei
braun - silber silber
schwarz meliert - grau grau
misch - rosa blau blau rosa
ocker - braun gold braun
stein - dunkel grau nacht blau wei grau
misch - dunkel rosa grau meliert himbeer rosa braun rosa grau
rosa - braun misch braun
braun - wei schwarz wei
misch - transparent blau gold gold
grau - taupe rauch blau hell rosa senf braun blau rosa ora

hell grun - grau grau
grun - aqua blau
wei - silber gold silber gold
blau - schwarz orange wei rauch grau grau
braun - ecru wei
wei - hell blau blau
schwarz - misch misch
rosa - hell braun braun
schwarz - dunkel grun grun
hell braun - safran gelb
schwarz - braun hell braun braun
dunkel gelb meliert - braun braun
misch - creme braun
blau - rosa rosa
taupe - kitt braun
schwarz - braun wei braun
misch - dunkel rosa nacht blau hell blau rot orange blau
schwarz - wei wei misch
rot - dunkel rosa rosa
dunkel blau - grau meliert grau
hell grun - creme braun
grun - hell braun braun
blau - jade grun
misch - wei grun rosa dunkel rosa rosa
braun - woll wei wei
braun - wei wei
wei - hell gelb braun gelb
braun - wei grau misch wei
grun - hell braun braun
dunkel blau - schwarz rosa gelb grun misch
wei - creme braun
braun - woll wei wei
blau - braun grun wei
definiert - gold
rosa - grau rot kobalt blau wei n rot rot
mauve - braun braun
braun - wei wei
hell blau - braun grun wei
schwarz - silber silber

In [30]:
len(color_not_matched)

1568