### Imports

In [1]:
# libraries
import pandas as pd
import unicodedata
import re
from rank_bm25 import BM25L

# Pandas config
# Setting this option to None silences pandas' chained-assignment
# (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None  # default='warn'

### Load data

In [2]:
# Input tables, read from parquet files relative to the working directory.
# Later cells read the 'zalando' and 'aboutyou' columns of this frame.
title_not_matched = pd.read_parquet('title_not_matched.parquet')
# Later cells read the 'shop', 'offer_id', 'brand', 'title' and 'color' columns.
offers_training_df = pd.read_parquet('offers_training.parquet')
# Not referenced again in the visible part of this notebook.
offers_test_df = pd.read_parquet('offers_test.parquet')
# Later cells use its 'zalando' column to filter the zalando offers.
matches_training_df = pd.read_parquet('matches_training.parquet')

### Process text for data cleaning

In [3]:
# Stop words: whitespace-separated tokens stored in a plain-text file.
# The encoding is pinned so non-ASCII entries decode the same way on every
# platform instead of depending on the locale's default encoding.
# NOTE(review): TextTransformer strips accents before the stop-word lookup, so
# entries that still contain umlauts can never match - confirm the file is
# already accent-free.
with open('german_stopwords.txt', "r", encoding="utf-8") as f:
    stop_word_list = f.read().split()
class TextTransformer:
    """Normalise free text into lower-case, accent-free, space-separated tokens."""

    def processed_text(self, text):
        """Return *text* cleaned for matching.

        The text is lower-cased, stripped of accents, every character that is
        neither alphanumeric nor a space is replaced by a space, and tokens
        found in the module-level ``stop_word_list`` are dropped. The
        remaining tokens are joined with single spaces.

        Any value that is not a ``str`` (``None``, a float NaN, ...) is
        treated as missing and yields an empty string.
        """
        # Missing values in a pandas column show up as None or as a float NaN;
        # neither has .lower(), so anything that is not a string is "no text".
        if not isinstance(text, str):
            return ''
        processed = self.simplify(text.lower())
        # Replace special characters by spaces so they act as token separators.
        processed = ''.join(c if c.isalnum() or c == ' ' else ' ' for c in processed)
        # split() without arguments already collapses repeated spaces and
        # ignores leading/trailing ones, so no separate regex/strip is needed.
        return ' '.join(
            word for word in processed.split() if word not in stop_word_list
        )

    def simplify(self, text):
        """Return *text* with accents removed and other non-ASCII characters dropped."""
        # NFD splits an accented letter into base letter + combining mark; the
        # ASCII round trip then discards the marks (and anything else non-ASCII).
        decomposed = unicodedata.normalize('NFD', text)
        return decomposed.encode('ascii', 'ignore').decode('ascii')
def _read_lines(path):
    """Return the lines of the UTF-8 text file *path* without line endings."""
    with open(path, encoding='utf-8') as f:
        return f.read().splitlines()


def _read_mapping(path):
    """Return the ``key:value`` lines of the UTF-8 text file *path* as a dict.

    Each line is split at its first colon; the value is stripped of
    surrounding whitespace, the key is kept as written. Blank lines are
    skipped. A non-blank line without a colon raises ``ValueError``.
    """
    mapping = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # e.g. a trailing empty line at the end of the file
                continue
            key, val = line.split(':', 1)
            mapping[key] = val.strip()
    return mapping


# Word lists used for the color column.
color_kfix_list = _read_lines('color_processing/kfix_de.txt')
primary_colors = _read_lines('color_processing/primary_colors_de.txt')
color_matches = _read_mapping('color_processing/color_matches_de.txt')

# Word lists used for the title column.
clothes_kfix_list = _read_lines('title_processing/kfix_de.txt')
primary_clothes = _read_lines('title_processing/primary_clothes_de.txt')
clothes_matches = _read_mapping('title_processing/clothes_matches_de.txt')
class SpecificTrasformer(TextTransformer):
    """Text transformer that additionally rewrites each token with word lists.

    kfix_list: substrings removed from every token.
    matches:   mapping of substring -> replacement applied to every token.
    primary:   words that get surrounded by spaces when found inside a token.
    """

    def __init__(self, kfix_list, matches, primary):
        self.kfix_list = kfix_list
        self.matches = matches
        self.primary = primary

    def separe_word(self, text, word):
        """Put a space before and after the first occurrence of *word* in *text*."""
        start = text.find(word)
        if start == -1:
            return text
        end = start + len(word)
        return text[:start] + ' ' + text[start:end] + ' ' + text[end:]

    def separe_words(self, text):
        """Apply ``separe_word`` for every entry of ``self.primary``, in order."""
        result = text
        for primary_word in self.primary:
            result = self.separe_word(result, primary_word)
        return result

    def replace_words(self, text):
        """Replace every key of ``self.matches`` found in *text* by its value."""
        result = text
        for old, new in self.matches.items():
            result = result.replace(old, new)
        return result

    def remove_kfix(self, text):
        """Delete every occurrence of each ``self.kfix_list`` entry from *text*."""
        result = text
        for fragment in self.kfix_list:
            result = result.replace(fragment, '')
        return result

    def processed_text(self, text):
        """Clean *text* with the base transformer, then rewrite each token.

        Per token: apply the replacements, remove the kfix entries, then
        separate the primary words. The tokens are re-joined with single
        spaces.
        """
        tokens = []
        for token in super().processed_text(text).split():
            token = self.replace_words(token)   # 1: transform matches
            token = self.remove_kfix(token)     # 2: kfix removal
            tokens.append(self.separe_words(token))  # 3: separate primary words
        joined = ' '.join(tokens).strip()
        return re.sub(' +', ' ', joined)

### Instance of text transformers

In [4]:
# Generic cleaner (used for brands and for BM25 queries).
tt = TextTransformer()
# Cleaner configured with the color word lists.
ct = SpecificTrasformer(
    kfix_list=color_kfix_list,
    matches=color_matches,
    primary=primary_colors,
)
# Cleaner configured with the clothes/title word lists.
titlet = SpecificTrasformer(
    kfix_list=clothes_kfix_list,
    matches=clothes_matches,
    primary=primary_clothes,
)

### Clean DFs

In [None]:
def _clean_offers(offers):
    """Return a copy of *offers* with normalised 'brand', 'title' and 'color'.

    Brands go through ``tt``, titles through ``titlet`` and colors through
    ``ct``. Working on an explicit copy means the column assignments never
    target a slice of the source frame.
    """
    cleaned = offers.copy()
    cleaned['brand'] = cleaned['brand'].apply(tt.processed_text)
    cleaned['title'] = cleaned['title'].apply(titlet.processed_text)
    cleaned['color'] = cleaned['color'].apply(ct.processed_text)
    return cleaned


# Zalando offers are restricted to those listed in the 'zalando' column of the
# training matches before cleaning.
zalando_prod_training = offers_training_df.loc[offers_training_df['shop'] == 'zalando']
zalando_prod_training = _clean_offers(
    zalando_prod_training.loc[
        zalando_prod_training['offer_id'].isin(matches_training_df['zalando'])
    ]
)

# All aboutyou offers are kept.
aboutyou_prod_training = _clean_offers(
    offers_training_df.loc[offers_training_df['shop'] == 'aboutyou']
)

### BM25 setup

In [None]:
class TextRelevance:
    """A text paired with its relevance score.

    Equality and hashing consider only ``text``; two instances with the same
    text but different relevance compare equal.
    """

    def __init__(self, text, relevance):
        self.text = text
        self.relevance = relevance

    def __repr__(self):
        return self.text + ' ' + str(self.relevance)

    def __eq__(self, other):
        # Returning NotImplemented lets Python fall back to its default
        # comparison instead of failing on objects without a .text attribute.
        if not isinstance(other, TextRelevance):
            return NotImplemented
        return self.text == other.text

    def __hash__(self):
        return hash(self.text)

In [None]:
class BM25Z(BM25L):
    """BM25L index over whitespace-tokenised texts that returns scored texts."""

    def __init__(self, corpus):
        """Index *corpus*, an iterable of strings; ``None`` entries are skipped."""
        # Tokenise exactly once: doing it twice doubles the work and, for a
        # one-shot iterable, would leave self.corpus empty.
        tokenised = self.process_list(corpus)
        super().__init__(tokenised)
        self.corpus = tokenised

    def process_list(self, titles_list):
        """Split every non-None text of *titles_list* on whitespace."""
        return [text.split() for text in titles_list if text is not None]

    def processed_text(self, text):
        """Clean *text* with the module-level ``tt`` transformer."""
        return tt.processed_text(text)

    def get_corpus_str(self):
        """Return the indexed documents re-joined into strings."""
        return [' '.join(el) for el in self.corpus]

    def get_corpus_scores(self, query):
        """Return one ``TextRelevance`` per indexed document for *query*.

        The query is cleaned and tokenised before it is passed to the
        inherited ``get_scores``.
        """
        query = self.processed_text(query).split()
        return [
            TextRelevance(text, score)
            for text, score in zip(self.get_corpus_str(), self.get_scores(query))
        ]

    def get_relevant_results(self, query, threshold=0):
        """Return the documents scoring above *threshold*, best first."""
        relevant = (
            el for el in self.get_corpus_scores(query) if el.relevance > threshold
        )
        return sorted(relevant, key=lambda el: el.relevance, reverse=True)

### Getting all non matching titles

In [None]:
# Keep only the aboutyou offers whose id appears in the 'aboutyou' column of
# title_not_matched, then index their cleaned titles.
title_not_matching_offers = aboutyou_prod_training[
    aboutyou_prod_training['offer_id'].isin(title_not_matched['aboutyou'])
]
bm25matcher_title = BM25Z(title_not_matching_offers['title'].tolist())

### Identify common substrings

In [None]:
def longest_substring(str_1, str_2):
    """Return the longest substring of at least 3 characters shared by both strings.

    Candidates are taken from the shorter string (``str_2`` when the lengths
    are equal) and looked up in the other one. Among candidates of the same
    length, the one starting earliest in the shorter string wins.

    Returns ``None`` when either string has fewer than 3 characters or when
    the strings share no substring of length 3 or more.
    """
    # The two roles must always go to different strings: with equal lengths
    # one string is split and the *other* one is searched.
    if len(str_1) < len(str_2):
        to_split, to_comp = str_1, str_2
    else:
        to_split, to_comp = str_2, str_1
    # to_split is the shorter one, so this also covers to_comp.
    if len(to_split) < 3:
        return None
    # Walk candidates from longest to shortest and stop at the first hit,
    # instead of materialising and sorting every substring up front.
    for length in range(len(to_split), 2, -1):
        for start in range(len(to_split) - length + 1):
            candidate = to_split[start:start + length]
            if candidate in to_comp:
                return candidate
    return None
        
def next_sub_substr(to_split):
    """Return *to_split* without its last character."""
    return to_split[:-1]

def next_sum_substr(to_split):
    """Return *to_split* without its first character."""
    return to_split[1:]

def all_sub_substr(to_split):
    """Return *to_split* followed by its shorter prefixes down to length 3.

    The input itself is always the first element, even when it is shorter
    than 3 characters.
    """
    shorter_prefixes = [
        to_split[:end] for end in range(len(to_split) - 1, 2, -1)
    ]
    return [to_split] + shorter_prefixes

def all_substr(to_split):
    """Return the substrings of *to_split* gathered from its suffixes, longest first.

    For the string itself and for each suffix obtained by repeatedly dropping
    the first character (``len(to_split) - 3`` times), the prefixes produced
    by ``all_sub_substr`` are collected. The result is ordered by decreasing
    length; entries of equal length keep their collection order.
    """
    suffixes = [to_split]
    for _ in range(len(to_split) - 3):
        suffixes.append(next_sum_substr(suffixes[-1]))
    collected = []
    for suffix in suffixes:
        collected.extend(all_sub_substr(suffix))
    # list.sort is stable, so equal-length entries stay in collection order.
    collected.sort(key=len, reverse=True)
    return collected

### Mine common substrings
e.g. Canvasgürtel, Ledergürtel -> gürtel

In [None]:
def mine_common_terms():
    """Return the set of terms shared by title pairs that BM25 fails to relate.

    For every (zalando, aboutyou) id pair in ``title_not_matched`` the two
    cleaned titles are looked up. When the aboutyou title is not among the
    BM25 results for the zalando title, the longest common substring of the
    two titles is recorded. Terms containing a space are discarded, and so is
    every term that is a proper substring of another recorded term.
    """
    candidates = set()
    # Iterate over the column values directly so the result does not depend
    # on title_not_matched having a default 0..n-1 index.
    for z_id, a_id in zip(title_not_matched['zalando'], title_not_matched['aboutyou']):
        z_ti = zalando_prod_training.loc[
            zalando_prod_training['offer_id'] == z_id, 'title'
        ].values[0]
        a_ti = aboutyou_prod_training.loc[
            aboutyou_prod_training['offer_id'] == a_id, 'title'
        ].values[0]
        rel_titles = {el.text for el in bm25matcher_title.get_relevant_results(z_ti)}
        if a_ti in rel_titles:
            continue
        com = longest_substring(z_ti, a_ti)
        if com is not None and ' ' not in com:
            candidates.add(com)
    # Drop every term that is strictly contained in a longer recorded term.
    return {
        el for el in candidates
        if not any(el in term and len(el) < len(term) for term in candidates)
    }

In [None]:
# Run the mining step; the result is a set of terms.
common_updated = mine_common_terms()

### These terms will be added to the word list that is read from
title_processing/primary_clothes_de.txt

In [None]:
# common_updated is a set, whose iteration order can change between runs;
# sorting makes the printed list reproducible.
for term in sorted(common_updated):
    print(term)