### Imports

In [1]:
# libraries
import pandas as pd
import unicodedata
import re
from rank_bm25 import BM25L

# Pandas config
pd.options.mode.chained_assignment = None  # default='warn'

### Load data

In [2]:
offers_training_df = pd.read_parquet('../data/offers_training.parquet')
offers_test_df = pd.read_parquet('../data/offers_test.parquet')
matches_training_df = pd.read_parquet('../data/matches_training.parquet')

### Process text for data cleaning

In [3]:
stop_word_list = []
with open('./text_processing/german_stopwords.txt', "r") as f:
    stop_word_list = f.read().split()
class TextTransformer:
    def processed_text(self, text):
        if text is None:
            return ''
        #lower
        processed = text.lower()
        #remove accents
        processed = self.simplify(processed)
        #remove special characters
        processed = ''.join(c if c.isalnum() or c == ' ' else ' ' for c in processed)
        #remove unnecessary double spaces
        processed = re.sub(' +', ' ', processed)
        #strip
        processed = processed.strip()
        #remove stopwords
        processed_list = [word for word in processed.split() if word not in stop_word_list]
        return ' '.join(processed_list)
    
    def simplify(self, text):
        try:
            text = unicode(text, 'utf-8')
        except NameError:
            pass
        text = unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode("utf-8")
        return str(text)
with open('color_processing/kfix_de.txt') as f:
    color_kfix_list = f.read().splitlines() 

with open('color_processing/primary_colors_de.txt') as f:
    primary_colors = f.read().splitlines() 
color_matches = {}
with open('color_processing/color_matches_de.txt') as f:
    for line in f:
        (key, val) = line.split(':')
        val = val.strip()
        color_matches[key] = val
with open('title_processing/kfix_de.txt') as f:
    clothes_kfix_list = f.read().splitlines() 

with open('title_processing/primary_clothes_de.txt') as f:
    primary_clothes = f.read().splitlines() 
clothes_matches = {}
with open('title_processing/clothes_matches_de.txt') as f:
    for line in f:
        (key, val) = line.split(':')
        val = val.strip()
        clothes_matches[key] = val
class SpecificTrasformer(TextTransformer):
    def __init__(self,
                 kfix_list, 
                 matches, 
                 primary):
        self.kfix_list = kfix_list
        self.matches = matches
        self.primary = primary
    def separe_word(self, text, word):
        i = text.find(word)
        text = text[:i] + ' ' + text[i:] if i != -1 else text
        return text[:i+len(word)+1] + ' ' + text[i+len(word)+1:] if i != -1 else text
    def separe_words(self, text):
        for color in self.primary:
            text = self.separe_word(text, color)
        return text
    def replace_words(self, text):
        for k, v in self.matches.items():
            text = text.replace(k, v)
        return text
    def remove_kfix(self, text):
        for suffix in self.kfix_list:
            text = text.replace(suffix, '')
        return text
    
    def processed_text(self, text):
        splitted = super().processed_text(text).split()
        #1 transform matches
        splitted = [self.replace_words(text) for text in splitted]
        #2 suffix removal
        splitted = [self.remove_kfix(text) for text in splitted]
        #separate primary colors
        splitted = [self.separe_words(term) for term in splitted]
        return re.sub(' +', ' ', ' '.join(splitted).strip())

### Instance of text transformers

In [4]:
tt = TextTransformer()
ct = SpecificTrasformer(color_kfix_list, color_matches, primary_colors)
titlet = SpecificTrasformer(clothes_kfix_list, clothes_matches, primary_clothes)

### Clean DFs

In [5]:
zalando_prod_training = offers_training_df.loc[offers_training_df['shop'] == 'zalando']
zalando_prod_training = zalando_prod_training\
                    .loc[zalando_prod_training['offer_id'].isin(matches_training_df['zalando'])]
zalando_prod_training['brand'] = zalando_prod_training['brand']\
                    .apply(lambda x: tt.processed_text(x))
zalando_prod_training['title'] = zalando_prod_training['title']\
                    .apply(lambda x: titlet.processed_text(x))
zalando_prod_training['color'] = zalando_prod_training['color']\
                    .apply(lambda x: ct.processed_text(x))

aboutyou_prod_training = offers_training_df.loc[offers_training_df['shop'] == 'aboutyou']
aboutyou_prod_training['brand'] = aboutyou_prod_training['brand']\
                    .apply(lambda x: tt.processed_text(x))
aboutyou_prod_training['title'] = aboutyou_prod_training['title']\
                    .apply(lambda x: titlet.processed_text(x))
aboutyou_prod_training['color'] = aboutyou_prod_training['color']\
                    .apply(lambda x: ct.processed_text(x))

offer_test_clean = offers_test_df.copy()
offer_test_clean['title'] = offer_test_clean['title']\
                    .apply(lambda x: titlet.processed_text(x))
offer_test_clean['color'] = offer_test_clean['color']\
                    .apply(lambda x: ct.processed_text(x))

df_offers = pd.DataFrame(columns=['offer_id', 'shop', 'lang', 'brand', 'title', 'color'])
df_offers = df_offers.append(zalando_prod_training, ignore_index=True)
df_offers = df_offers.append(aboutyou_prod_training, ignore_index=True)

### BM25 setup

In [6]:
class TextRelevance:
    def __init__(self, text, relevance):
        self.text = text
        self.relevance = relevance
    def __repr__(self):
        return self.text + ' ' + str(self.relevance)
    def __eq__(self, other):
        return self.text == other.text
    def __hash__(self):
        return hash(self.text)

In [7]:
class BM25Z(BM25L):
    def __init__(self, corpus):
        super().__init__(self.process_list(corpus))
        self.corpus = self.process_list(corpus)
    def process_list(self, titles_list):
        return [text.split() for text in titles_list if text is not None]
    def processed_text(self, text):
        return tt.processed_text(text)
    def get_corpus_str(self):
        return [' '.join(el) for el in self.corpus]
    def get_corpus_scores(self, query):
        query = self.processed_text(query).split()
        return [TextRelevance(x,y) for x, y in zip(self.get_corpus_str(), self.get_scores(query))]
    def get_relevant_results(self, query, threshold=0):
        res = [el for el in self.get_corpus_scores(query) if el.relevance > threshold]
        res = sorted(res, key=lambda x: x.relevance, reverse=True)
        return res

### Test coverage of primary_clothes

In [8]:
#Set of all "clothes terms" mined
set_title_terms = set(primary_clothes)
#Set of all titles in DF
set_titles = set(list(zalando_prod_training['title'].values)\
                 + list(df_offers['title'].values)
                )

bm25_title = BM25Z(set_titles)

list_title_retrieved = []
for t in set_title_terms:
    list_title_retrieved += [t.text for t in bm25_title.get_relevant_results(t)]

In [9]:
set_title_retrieved = set(list_title_retrieved)
perc_belong = round(len(set_title_retrieved)/len(set_titles)*100,2)
n_product_checked = len(offer_test_clean) + len(df_offers)
print(f'{perc_belong}% of product titles belongs to bag of words mined')
print(f'{len(set_title_terms)} distinct terms are present in {round(n_product_checked * perc_belong / 100)} product titles')

93.35% of product titles belongs to bag of words mined
103 distinct terms are present in 171662 product titles


### Test coverage of primary_colors

In [10]:
#Set of all "color terms" mined
set_color_terms = set(primary_colors)
#Set of all titles in DF
set_colors = set(list(zalando_prod_training['color'].values)\
                 + list(df_offers['color'].values)
                )

bm25_color = BM25Z(set_colors)

list_color_retrieved = []
for t in set_color_terms:
    list_color_retrieved += [t.text for t in bm25_color.get_relevant_results(t)]

In [11]:
set_color_retrieved = set(list_color_retrieved)
perc_color_belong = round(len(set_color_retrieved)/len(set_colors)*100,2)
n_product_checked = len(offer_test_clean) + len(df_offers)
print(f'{perc_color_belong}% of product colors belongs to bag of words mined')
print(f'{len(set_color_terms)} distinct terms are present in {round(n_product_checked * perc_color_belong / 100)} product colors')

99.7% of product colors belongs to bag of words mined
16 distinct terms are present in 183339 product colors


### Create title categories

In [12]:
def create_category(record, bag_of_terms, attribute):
    l = []
    for term in bag_of_terms:
        if term in record[attribute]:
            c = record[attribute].count(term)
            l += [term + str(c)]
    return '-'.join(l)

In [13]:
title_categories = []
for i in df_offers.index:
    title_categories += [create_category(df_offers.loc[i], set_title_terms, 'title')]

In [14]:
title_enumeration = {}
t_i = 0
for title in set(title_categories):
    title_enumeration[title] = t_i
    t_i += 1

### Create color categories

In [15]:
color_categories = []
for i in df_offers.index:
    color_categories += [create_category(df_offers.loc[i], set_color_terms, 'color')]

In [16]:
color_enumeration = {}
c_i = 0
for title in set(color_categories):
    color_enumeration[title] = c_i
    c_i += 1

### Brand analysis

In [17]:
class Brand:
    def __init__(self, name, parent=None, child=list()):
        self.name = name
        self.parent = parent
        self.child = child
        
class BrandCollection:
    def __init__(self, brand_list):
        self.brands = {}
        self.brand_family = {}
        self.manual_matches = {}
        #brand_list = [TextTransformer(x).processed_text() for x in brand_list]
        brand_list.sort()
        for el in brand_list:
            self.process_brand(Brand(el))
    
    def process_brand(self, brand):
        if brand.name not in self.brands:
            parent_likelyhood = 0
            likely_parent = None
            for k in self.brands.keys():
                comp = self.brands[k]
                l_comp = self.listify(comp, brand)
                l_brand = self.listify(brand, comp)
                calc_likelyhood = self.parent_likelyhood(l_comp, l_brand)
                if calc_likelyhood > parent_likelyhood:
                    parent_likelyhood = calc_likelyhood
                    brand.parent = comp
                    likely_parent = self.brands[comp.name]
            self.brands[brand.name] = brand
            self.brand_family[brand] = []
            if likely_parent is not None:
                self.brand_family[likely_parent].append(brand)
    
    def listify(self, brand, to_compare):
        l_brand = brand.name.split()
        l_comp = to_compare.name.split()
        max_len = len(l_brand) if len(l_brand) > len(l_comp) else len(l_comp)
        if len(l_brand) == max_len:
            return l_brand
        for i in range(max_len - len(l_brand)):
            l_brand += ['']
        return l_brand
    
    def parent_likelyhood(self, l_comp, l_brand):
        likelyhood = 0
        #check longest matching n-gram, does not check for combinations
        for i in range(len(l_comp)):
            if l_comp[i] == l_brand[i]:
                likelyhood += 1
            else:
                return likelyhood
        return likelyhood
    
    def similarity(self, str_1, str_2):
        return textdistance.levenshtein.normalized_similarity(str_1, str_2)
    
    def get_match(self, brand_query):
        brand_to_search = Brand(tt.processed_text(brand_query))
        if brand_to_search.name in self.brands:
            selected_brand = self.brands[brand_to_search.name]
            while selected_brand.parent is not None:
                selected_brand = selected_brand.parent
            brands = self.get_brand_family(selected_brand.name)
            return brands, 1
        else:
            relevance = 0
            most_relevant = '-'
            for key in self.brands.keys():
                sim = self.similarity(brand_to_search.name, self.brands[key].name)
                if sim > relevance:
                    relevance = sim
                    most_relevant = self.brands[key].name
            brands = self.get_brand_family(most_relevant)
            return brands, relevance
            
    def get_brand_family(self, brand_name):
        fam_list = [brand_name]
        family = []
        while len(fam_list) != 0:
            current = fam_list[0]
            fam_list += [b.name for b in self.brand_family[self.brands[current]]]
            fam_list.remove(current)
            if current not in family:
                family += [current]
        if brand_name in self.manual_matches.keys():
            family += self.manual_matches[brand_name]
        return list(set(family))
    
    def add_manual_match(self, brand_a, brand_b):
        if brand_a not in self.manual_matches.keys():
            self.manual_matches[brand_a] = [brand_b]
        else:
            if brand_b not in self.manual_matches[brand_a]:
                self.manual_matches[brand_a] += [brand_b]
                
        if brand_b not in self.manual_matches.keys():
            self.manual_matches[brand_b] = [brand_a]
        else:
            if brand_a not in self.manual_matches[brand_b]:
                self.manual_matches[brand_b] += [brand_a]

In [18]:
brand_collection = BrandCollection(list(zalando_prod_training['brand']) + list(aboutyou_prod_training['brand']))

In [19]:
brand_enumeration = {}
b_i = 0
for brand in brand_collection.brands.keys():
    brand_enumeration[brand] = b_i
    b_i += 1

### Create table of categorical values 

In [20]:
df_cat_brand_title_color = pd.DataFrame(columns=['brand', 'title', 'color'])

for i in df_offers.index:
    try:
        b_e = brand_enumeration[df_offers.loc[i]['brand']]
        t_e = title_enumeration[create_category(df_offers.loc[i], set_title_terms, 'title')]
        c_e = color_enumeration[create_category(df_offers.loc[i], set_color_terms, 'color')]
        row = {'brand':b_e, 'title': t_e, 'color': c_e}
        df_cat_brand_title_color = df_cat_brand_title_color.append(row, ignore_index=True)
    except:
        pass
df_cat_brand_title_color

Unnamed: 0,brand,title,color
0,83,398,577
1,83,566,1995
2,83,366,1995
3,45,247,0
4,69,628,2321
...,...,...,...
77145,49,0,166
77146,77,146,2084
77147,38,118,1246
77148,103,472,1712


### Save Table of Categorical encode for Brand-Title-Color

In [21]:
df_cat_brand_title_color.to_parquet('brand_title_color.parquet')

### $\chi^2$ for testing independance Brand-Title

In [22]:
# chi-squared test with similar proportions
from scipy.stats import chi2_contingency
from scipy.stats import chi2

#sample
#df_cat_brand_title_color = df_cat_brand_title_color.sample(1000)

# contingency tables
cont_table_title_color = pd.crosstab(df_cat_brand_title_color['title'], df_cat_brand_title_color['color'])
cont_table_brand_color = pd.crosstab(df_cat_brand_title_color['brand'], df_cat_brand_title_color['color'])
cont_table_title_brand = pd.crosstab(df_cat_brand_title_color['title'], df_cat_brand_title_color['brand'])

In [23]:
print('################ Title and color Dependency test ################')
stat_tc, p_tc, dof_tc, expected_tc = chi2_contingency(cont_table_title_color)
print('dof=%d' % dof_tc)
#print(expected)
# interpret test-statistic
prob = 0.95
critical_tc = chi2.ppf(prob, dof_tc)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical_tc, stat_tc))
if abs(stat_tc) >= critical_tc:
    print('Title and Color are Dependent (reject H0)')
else:
    print('Title and Color are Independent (fail to reject H0)')
# interpret p-value
alpha = 1.0 - prob
print('significance=%.3f, p=%.3f' % (alpha, p_tc))
if p_tc <= alpha:
    print('Title and Color are Dependent (reject H0)')
else:
    print('Title and Color are Independent (fail to reject H0)')
print('#################################################################')

################ Title and color Dependency test ################
dof=1899018
probability=0.950, critical=1902224.718, stat=1234336.960
Title and Color are Independent (fail to reject H0)
significance=0.050, p=1.000
Title and Color are Independent (fail to reject H0)
#################################################################


In [24]:
print('################ Brand and color Dependency test ################')
stat_bc, p_bc, dof_bc, expected_bc = chi2_contingency(cont_table_brand_color)
print('dof=%d' % dof_bc)
#print(expected)
# interpret test-statistic
prob = 0.95
critical_bc = chi2.ppf(prob, dof_bc)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical_bc, stat_bc))
if abs(stat_bc) >= critical_bc:
    print('Brand and Color are Dependent (reject H0)')
else:
    print('Brand and Color are Independent (fail to reject H0)')
# interpret p-value
alpha = 1.0 - prob
print('significance=%.3f, p=%.3f' % (alpha, p_bc))
if p_bc <= alpha:
    print('Brand and Color are Dependent (reject H0)')
else:
    print('Brand and Color are Independent (fail to reject H0)')
print('#################################################################')

################ Brand and color Dependency test ################
dof=267714
probability=0.950, critical=268918.724, stat=481416.185
Brand and Color are Dependent (reject H0)
significance=0.050, p=0.000
Brand and Color are Dependent (reject H0)
#################################################################


In [25]:
print('################ Title and brand Dependency test ################')
stat_tb, p_tb, dof_tb, expected_tb = chi2_contingency(cont_table_title_brand)
print('dof=%d' % dof_tb)
#print(expected)
# interpret test-statistic
prob = 0.95
critical_tb = chi2.ppf(prob, dof_tb)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical_tb, stat_tb))
if abs(stat_tb) >= critical_tb:
    print('Title and Brand are Dependent (reject H0)')
else:
    print('Title and Brand are Independent (fail to reject H0)')
# interpret p-value
alpha = 1.0 - prob
print('significance=%.3f, p=%.3f' % (alpha, p_tb))
if p_tb <= alpha:
    print('Title and Brand are Dependent (reject H0)')
else:
    print('Title and Brand are Independent (fail to reject H0)')
print('#################################################################')

################ Title and brand Dependency test ################
dof=81213
probability=0.950, critical=81877.046, stat=577109.109
Title and Brand are Dependent (reject H0)
significance=0.050, p=0.000
Title and Brand are Dependent (reject H0)
#################################################################


In [26]:
# calculate the Pearson's correlation between brand and color
# Pearson's correlation identifies liner relationship
from scipy.stats import pearsonr
corr, _ = pearsonr(df_cat_brand_title_color['brand'], df_cat_brand_title_color['color'])
print('Brand-Color Pearsons correlation: %.3f' % corr)

# calculate the Pearson's correlation between brand and title
corr, _ = pearsonr(df_cat_brand_title_color['brand'], df_cat_brand_title_color['title'])
print('Brand-Title Pearsons correlation: %.3f' % corr)

Brand-Color Pearsons correlation: -0.001
Brand-Title Pearsons correlation: 0.041


In [27]:
# calculate the Spearmans's correlation between brand and color
# Spearmans's correlation identifies non-liner relationship
from scipy.stats import spearmanr
corr, _ = spearmanr(df_cat_brand_title_color['brand'], df_cat_brand_title_color['color'])
print('Brand-Color Spearmans correlation: %.3f' % corr)

# calculate the Spearmans's correlation between brand and title
corr, _ = spearmanr(df_cat_brand_title_color['brand'], df_cat_brand_title_color['title'])
print('Brand-Title Spearmans correlation: %.3f' % corr)

Brand-Color Spearmans correlation: 0.004
Brand-Title Spearmans correlation: 0.041
