In [1]:
import os
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import csv

# Shared Porter stemmer instance; the query-normalization cell below calls
# stemmer.stem() on every query token.
import nltk
stemmer = nltk.stem.PorterStemmer()

# Category taxonomy (XML); parsed below to map each category id to its parent id.
categories_file_name = '/workspace/datasets/product_data/categories/categories_0001_abcat0010000_to_pcmcat99300050000.xml'

# Training queries (CSV, read with pandas) and the destination of the labeled output file.
queries_file_name = '/workspace/datasets/train.csv'
output_file_name = '/workspace/datasets/labeled_query_data.txt'

# Threshold used by the roll-up loop: categories with fewer queries are moved to their parent.
min_queries = 1000

In [2]:
# The root category, named Best Buy with id cat00000, doesn't have a parent.
root_category_id = 'cat00000'

tree = ET.parse(categories_file_name)
root = tree.getroot()

# Parse the category XML file to map each category id to its parent category id in a dataframe.
# The last id in a category's <path> is treated as the category itself and the one before
# it as its parent; the root is skipped because it has no parent entry.
categories = []
parents = []
for child in root:
    cat_path_ids = [cat.find('id').text for cat in child.find('path')]
    leaf_id = cat_path_ids[-1]
    if leaf_id != root_category_id:
        categories.append(leaf_id)
        parents.append(cat_path_ids[-2])
parents_df = pd.DataFrame({'category': categories, 'parent': parents})

# Read the training data into pandas, only keeping queries with non-root categories in our category tree.
# `usecols` avoids loading columns that would be discarded immediately; the explicit
# column selection afterwards pins the column order.
df = pd.read_csv(queries_file_name, usecols=['category', 'query'])[['category', 'query']]
# `.copy()` makes the filtered frame independent of the raw one, so later cells can add
# columns to it without pandas raising SettingWithCopyWarning.
df = df[df['category'].isin(categories)].copy()

In [3]:
# Preview the first rows of the training queries that survived the category filter.
df.head()

Unnamed: 0,category,query
0,abcat0101001,Televisiones Panasonic 50 pulgadas
1,abcat0101001,Sharp
2,pcmcat193100050014,nook
3,abcat0101001,rca
4,abcat0101005,rca


In [4]:
# Preview the category -> parent mapping built from the taxonomy XML.
parents_df.head()

Unnamed: 0,category,parent
0,abcat0010000,cat00000
1,abcat0011000,abcat0010000
2,abcat0011001,abcat0011000
3,abcat0011002,abcat0011000
4,abcat0011003,abcat0011000


In [5]:
# Normalize queries: lowercase, split on whitespace, Porter-stem every token, then
# re-join the stems into a single string.
# `assign` returns a new frame instead of writing columns into `df` in place, so this
# cell does not trigger SettingWithCopyWarning when `df` is a filtered slice. The keyword
# arguments are evaluated in order, so each lambda sees the columns produced before it.
# NOTE(review): the output cell writes the lowercased `query` column, not `query_stemmed`;
# confirm whether the stemmed text was meant to be the training text.
df = df.assign(
    query=lambda frame: frame["query"].str.lower(),
    query_tokens=lambda frame: frame["query"].str.split(),
    query_stemmed_tokens=lambda frame: frame["query_tokens"].apply(
        lambda tokens: [stemmer.stem(token) for token in tokens]
    ),
    query_stemmed=lambda frame: frame["query_stemmed_tokens"].str.join(' '),
)

In [6]:
# Inspect the lowercased queries next to their token lists and stemmed forms.
df.head()

Unnamed: 0,category,query,query_tokens,query_stemmed_tokens,query_stemmed
0,abcat0101001,televisiones panasonic 50 pulgadas,"[televisiones, panasonic, 50, pulgadas]","[television, panason, 50, pulgada]",television panason 50 pulgada
1,abcat0101001,sharp,[sharp],[sharp],sharp
2,pcmcat193100050014,nook,[nook],[nook],nook
3,abcat0101001,rca,[rca],[rca],rca
4,abcat0101005,rca,[rca],[rca],rca


In [7]:
# Number of queries per category before any roll-up (value_counts sorts most frequent first).
df.category.value_counts()

cat02015              177638
abcat0101001           80213
pcmcat247400050000     79245
pcmcat209000050008     74258
pcmcat144700050004     43991
                       ...  
pcmcat230600050054         1
pcmcat230600050036         1
pcmcat221400050012         1
pcmcat254000050002         1
pcmcat221400050013         1
Name: category, Length: 1486, dtype: int64

In [8]:
# Roll up categories to ancestors to satisfy the minimum number of queries per category.
# First step: count how many queries are currently assigned to each category
# (one row per category, with the total in a `count` column).
df_with_counts = df.groupby('category').size().reset_index(name="count")

In [9]:
# Attach each query row's category count and parent category id.
# Both joins are left joins on `category`, so every query row is kept.
df_merged = df.merge(df_with_counts, how="left", on="category").merge(parents_df, how="left", on="category")

In [10]:
# Row count before the roll-up; baseline to compare against the count after the loop.
len(df_merged)

1854998

In [11]:
# Each query row now carries its category's query count and that category's parent id.
df_merged.head()

Unnamed: 0,category,query,query_tokens,query_stemmed_tokens,query_stemmed,count,parent
0,abcat0101001,televisiones panasonic 50 pulgadas,"[televisiones, panasonic, 50, pulgadas]","[television, panason, 50, pulgada]",television panason 50 pulgada,80213,abcat0101000
1,abcat0101001,sharp,[sharp],[sharp],sharp,80213,abcat0101000
2,pcmcat193100050014,nook,[nook],[nook],nook,13826,pcmcat223300050025
3,abcat0101001,rca,[rca],[rca],rca,80213,abcat0101000
4,abcat0101005,rca,[rca],[rca],rca,1042,abcat0101000


In [12]:
# Roll categories up until none of the remaining ones has fewer than `min_queries` queries.
# Each pass reassigns every row of an under-populated category to that category's parent,
# then recomputes the per-category counts and parents for the new assignment.
needs_rollup = df_merged['count'] < min_queries
while needs_rollup.any():
    # Only the flagged rows are read from `parent`, so no whole-column alignment is needed.
    df_merged.loc[needs_rollup, 'category'] = df_merged.loc[needs_rollup, 'parent']
    # Rows whose new category is not in `categories` are dropped here. The root id is
    # excluded from that list, so queries rolled all the way up to the root are removed,
    # which is why the row count shrinks across the loop.
    df = df_merged.loc[df_merged['category'].isin(categories), ['category', 'query']]
    df_with_counts = df.groupby('category').size().reset_index(name='count')
    df_merged = (
        df.merge(df_with_counts, how='left', on='category')
          .merge(parents_df, how='left', on='category')
    )
    # Re-evaluate the threshold against the freshly computed counts.
    needs_rollup = df_merged['count'] < min_queries

In [13]:
# Row count after the roll-up. It is lower than before because the loop filters out rows
# whose rolled-up category is not in `categories` (the root id is excluded from that list).
len(df_merged)

1850373

In [14]:
# Keep only rows whose category is in the taxonomy. Filtering first and taking an explicit
# copy lets the column assignments below write to an independent frame instead of a slice,
# which avoids SettingWithCopyWarning.
df = df[df['category'].isin(categories)].copy()

# Create labels in fastText format.
df['label'] = '__label__' + df['category']

# One training example per line: the label, a space, then the query text.
df['output'] = df['label'] + ' ' + df['query']

# Only the single `output` column is written, so `sep` never appears between fields; it
# only determines which character the writer treats as a delimiter needing escaping.
# NOTE(review): with QUOTE_NONE, a '|' inside a query is written preceded by the escape
# character (a backslash). Confirm this is acceptable for the fastText input.
df[['output']].to_csv(output_file_name, header=False, sep='|', escapechar='\\', quoting=csv.QUOTE_NONE, index=False)