In [9]:
import pandas as pd
import numpy as np
import nltk
import re
from bs4 import BeautifulSoup

In [10]:
data = pd.read_csv("../../raw_train_test.csv").fillna('')

What percentage of the product descriptions are encoded in HTML?

In [11]:
records_with_html = data[data['product_description'].str.contains(r'<.*></.*>')]
print(
    "%d records of %d, or %0.2f%%, contain HTML" % (
        records_with_html.shape[0],
        data.shape[0],
        100*records_with_html.shape[0]/data.shape[0]
    )
)

1186 records of 32671, or 3.63%, contain HTML


Lets get a sense of what the HTML records look like:

In [12]:
html_sample = records_with_html['product_description'].sample(n=40)
html_sample

21187    <ul>\n\t\t<li>\n\t\t\tEnglish \n\t\t\t\t</li>\...
17099    <ul>\n\t\t<li>\n\t\t\tEnglish \n\t\t\t\t</li>\...
13513    <ul>\n\t\t<li>\n\t\t\tEnglish \n\t\t\t\t</li>\...
11855    eBay item number:371093132910\n\n\n\tSeller as...
31880    eBay item number:271839111115\n\n\n\tSeller as...
32341    <ul>\n\t\t<li>\n\t\t\tEnglish \n\t\t\t\t</li>\...
30243    <ul>\n\t\t<li>\n\t\t\tEnglish \n\t\t\t\t</li>\...
19346    eBay item number:111453555354\n\n\n\tSeller as...
17818    <ul>\n\t\t<li>\n\t\t\tEnglish \n\t\t\t\t</li>\...
25606    eBay item number:351093533969\n\n\n\tSeller as...
26049    eBay item number:291425349623\n\n\n\tSeller as...
22805    <ul>\n\t\t<li>\n\t\t\tEnglish \n\t\t\t\t</li>\...
28776    eBay item number:381125774374\n\n\n\tSeller as...
23198    eBay item number:121628713307\n\n\n\tSeller as...
31102    eBay item number:181675692099\n\n\n\tSeller as...
29286    eBay item number:201334732759\n\n\n\tSeller as...
17175    <ul>\n\t\t<li>\n\t\t\tEnglish \n\t\t\t\t</li>\.

The product descriptions containing HTML seem to come in two distinct flavors:
    - Those beginning with an eBay item number
    - Those beginning with "<ul>\n\t\t<li>\n\t\t\tEnglish"
How many are formatted this way?

In [13]:
begin_with_ebay_string = records_with_html[
    records_with_html['product_description'].str.contains(
        "^eBay item number"
    )
]
print("%d records of %d, or %0.2f%% contain the string" % (
        begin_with_ebay_string.shape[0],
        records_with_html.shape[0],
        100*begin_with_ebay_string.shape[0]/records_with_html.shape[0]
    )
)

570 records of 1186, or 48.06% contain the string


In [14]:
begin_with_html_string = records_with_html[
    records_with_html['product_description'].str.contains(
        "^<ul>\n\t\t<li>\n\t\t\tEnglish"
    )
]
print("%d records of %d, or %0.2f%% contain the string" % (
        begin_with_html_string.shape[0],
        records_with_html.shape[0],
        100*begin_with_html_string.shape[0]/records_with_html.shape[0]
    )
)

587 records of 1186, or 49.49% contain the string


Lets take the first 20 characters of each description, and see how many distinct groups there are:

In [15]:
first20 = records_with_html['product_description'].apply(lambda row: row[:16]).unique()
for descr in first20:
    print(repr(descr))

'<table>\n\t<tr>\n\t\t'
'eBay item number'
'International Bu'
'<ul>\n\t\t<li>\n\t\t\tE'
'The Nintendo Wii'
'2. International'
'front seat featu'
'100% BRAND NEW H'
'Brand<td>Kitchen'
'Original Studio '
'<p><p></p>\n<p></'
'<p></p>'
'Ancona Modular W'
'<ul>\n     <li>Al'
'Backup data on a'
'<p>\n</p>\n\n<td>\n<'
'<li>YOU ARE BIDD'
'<table>\n  <tbody'
'Originally relea'
'&ldquo;Refurbish'
'<strong>This dur'


Out of 1186 HTML-containing records, (587 + 570) = 1157 records begin with the same strings. If the records within each category are structured similarly, we can programmatically extract the useful data from those records. Lets examine the structure of the records starting with "eBay item number"

In [16]:
ebay1, ebay2 = begin_with_ebay_string['product_description'].sample(2)
ebay1

"eBay item number:261765583333\n\n\n\tSeller assumes all responsibility for this listing.\n\t\n\t\tLast updated on\n\t\t&nbsp;May 04, 2015 07:32:40 PDT&nbsp;\n\t\tView all revisions\n\t\t\n\t\n\n\t\t\n\t\t\t\n\t\t\t\t\t<strong>Item specifics</strong>\n\t\t\t\t\t<table>\n\t\t\t\t\t\t\n\t\t\t\t\t\t<tr>\n\t\t\t\t\t\t\t\t\t<td>\n\t\t\t\t\t\t\t\t\t \t\t\tCondition:</td>\n\t\t\t\t\t\t\t\t\t\t\t \n\t\t\t\t\t\t\t\t\t\t\t<td>\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tNew: A brand-new, unused, unopened, undamaged item (including handmade items). See the seller's listing \n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tfor full details.\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tSee all condition definitions<strong>- opens in a new window or tab</strong>\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\

In [17]:
ebay2

"eBay item number:161352947813\n\n\n\tSeller assumes all responsibility for this listing.\n\t\n\t\tLast updated on\n\t\t&nbsp;Apr 16, 2015 23:26:28 PDT&nbsp;\n\t\tView all revisions\n\t\t\n\t\n\n\t\t\n\t\t\t\n\t\t\t\t\t<strong>Item specifics</strong>\n\t\t\t\t\t<table>\n\t\t\t\t\t\t\n\t\t\t\t\t\t<tr>\n\t\t\t\t\t\t\t\t\t<td>\n\t\t\t\t\t\t\t\t\t \t\t\tCondition:</td>\n\t\t\t\t\t\t\t\t\t\t\t \n\t\t\t\t\t\t\t\t\t\t\t<td>\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tNew: A brand-new, unused, unopened, undamaged item in its original packaging (where packaging is \n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tapplicable). Packaging should be the same as what is found in a retail store, unless the item is handmade or was packaged by the manufacturer in non-retail packaging, such as an unprinted box or plastic bag. See the s

Now lets look at the records beginning with `"<ul>\n\t\t<li>\n\t\t\tEnglish"`, presumably the results of some translation service.

In [18]:
html1, html2 = begin_with_html_string['product_description'].sample(2)
html1

"<ul>\n\t\t<li>\n\t\t\tEnglish \n\t\t\t\t</li>\n    \t<li>\n    \t\t    \n    \t</li>\n    \t<li>\n    \t\t\t \n    \t\t\t \n    \t\t</li>\n    \t</ul>\n\n    \n\t\t\tThis translation tool is for your convenience only. The accuracy and accessibility of the resulting translation is not guaranteed.\n\t\n\t\n\n\n\n\t\t\n\t\t\n\t\t\t\t\t\t\t<ul>\n\t\t\t\t\t\t\t\t<li>\n\t\t\t\t\t\t\t\t\t\tEnglishEnglish\n\t\t\t\t\t\t\t\t\t\t\t</li>\n\t\t\t\t\t\t\t\t<li>\n\t\t\t\t\t\t\tالعربيةArabic\n\t\t\t\t\t\t\t\t</li>\n\t\t\t\t\t<li>\n\t\t\t\t\t\t\t中文（简体）Chinese (Simplified)\n\t\t\t\t\t\t\t\t</li>\n\t\t\t\t\t<li>\n\t\t\t\t\t\t\t中文（繁体）Chinese (Traditional)\n\t\t\t\t\t\t\t\t</li>\n\t\t\t\t\t<li>\n\t\t\t\t\t\t\tČeskyCzech\n\t\t\t\t\t\t\t\t</li>\n\t\t\t\t\t<li>\n\t\t\t\t\t\t\tNederlandsDutch\n\t\t\t\t\t\t\t\t</li>\n\t\t\t\t\t<li>\n\t\t\t\t\t\t\tSuomiFinnish\n\t\t\t\t\t\t\t\t</li>\n\t\t\t\t\t<li>\n\t\t\t\t\t\t\tΕλληνικάGreek\n\t\t\t\t\t\t\t\t</li>\n\t\t\t\t\t<li>\n\t\t\t\t\t\t\tעבריתHebrew\n\t\t\t\t\t\t\t\t</

In [19]:
html2

'<ul>\n\t\t<li>\n\t\t\tEnglish \n\t\t\t\t</li>\n    \t<li>\n    \t\t    \n    \t</li>\n    \t<li>\n    \t\t\t \n    \t\t\t \n    \t\t</li>\n    \t</ul>\n\n    \n\t\t\tThis translation tool is for your convenience only. The accuracy and accessibility of the resulting translation is not guaranteed.\n\t\n\t\n\n\n\n\t\t\n\t\t\n\t\t\t\t\t\t\t<ul>\n\t\t\t\t\t\t\t\t<li>\n\t\t\t\t\t\t\t\t\t\tEnglishEnglish\n\t\t\t\t\t\t\t\t\t\t\t</li>\n\t\t\t\t\t\t\t\t<li>\n\t\t\t\t\t\t\tالعربيةArabic\n\t\t\t\t\t\t\t\t</li>\n\t\t\t\t\t<li>\n\t\t\t\t\t\t\t中文（简体）Chinese (Simplified)\n\t\t\t\t\t\t\t\t</li>\n\t\t\t\t\t<li>\n\t\t\t\t\t\t\t中文（繁体）Chinese (Traditional)\n\t\t\t\t\t\t\t\t</li>\n\t\t\t\t\t<li>\n\t\t\t\t\t\t\tČeskyCzech\n\t\t\t\t\t\t\t\t</li>\n\t\t\t\t\t<li>\n\t\t\t\t\t\t\tNederlandsDutch\n\t\t\t\t\t\t\t\t</li>\n\t\t\t\t\t<li>\n\t\t\t\t\t\t\tSuomiFinnish\n\t\t\t\t\t\t\t\t</li>\n\t\t\t\t\t<li>\n\t\t\t\t\t\t\tΕλληνικάGreek\n\t\t\t\t\t\t\t\t</li>\n\t\t\t\t\t<li>\n\t\t\t\t\t\t\tעבריתHebrew\n\t\t\t\t\t\t\t\t</

It turns out that both the records beginning with "eBay item number" and the translation information are structured largely similarly: the relevant product info begins after the term "Item Specifics", which is enclosed in a `<strong>` tag. Lets find out how many of all the HTML-encoded records are structured this way:

In [20]:
records_with_html[
    records_with_html['product_description'].str.contains(
        r"^(.|\n)*Item specifics</strong>"
    )].shape[0], records_with_html.shape[0]



(1154, 1186)

It seems then that we have product descriptions falling into three categories:
    - Category A: Those encoded in HTML and containing the string 
      "Item Specifics", after which lies the relevant product information.
    - Category B: Those encoded in HTML that don't contain that string.
    - Category C: Those not encoded in HTML.

We can parse all three together by:
    - Passing through BeautifulSoup's get_text() function (it'll leave 
      non-HTML unalterred.
    - Removing the regex "^(.|\n)*Item specifics" from strings that have 
      it. It'll leave others unalterred.
    - Replacing newlines tabs etc with whitespace, stripping 
      any extra whitespace.

Lets get a sample from each category:

In [21]:
category_a = records_with_html[
    records_with_html['product_description'].str.contains(
        r"^(.|\n)*Item specifics</strong>")]
category_b = records_with_html[
    ~records_with_html['product_description'].str.contains(
        r"^(.|\n)*Item specifics</strong>")]
category_c = data[~data['product_description'].str.contains(r'<.*></.*>')]
print("Category A: %d records" % category_a.shape[0])
print("Category B: %d records" % category_b.shape[0])
print("Category C: %d records" % category_c.shape[0])

Category A: 1154 records
Category B: 32 records
Category C: 31485 records


In [29]:
cat_a_sample = category_a['product_description'].sample(1).values[0]
cat_b_sample = category_b['product_description'].sample(1).values[0]
cat_c_sample = category_c['product_description'].sample(1).values[0]

In [23]:
def preprocess_text(description):
    without_html_tags = BeautifulSoup(description).get_text()
    without_headers = re.sub(r"^(.|\n)*Item specifics","",without_html_tags)
    without_symbols = re.sub(r"[^a-zA-Z0-9]"," ",without_headers)
    without_extra_spaces = " ".join(without_symbols.split())
    return without_extra_spaces

In [33]:
def faster_preprocess(description):
    no_html = BeautifulSoup(description).get_text()
    no_symbols = re.sub(r"(^(.|\n)*Item specifics)|([^a-zA-Z0-9])"," ",no_html)
    no_extra_spaces = " ".join(no_symbols.split())
    return no_extra_spaces

In [39]:
preprocess_text(cat_a_sample)

'Condition New A brand new unused unopened undamaged item in its original packaging where packaging is applicable Packaging should be the same as what is found in a retail store unless the item is handmade or was packaged by the manufacturer in non retail packaging such as an unprinted box or plastic bag See the seller s listing for full details See all condition definitions opens in a new window or tab Read moreabout the condition Room Living Room Dining Room Style Antique Features Analog Round SKU S041865300009 Material Metal Size L Color Wood 007estorm 007estorm 16583 98 7'

In [40]:
preprocess_text(cat_b_sample)

'Backup data on a faster hard disk with a monstrous capacity Seagate Barracuda 2 TB internal hard drive The Seagate internal hard drive has a 7 200 rpm rotational speed which boosts reading from and writing to the hard drive This 2 TB Seagate hard drive boots your OS faster and loads applications much quicker The Seagate Barracuda 2 TB internal hard drive s 64 MB cache memory and faster average seek time make for reduced power consumption and generating lesser noise and vibrations Connecting via a SATA 2 or SATA 3 standard port this Seagate internal hard drive provides a very high data transfer rate The Seagate hard drive can be installed in any Windows based desktops Product FeaturesScreaming fast with plenty of room to growThe Barracuda XT hard drive from Seagate offers customers a compelling storage choice for high capacity high performance computing solutions The Barracuda XT drive is built on a full speed 7200 rpm 4 disk platform Further turbocharged with a huge 64 MB cache and th

In [41]:
preprocess_text(cat_c_sample)

'This Stockton Blue Rescued Denim Rug is constructed from handwoven rescued off white and blue denim This contemporary styled wool rug adds an elegant touch to your home decor This rug is not recommended for high traffic areas'

In [45]:
(faster_preprocess(cat_a_sample) == preprocess_text(cat_a_sample),
faster_preprocess(cat_b_sample) == preprocess_text(cat_b_sample),
faster_preprocess(cat_c_sample) == preprocess_text(cat_c_sample))

(True, True, True)

In [27]:
def tokenize(text):
    processed_text = preprocess_text(text)
    tokens = nltk.word_tokenize(processed_text)
    lemmatizer = nltk.WordNetLemmatizer()
    stemmer = nltk.PorterStemmer()
    return [lemmatizer.lemmatize(stemmer.stem(t.lower())) for t in tokens]

def token_text(text):
    return " ".join(tokenize(text))

In [28]:
cat_c_sample

''