In [1]:
import pandas as pd
import numpy as np

# Notebook display configuration: do not cap the number of columns shown and
# allow up to 160 characters per cell before pandas truncates the text.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 160)

# Record the pandas version used for this run. The single-argument,
# parenthesised print form behaves identically on Python 2 and Python 3.
print(pd.__version__)

0.17.1


In [2]:
# Read the raw attribute table; the 'value' column is forced to object dtype
# instead of letting pandas infer a type for it.
df = pd.read_csv("./attributes.csv", dtype={'value': object})

# Keep only the rows that carry an attribute name.
has_name = df.name.notnull()
df = df[has_name]

# Peek at everything recorded for one sample product.
sample_uid = 100002
df[df.product_uid == sample_uid]

Unnamed: 0,product_uid,name,value
15,100002,Application Method,"Brush,Roller,Spray"
16,100002,Assembled Depth (in.),6.63 in
17,100002,Assembled Height (in.),7.76 in
18,100002,Assembled Width (in.),6.63 in
19,100002,Bullet01,"Revives wood and composite decks, railings, porches and boat docks, also great for concrete pool decks, patios and sidewalks"
20,100002,Bullet02,100% acrylic solid color coating
21,100002,Bullet03,Resists cracking and peeling and conceals splinters and cracks up to 1/4 in.
22,100002,Bullet04,"Provides a durable, mildew resistant finish"
23,100002,Bullet05,Covers up to 75 sq. ft. in 2 coats per gallon
24,100002,Bullet06,"Creates a textured, slip-resistant finish"


In [3]:
# Quick sanity check of the loaded table: summary statistics for the numeric
# column(s), the column names, and the dtype of each column.
# Single-argument, parenthesised print works the same on Python 2 and Python 3.
print(df.describe())
print(df.columns.values)
print(df.dtypes)

          product_uid
count  2044648.000000
mean    170495.654144
std      36374.522113
min     100001.000000
25%     140463.750000
50%     177406.000000
75%     200558.000000
max     224428.000000
['product_uid' 'name' 'value']
product_uid    float64
name            object
value           object
dtype: object


In [4]:
# Many of the values are true/false/yes/no properties that we should transform into the name of the value. For example,
# one yes/no value is for the attribute "Zippered Top", let's change the value to "ZipperedTop" or "NonZipperedTop"

# Lower-case the values once and reuse the result for both comparisons.
value_lower = df.value.str.lower()
is_yes = value_lower == 'yes'
is_no = value_lower == 'no'

# An attribute is treated as binary when at least one of its rows holds a yes/no value.
binary_attrs = df[is_yes | is_no].name.unique()
print(np.sort(binary_attrs))

# Attribute name with the blanks removed, e.g. "Zippered Top" -> "ZipperedTop".
compact_name = df.name.str.replace(' ', '')

# Every yes/no row belongs to a binary attribute by construction, so the
# yes/no masks alone select the rows to rewrite.
idx = is_yes
df.loc[idx, 'value'] = compact_name[idx]

idx = is_no
df.loc[idx, 'value'] = 'Non' + compact_name[idx]

df[df.name.isin(binary_attrs)].head()

['15 Gauge Finish Nailer Included' '16 Gauge Finish Nailer Included'
 '18 Gauge Finish Stapler Included' ..., 'Yaw Adjustment' 'Zippered Top'
 'Zone-specific Sounds']


Unnamed: 0,product_uid,name,value
32,100002,Concrete Use,ConcreteUse
35,100002,Deck Use,DeckUse
38,100002,Mildew Resistant,MildewResistant
44,100002,Sealer,NonSealer
46,100002,Tintable,NonTintable


In [5]:
# If the name begins with 'number of', put the object in the value. Ex) "2" -> "2 Panels"
# Note: this cell is not idempotent -- running it a second time appends the suffix again.

# Standardize the identifying phrase for count attributes: a leading '# of'
# (matched case-insensitively) becomes 'Number of'. Only the prefix is swapped,
# so the rows selected by the mask are exactly the rows that get rewritten.
hash_prefix = '# of'
uses_hash = df.name.str.lower().str.startswith(hash_prefix)
df.loc[uses_hash, 'name'] = 'Number of' + df.name[uses_hash].str[len(hash_prefix):]

# Find all the unique count attributes
count_prefix = 'number of'
count_attrs = df[df.name.str.lower().str.startswith(count_prefix)].name.unique()
print(np.sort(count_attrs)[:10])

# Append the attribute name (without the 'Number of' prefix) to the value.
# The prefix is removed by position so that any capitalisation is handled, and
# the remainder is stripped so value and suffix are separated by a single blank.
idx = df.name.isin(count_attrs)
counted_thing = df.name[idx].str[len(count_prefix):].str.strip()
df.loc[idx, 'value'] = df.value[idx] + ' ' + counted_thing
df[idx].head()

# We could make further modifications to change values of "0" or "None" to "No" so it is closer to what a search term may
# be. Ex) "0 Bulbs Required" -> "No Bulbs Required"

['Number of Activities' 'Number of Adjustable Correction Levels'
 'Number of Adjusting Holes' 'Number of Anchor Points'
 'Number of Attachments' 'Number of BNC' 'Number of Balanced Audio Inputs'
 'Number of Balanced Audio Outputs' 'Number of Batteries Included'
 'Number of Batteries Required']


Unnamed: 0,product_uid,name,value
10,100001,Number of Pieces,1 Pieces
95,100004,Number of Panels,4 Panels
123,100005,Number of Faucet Handles,Single Handle Faucet Handles
124,100005,Number of showerheads,1 showerheads
125,100005,Number of Spray Settings,1 Spray Settings


In [6]:
# Many of the values are dimensional properties (len, width, etc.) that we can prefix with the measure type
# Ex) "8" -> "A/C Coverage Area (sq. ft.) 8"
# Note: this cell is not idempotent -- running it a second time prefixes the name again.

# We'll want to exclude the count attributes that start with "Number of"
numof = df.name.str.lower().str.startswith('number of')

# Regular-expression fragments that identify a measurement attribute by name.
# Raw strings are used where a literal '(' has to be escaped for the regex engine.
measure_keywords = [
    'depth',
    'height',
    'length',
    'width',
    'thickness',
    'diameter',
    'temperature',
    r'area \(',
    'area covered',
    'size',
    r'opening \(',
]

# One case-insensitive search for "any of the keywords" replaces a separate
# pass over the name column per keyword.
measure_pattern = '|'.join(measure_keywords)
is_measure = ~numof & df.name.str.contains(measure_pattern, case=False)

# Sorted array of the distinct measurement attribute names.
measure_attrs = np.array(sorted(df[is_measure].name.unique()))
print(measure_attrs)

# Prepend the name to the value
idx = df.name.isin(measure_attrs)
df.loc[idx, 'value'] = df.name[idx] + ' ' + df.value[idx]
df[idx].head()

['A/C Coverage Area (sq. ft.)' 'Actual Blind/Shade Width (In.)'
 'Actual Color Temperature (K)' ..., 'Wreath Diameter (In.)' 'Wrench Size'
 'Wrench length (in.)']


Unnamed: 0,product_uid,name,value
11,100001,Product Depth (in.),Product Depth (in.) 1.5
12,100001,Product Height (in.),Product Height (in.) 3
14,100001,Product Width (in.),Product Width (in.) 3
16,100002,Assembled Depth (in.),Assembled Depth (in.) 6.63 in
17,100002,Assembled Height (in.),Assembled Height (in.) 7.76 in


In [7]:
# Combine all the attributes from the same product into a single value
def _join_values(values):
    """Return the non-null entries of ``values`` as one space-separated string.

    Null entries are dropped first; str(NaN) would otherwise put the literal
    token "nan" into the combined text.
    """
    return ' '.join(map(str, values.dropna()))


# transform() broadcasts the joined string to every row of the product, so the
# frame is then reduced to one row per (product_uid, attributes) pair.
df['attributes'] = df.groupby('product_uid')['value'].transform(_join_values)
df = df[['product_uid', 'attributes']].drop_duplicates()
df.head()

Unnamed: 0,product_uid,attributes
0,100001,Versatile connector for various 90° connections and home repair projects Stronger than angled nailing or screw fastening alone Help ensure joints are consis...
15,100002,"Brush,Roller,Spray Assembled Depth (in.) 6.63 in Assembled Height (in.) 7.76 in Assembled Width (in.) 6.63 in Revives wood and composite decks, railings, po..."
50,100003,"Built-inflange Slightly narrower for tighter spaces Designed with an 18 in. apron Durable high-gloss finish provides a smooth, shiny surface that is easy to..."
82,100004,8.56 Positive power tolerance (0 to +5-Watt) Anti-reflective and anti-soiling surface reduces power loss from dirt and dust Outstanding performance in low-l...
107,100005,"Combo Tub and Shower NonBuilt-inWaterFilter Includes the trim kit only, the rough-in kit (R10000-UNBX) is sold separately Includes the handle Maintains a ba..."


In [8]:
# The attributes are now concatenated per product id; load the product
# descriptions as well so the two can be joined.
descriptions_path = 'product_descriptions.csv'
descdf = pd.read_csv(descriptions_path)
descdf.head()

Unnamed: 0,product_uid,product_description
0,100001,"Not only do angles make joints stronger, they also provide more consistent, straight corners. Simpson Strong-Tie offers a wide variety of angles in various ..."
1,100002,"BEHR Premium Textured DECKOVER is an innovative solid color coating. It will bring your old, weathered wood or concrete back to life. The advanced 100% acry..."
2,100003,"Classic architecture meets contemporary design in the Ensemble Curve series, made of solid Vikrell material, blending sleek, clean lines with gentle curves...."
3,100004,The Grape Solar 265-Watt Polycrystalline PV Solar Panel bonus pack bundles 4 Grape Solar 265-Watt solar panels for extra savings. The Grape Solar 265-Watt P...
4,100005,"Update your bathroom with the Delta Vero Single-Handle Shower Faucet Trim Kit in Chrome. It has a sleek, modern and minimalistic aesthetic. The MultiChoice ..."


In [9]:
# Build one text column per product holding the concatenated attributes
# followed by the product description.
# NOTE(review): this is an inner join, so a product that appears in only one of
# the two tables is dropped -- confirm that is the intended behaviour.
df = df.merge(descdf, on='product_uid', how='inner')
df['document'] = df['attributes'] + ' ' + df['product_description']

# The two source columns are now redundant; keep only the id and the document.
df = df.drop(['attributes', 'product_description'], axis=1)
df.head()

Unnamed: 0,product_uid,document
0,100001,Versatile connector for various 90° connections and home repair projects Stronger than angled nailing or screw fastening alone Help ensure joints are consis...
1,100002,"Brush,Roller,Spray Assembled Depth (in.) 6.63 in Assembled Height (in.) 7.76 in Assembled Width (in.) 6.63 in Revives wood and composite decks, railings, po..."
2,100003,"Built-inflange Slightly narrower for tighter spaces Designed with an 18 in. apron Durable high-gloss finish provides a smooth, shiny surface that is easy to..."
3,100004,8.56 Positive power tolerance (0 to +5-Watt) Anti-reflective and anti-soiling surface reduces power loss from dirt and dust Outstanding performance in low-l...
4,100005,"Combo Tub and Shower NonBuilt-inWaterFilter Includes the trim kit only, the rough-in kit (R10000-UNBX) is sold separately Includes the handle Maintains a ba..."
