In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.api.types import CategoricalDtype
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Ridge

print(pd.__version__)

%matplotlib inline

0.22.0


In [3]:
# Load the Mercari training set, indexed by train_id.
# item_condition_id is read as an ordered categorical over the string labels
# '1'..'5'; the remaining low-cardinality columns are plain categoricals.
mercari_data = pd.read_table(
    "./train.tsv",
    index_col='train_id',
    dtype={
        'item_condition_id': CategoricalDtype(
            categories=[str(level) for level in range(1, 6)],
            ordered=True,
        ),
        'category_name': 'category',
        'brand_name': 'category',
        'shipping': 'category',
    },
)

  mask |= (ar1 == a)


In [4]:
# Break "Top/Middle/Leaf[/rest...]" category paths into their first three levels.
# n=3 caps the split at three cuts, so any deeper remainder lands in a fourth
# element that is deliberately ignored here.
# .str.get(i) is used instead of tuple-unpacking the .str accessor: unpacking
# relies on iterating the accessor (no longer supported by newer pandas) and
# raises ValueError unless the deepest path has exactly four parts.
category_levels = mercari_data['category_name'].str.split("/", n=3)
cat1, cat2, cat3 = (category_levels.str.get(level) for level in range(3))

# Rows with a missing category_name stay NaN in all three level columns.
for col, level_values in zip(['category_1', 'category_2', 'category_3'], [cat1, cat2, cat3]):
    mercari_data[col] = level_values.astype('category')

In [5]:
# Regression target: log(1 + price). log1p is the dedicated NumPy routine for
# this and stays accurate for prices near zero (price 0 maps to exactly 0).
mercari_data['log_price'] = np.log1p(mercari_data['price'])

In [6]:
# Preview the first ten listings filed under Men > Tops > T-shirts.
mercari_data.loc[
    (mercari_data['category_1'] == 'Men')
    & (mercari_data['category_2'] == 'Tops')
    & (mercari_data['category_3'] == 'T-shirts')
].head(10)

Unnamed: 0_level_0,name,item_condition_id,category_name,brand_name,price,shipping,item_description,category_1,category_2,category_3,log_price
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,Men,Tops,T-shirts,2.397895
78,Trump Shirt,3,Men/Tops/T-shirts,,9.0,0,Worn once; will be washed before sent,Men,Tops,T-shirts,2.302585
151,Nike men's dri-fit sleeveless shirt tee,3,Men/Tops/T-shirts,Nike,14.0,0,This is a men's Nike dri-fit shirt which is bl...,Men,Tops,T-shirts,2.70805
350,Stussy vintage dot Ragland LARGE,1,Men/Tops/T-shirts,,33.0,0,Brand new with tags white/burgundy,Men,Tops,T-shirts,3.526361
352,2-Mens XXL Cruise Ship T-Shirts NWOT,2,Men/Tops/T-shirts,,9.0,0,2-Mens XXL Cruise Ship T-Shirts. Grey-Royal Ca...,Men,Tops,T-shirts,2.302585
432,"'Garbage"" tee",3,Men/Tops/T-shirts,,7.0,1,A black tee with white screen print g and garb...,Men,Tops,T-shirts,2.079442
497,Nintendo 64 Men's T shirt Size 2XL,1,Men/Tops/T-shirts,Nintendo,15.0,1,New t shirt Official Licensed Product Size 2XL,Men,Tops,T-shirts,2.772589
660,Undefeated T-Shirt,2,Men/Tops/T-shirts,Undefeated,19.0,0,Like new condition Quality material Staple piece,Men,Tops,T-shirts,2.995732
1357,Huf Shirt,2,Men/Tops/T-shirts,HUF,22.0,0,"Basically brand new. No wear or tear, great co...",Men,Tops,T-shirts,3.135494
1408,Men's Nike shirt,3,Men/Tops/T-shirts,Nike,34.0,0,"Men's Nike shirt. Black . Used once, can barel...",Men,Tops,T-shirts,3.555348


In [7]:
# Restrict the data to Men > Tops > T-shirts and report how many rows have a
# (non-null) description.
data_subset = mercari_data[
    (mercari_data['category_1'] == 'Men')
    & (mercari_data['category_2'] == 'Tops')
    & (mercari_data['category_3'] == 'T-shirts')
]
print(data_subset['item_description'].count())

# Keep only rows with a real description. The explicit notna() matters:
# NaN != 'No description yet' evaluates to True, so missing descriptions would
# otherwise pass the filter and later break text vectorization, which expects
# strings.
has_description = (
    data_subset['item_description'].notna()
    & (data_subset['item_description'] != 'No description yet')
)
data_subset = data_subset[has_description]
print(data_subset['item_description'].count())

15108
13851


In [74]:
# Binary bag-of-words over the descriptions: each feature records whether a
# word occurs in a description at all. English stop words are removed and a
# word must appear in at least 50 descriptions (min_df) to be kept.
vectorizer = CountVectorizer(stop_words="english", binary=True, min_df=50)
transformed = vectorizer.fit_transform(data_subset['item_description'])

# Column sums of the binary matrix = number of descriptions containing each
# word. The result is a 1 x n_features matrix, hence the [0] when flattening.
sparse_sums = transformed.sum(axis=0)

# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases; use whichever this installation provides and
# normalise to a plain list of Python strings either way.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# (word, document count) pairs, most frequent word first.
sorted_word_counts = sorted(zip(feature_names, sparse_sums.tolist()[0]), key=lambda x: x[1], reverse=True)


In [80]:
# Ridge regression of log price on the binary word-presence features.
reg = Ridge(alpha=0.1)
reg.fit(transformed, data_subset['log_price'])

# Show (word, document count, coefficient) triples, largest coefficient first.
sorted(
    zip(
        feature_names,
        sparse_sums.tolist()[0],
        reg.coef_.tolist(),
    ),
    key=lambda word_stats: word_stats[2],
    reverse=True,
)


[('important', 80, 0.4675150106551865),
 ('bag', 214, 0.4585050919396132),
 ('ape', 74, 0.4523779001405798),
 ('gucci', 73, 0.4469280795419409),
 ('fighter', 52, 0.4443005035924868),
 ('authentic', 488, 0.43845012010254314),
 ('limited', 59, 0.405036284100835),
 ('hoodie', 52, 0.3571600811533232),
 ('supreme', 328, 0.343638702278222),
 ('odd', 61, 0.3414014259449272),
 ('rare', 127, 0.326787500609027),
 ('religion', 76, 0.30714302637340246),
 ('tshirts', 91, 0.2994836365991043),
 ('jersey', 84, 0.2841664320122767),
 ('patagonia', 54, 0.2727012548845682),
 ('affliction', 112, 0.2660557117592134),
 ('description', 54, 0.24943998758560787),
 ('shirts', 925, 0.2468695586580725),
 ('10', 244, 0.2452562936577449),
 ('jacket', 64, 0.24453627897305152),
 ('vineyard', 99, 0.23671770527207875),
 ('shorts', 62, 0.23348771707032243),
 ('asap', 54, 0.23046839391886087),
 ('lot', 155, 0.22120120399621018),
 ('tees', 120, 0.21352964575568176),
 ('runs', 106, 0.20344942738299693),
 ('fits', 587, 0.196

In [58]:
# Do try experimenting with alpha.  Keep the variance and bias within reason

# Now it's time to decide which features (words) you want to look at.
# Don't choose words like stop-words, which are really common (but appear not to be
# in your vocabulary anyway, since you took them out with stop_words="english"
# in CountVectorizer), but also don't choose words that don't appear much.
# Feb 9 - right now I'm just changing min_df in CountVectorizer to set that threshold, nothing
# complicated like in sklearn.feature_selection

# In addition to the minimum threshold, you can use some module from
# sklearn.feature_selection, like sklearn.feature_selection.mutual_info_regression
# to decide which words actually matter.

# the sklearn website has a good example on the feature_selection tools
# and compares the results of one to SVM weights

# Feb 9 - you looked at ridge regression for various minimum occurrence thresholds and alphas (regularization constant),
# and the coefficients (accessible with Ridge.coef_ (a numpy.ndarray)) barely changed at all,
# suggesting that cross-correlation between words is not an issue for the regression.  Now I'm seeing if this is true by
# trying a sparse principal components analysis to see explicitly (if this is possible) whether certain words
# are related