In [14]:
import re
import math
import numpy as np
import pandas as pd

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_log_error

from keras import backend as K
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation, concatenate, GRU, Embedding, Flatten, BatchNormalization
from keras.models import Model
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping

from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import Word2Vec

In [2]:
# Load the cleaned listings and keep only the rows whose top-level category
# code (cat1) is 554 or 934.
# NOTE(review): judging by the rows displayed below these codes look like the
# label-encoded "Men" and "Women" categories -- confirm against the encoder.
train = pd.read_csv("/mnt/disks/~/clean.csv")
# .copy() makes `cloth` an independent frame, so the columns added to it in
# later cells do not trigger SettingWithCopyWarning on a slice of `train`.
cloth = train[train.cat1.isin([554, 934])].copy()
print(cloth.shape)
cloth.head(3)

(758065, 13)


Unnamed: 0,name,item_condition_id,category_name,brand_name,price,shipping,item_description,cat1,cat2,cat3,cat4,cat5,target
0,mlb cincinnati reds t shirt size xl,3,Men/Tops/T-shirts,4786,10.0,1,no description yet,554,859,827,950,950,-0.369464
1,ava-viv blouse,1,Women/Tops & Blouses/Blouse,4180,10.0,1,adorable top with a hint of lace and a key hol...,934,860,104,950,950,-0.369464
2,24k gold plated rose,1,Women/Jewelry/Necklaces,4786,44.0,0,complete with certificate of authenticity,934,480,584,950,950,0.000978


# Tokenizing

In [3]:
# Tokenizing
# Run Keras' text_to_word_sequence over the title and the description of every
# listing; each `token_<column>` column holds one list of word tokens per row.
for text_column in ('name', 'item_description'):
    cloth['token_' + text_column] = [
        text_to_word_sequence(text) for text in cloth[text_column]
    ]

In [5]:
# Train a word2vec model on the tokenized item descriptions.
# size=2 gives every word a 2-dimensional vector, and min_count=5 drops words
# seen fewer than 5 times (parameter meanings per the gensim Word2Vec docs).
w2v_params = dict(size=2, window=5, min_count=5, sg=0)
word2vec = Word2Vec(cloth['token_item_description'], **w2v_params)

In [6]:
# Collect (term, row index in the embedding matrix, corpus count) for every
# word in the trained vocabulary.
ordered_vocabulary = [(term, voc.index, voc.count) for term, voc in word2vec.wv.vocab.items()]

# sort by the term counts, so the most common terms appear first
ordered_vocabulary = sorted(ordered_vocabulary, key=lambda x: -x[2])

# unzip the terms, integer indices, and counts into separate lists
ordered_terms, term_indices, term_counts = zip(*ordered_vocabulary)

# create a DataFrame with the word2vec vectors as data,
# and the terms as row labels.
# The rows of syn0 are picked through term_indices so that every label is
# paired with its own vector. Taking syn0 as-is would only be correct if the
# count sort above reproduced gensim's internal row order exactly, which is
# not guaranteed when several terms share the same count.
word_vectors = pd.DataFrame(word2vec.wv.syn0[np.asarray(term_indices), :], index=ordered_terms)

In [7]:
# Preview the vectors of the ten most frequent terms rather than dumping the
# whole vocabulary frame into the notebook output.
word_vectors.head(10)

Unnamed: 0,0,1
and,-0.639325,0.610294
size,1.067381,2.078361
a,-1.818077,0.019953
the,-1.723899,-0.038220
in,-1.207547,0.380990
with,0.539387,1.488129
new,0.377334,1.562561
for,-3.533791,-0.913277
is,-1.602562,0.107850
to,-4.997384,-1.554485


In [8]:
# Reload the listings from word.csv (this replaces the `cloth` frame used in
# the cells above) and rebuild the two token-list columns on the new frame.
cloth = pd.read_csv('/mnt/disks/~/word.csv')
cloth['token_name'] = cloth['name'].map(text_to_word_sequence)
cloth['token_item_description'] = cloth['item_description'].map(text_to_word_sequence)

In [9]:
# Represent every listing by the sum of the word2vec vectors of the words in
# its description. Words that are not in the word2vec vocabulary contribute
# nothing, so a description made only of unknown words stays at zero.

# One dict lookup per token replaces a boolean scan over the whole vocabulary
# frame, and makes "unknown word" an explicit case instead of an exception
# swallowed by a bare `except`.
term_to_vector = dict(zip(word_vectors.index, word_vectors.values))

vec = np.zeros((cloth.shape[0], word_vectors.shape[1]))
for row, tokens in enumerate(cloth['token_item_description']):
    for token in tokens:
        token_vector = term_to_vector.get(token)
        if token_vector is not None:
            vec[row] += token_vector

In [27]:
# Garment keywords looked for in a listing title. Order matters: getItem keeps
# the LAST keyword that matches, so later entries win over earlier ones
# (e.g. 't shirt' overrides 'shirt').
items = ['missing', 'shirt', 't-shirt', 't shirt', 'pants', \
         'jeans', 'trousers', 'jacket', 'coat', \
         'sweater', 'hat', 'cap', 'dress', 'shorts', \
         'underwear', 'socks', 'blouse', 'shoes', 'boots']

# Every keyword is matched literally, so compile each one once up front.
_item_patterns = [(w, re.compile(re.escape(w))) for w in items]

def getItem(text):
    """Return the garment keyword found in a listing title.

    Matching is a plain substring search (so 'hat' also matches 'what'), and
    when several keywords occur the one latest in `items` is returned.

    Parameters
    ----------
    text : str
        Listing title. A null value (None / NaN) is treated as having no
        keyword instead of raising TypeError inside the regex search.

    Returns
    -------
    str
        One of the entries of `items`; 'missing' when nothing matches.
    """
    if pd.isnull(text):
        return 'missing'
    item = 'missing'
    for w, pattern in _item_patterns:
        if pattern.search(text):
            item = w
    return item

# Tag every listing with the garment keyword getItem finds in its title.
cloth['item_name'] = cloth['name'].map(getItem)

# Fit the encoder on the full keyword list, so a keyword gets an integer code
# even when no listing uses it, then replace the keyword strings in the
# item_name column by those codes.
item_name_le = LabelEncoder().fit(items)
cloth['item_name'] = item_name_le.transform(cloth['item_name'])

In [10]:
# Attach the two summed description-vector components as columns vec1 / vec2.
# assign() overwrites the columns if they already exist, so re-running this
# cell does not append duplicate vec1/vec2 columns, and `vec` is no longer
# rebound from an array to a DataFrame.
description_vectors = np.asarray(vec)
cloth = cloth.assign(vec1=description_vectors[:, 0], vec2=description_vectors[:, 1])
cloth.head()

Unnamed: 0,name,item_condition_id,category_name,brand_name,price,shipping,item_description,cat1,cat2,cat3,cat4,cat5,target,token_name,token_item_description,vec1,vec2
0,mlb cincinnati reds t shirt size xl,3,Men/Tops/T-shirts,4786,10.0,1,no description yet,554,859,827,950,950,-0.369464,"[mlb, cincinnati, reds, t, shirt, size, xl]","[no, description, yet]",-28.168664,-9.071722
1,ava-viv blouse,1,Women/Tops & Blouses/Blouse,4180,10.0,1,adorable top with a hint of lace and a key hol...,934,860,104,950,950,-0.369464,"[ava, viv, blouse]","[adorable, top, with, a, hint, of, lace, and, ...",-19.729551,24.155161
2,24k gold plated rose,1,Women/Jewelry/Necklaces,4786,44.0,0,complete with certificate of authenticity,934,480,584,950,950,0.000978,"[24k, gold, plated, rose]","[complete, with, certificate, of, authenticity]",-7.943107,0.258878
3,bundled items requested for ruie,3,Women/Other/Other,4786,59.0,0,"banana republic bottoms, candies skirt with ma...",934,609,609,950,950,0.076625,"[bundled, items, requested, for, ruie]","[banana, republic, bottoms, candies, skirt, wi...",29.092519,39.313323
4,acacia pacific tides santorini top,3,Women/Swimwear/Two-Piece,79,64.0,0,size small but straps slightly shortened to fi...,934,824,894,950,950,0.097672,"[acacia, pacific, tides, santorini, top]","[size, small, but, straps, slightly, shortened...",-16.265248,6.449293


In [66]:
from sklearn.neural_network import MLPRegressor

# Model inputs: the encoded garment keyword, condition, brand, shipping flag,
# the three category codes and the two summed description-vector components.
feature_columns = ['item_name','item_condition_id','brand_name','shipping','cat1','cat2','cat3',\
                   'vec1','vec2']
X = cloth.loc[:, feature_columns]
y = cloth['price']
# 70/30 split. random_state pins the shuffle so the reported scores can be
# reproduced; test_size is spelled out so the split does not depend on the
# library's default for the complement of train_size.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=42)

In [74]:
def rmsle(ytrue,y):
    """Root mean squared logarithmic error between targets and predictions.

    Parameters
    ----------
    ytrue : array-like
        Ground-truth values.
    y : array-like
        Predicted values.

    Returns
    -------
    The square root of sklearn's mean_squared_log_error(ytrue, y).
    """
    squared_log_error = mean_squared_log_error(ytrue, y)
    return np.sqrt(squared_log_error)

# RandomForestRegressor is not imported anywhere else in this notebook, so
# without this line the cell raises NameError on a fresh kernel.
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on the training split. random_state pins the
# bootstrap / feature sampling so the scores below can be reproduced.
rr = RandomForestRegressor(n_estimators=500,min_samples_leaf=50,max_depth=20,
                           random_state=42)
rr.fit(X_train,y_train)

# RMSLE on the data the model was fitted on ...
y_pred_tr = rr.predict(X_train)
score_train = rmsle(y_train,y_pred_tr)

# ... and on the held-out split.
y_pred = rr.predict(X_test)
score_test = rmsle(y_test,y_pred)

print(score_train)
print(score_test)

0.72112973372
0.752816775926


In [72]:
import pickle

# Persist the fitted random forest. The `with` block closes the file handle
# (and flushes the pickle to disk) even if dumping fails part-way.
with open('/mnt/disks/~/rr.sav','wb') as model_file:
    pickle.dump(rr, model_file, protocol=2)

In [73]:
# Persist the term -> vector table. The `with` block closes the file handle
# (and flushes the pickle to disk) even if dumping fails part-way.
with open('/mnt/disks/~/wordvector.sav','wb') as vectors_file:
    pickle.dump(word_vectors, vectors_file, protocol=2)