In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix, hstack
from wordbatch.models import FTRL, FM_FTRL
from nltk.corpus import stopwords

import re
import wordbatch
import pandas as pd
import numpy as np


In [2]:
def rmsle(y, y0):
    """Root mean squared logarithmic error between two equal-length inputs.

    Args:
        y: Array-like of non-negative values (e.g. true prices).
        y0: Array-like of values compared against ``y``; same length as ``y``.

    Returns:
        ``sqrt(mean((log1p(y) - log1p(y0)) ** 2))``.

    Raises:
        AssertionError: If ``y`` and ``y0`` differ in length.
    """
    assert len(y) == len(y0)
    log_gap = np.log1p(y) - np.log1p(y0)
    squared_gap = np.power(log_gap, 2)
    return np.sqrt(np.mean(squared_gap))

In [3]:
# Build the stop-word lookup table. The name `stopwords` is rebound from the
# NLTK corpus reader (imported at the top) to a plain dict, so an unguarded
# re-run of this cell would call `.words` on the dict and raise
# AttributeError. The isinstance guard keeps the cell idempotent.
if not isinstance(stopwords, dict):
    stopwords = {x: 1 for x in stopwords.words('english')}
# Matches every run of characters other than ASCII letters and digits.
non_alphanums = re.compile(u'[^A-Za-z0-9]+')


def normalize_text(text):
    """Lower-case ``text`` and keep only its informative alphanumeric tokens.

    Every run matched by ``non_alphanums`` is replaced with a single space,
    then the result is lower-cased, stripped and split on spaces. Tokens of
    length one or less, and tokens present in ``stopwords``, are dropped.

    Args:
        text: The string to normalize.

    Returns:
        The surviving tokens joined with single spaces.
    """
    cleaned = non_alphanums.sub(' ', text).lower().strip()
    kept = []
    for token in cleaned.split(" "):
        if len(token) <= 1:
            continue  # drops empty strings and single characters
        if token in stopwords:
            continue
        kept.append(token)
    return u" ".join(kept)

In [4]:
def handle_missing_inplace(df):
    """Fill missing values in the text columns of ``df`` with placeholders.

    ``category_name`` receives 'missing/missing/missing' (three '/'-separated
    parts); ``brand_name`` and ``item_description`` receive 'missing'.

    Each column is reassigned on ``df`` rather than calling
    ``df[col].fillna(..., inplace=True)``: the chained form operates on an
    intermediate Series, which emits a warning on recent pandas versions and
    leaves ``df`` unchanged when Copy-on-Write is enabled.

    Args:
        df: DataFrame containing the columns ``category_name``,
            ``brand_name`` and ``item_description``. It is modified in place.

    Returns:
        The same ``df`` object, for call chaining.

    Raises:
        KeyError: If one of the three columns is absent.
    """
    placeholders = {
        'category_name': 'missing/missing/missing',
        'brand_name': 'missing',
        'item_description': 'missing',
    }
    for column, placeholder in placeholders.items():
        df[column] = df[column].fillna(placeholder)
    return df

In [5]:
# Load the training set; the file is tab-separated.
train = pd.read_csv(
    './train.tsv',
    sep='\t',
)
train.head()  # show the first rows as the cell output

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [13]:
# Work on a fixed-seed 5% sample of the training data, with NaNs filled.
sample = handle_missing_inplace(train.sample(frac=0.05, random_state=42))
y = sample.pop('price')  # target column, removed from the feature frame

# Split category_name ("a/b/c") into its first three '/'-separated levels.
for level, column in enumerate(('cat1', 'cat2', 'cat3')):
    sample[column] = sample['category_name'].apply(
        lambda path, level=level: path.split('/')[level])

In [16]:
# Preview the sampled rows, including the derived cat1/cat2/cat3 columns.
sample.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,shipping,item_description,cat1,cat2,cat3
525834,525834,Under armor sweatpants,4,"Women/Athletic Apparel/Pants, Tights, Leggings",Under Armour,1,Used condition size small black in color two p...,Women,Athletic Apparel,"Pants, Tights, Leggings"
149839,149839,Men's watch,3,Men/Men's Accessories/Watches,Tommy Bahama,0,Tommy Bahama watch in good condition with new ...,Men,Men's Accessories,Watches
536234,536234,Eileen Fisher gray Cardigan,3,Women/Sweaters/Cardigan,Eileen Fisher,0,Large but fits medium or small,Women,Sweaters,Cardigan
427908,427908,Blue Patagonia,2,"Men/Sweats & Hoodies/Sweatshirt, Pullover","Patagonia, Inc.",0,No description yet,Men,Sweats & Hoodies,"Sweatshirt, Pullover"
193641,193641,✨4 YMED NIKE PRO for Lindsay✨,1,Kids/Girls (4+)/Other,Nike,0,4 YMED NIKE PRO compression shorts All NWT,Kids,Girls (4+),Other


In [8]:
# TF-IDF vectorizer used to turn text columns into numerical matrices:
# at most 10,000 terms, skipping terms found in more than 95% of the
# documents or in fewer than 100 documents.
tf = TfidfVectorizer(
    max_features=10000,
    max_df=0.95,
    min_df=100,
)

In [9]:
# Evaluating tf-idf (transforming each text column into a sparse matrix).
# NOTE: the single `tf` object is refit for every column, so once this cell
# has run it only retains the vocabulary fitted on `cat3`.

def _tfidf_for(column):
    """Fit `tf` on one column of `sample`, logging progress, and return the matrix."""
    print('Working with ' + column)
    matrix = tf.fit_transform(sample[column].values)
    print(7*'-')
    return matrix


x_name = _tfidf_for('name')
x_description = _tfidf_for('item_description')
x_cat1 = _tfidf_for('cat1')
x_cat2 = _tfidf_for('cat2')
x_cat3 = _tfidf_for('cat3')

Working with name


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


-------
Working with item_description
-------
Working with cat1
-------
Working with cat2
-------
Working with cat3
-------


In [10]:
# Concatenate the per-column TF-IDF matrices side by side. The result stays a
# scipy sparse matrix (low memory) and is converted to CSR so it can be
# column-sliced below.
sample_preprocessed = hstack((x_name, x_description, x_cat1, x_cat2, x_cat3)).tocsr()

# Keep only the feature columns that are non-zero in at least two rows.
mask = sample_preprocessed.getnnz(axis=0) > 1
sample_preprocessed = sample_preprocessed[:, mask]

# Hold out 15% of the rows for validation. The seed is fixed so the split,
# and therefore the reported score, is reproducible across kernel restarts.
x_train, x_val, y_train, y_val = train_test_split(
    sample_preprocessed, y, test_size=0.15, random_state=42)

In [11]:
# Define the FM_FTRL model; D is set to the number of feature columns that
# survived the mask applied to sample_preprocessed.
model = FM_FTRL(
    alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
    D=sample_preprocessed.shape[1],
    alpha_fm=0.01, L2_fm=0.0, init_fm=0.01, D_fm=200,
    e_noise=0.0001, iters=15, inv_link="identity", threads=16,
)

In [12]:
# Train on the training split, then score predictions on the validation split.
# NOTE(review): the model is fit on raw prices with inv_link="identity", so a
# prediction below -1 would make log1p inside rmsle return NaN — confirm
# whether predictions should be clipped or the target log-transformed.
model.fit(x_train, y_train)
y_pred = model.predict(x_val)
val_score = rmsle(y_val, y_pred)
print('RMSLE score using FM_FTRL:', val_score)

RMSLE score using FM_FTRL: 0.7428922496558461
