In [1]:
# Notebook to create a brand prediction model for Beauty & Health

In [2]:
# Standard settings to make notebook more legible

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import pandas as pd
import numpy as np

pd.set_option('display.float_format', lambda x: '%.5f' % x)
np.set_printoptions(precision=6,threshold=5000,linewidth=200,suppress=True)

In [3]:
# define column names and dtypes

col_names = ["id","title","description","brand_id","brand_name","ean","parent_id","chunk_id","brick_id"]
dtypes = {"id":object,"title":object,"description":object,"brand_id":object,"brand_name":object,"ean":object,"parent_id":object,"chunk_id":object,"brick_id":object}

In [4]:
data_orig = pd.read_csv("beauty_leverbaar_20160729.txt",sep="\t",header=None,names=col_names,dtype=dtypes)
data = data_orig

In [5]:
data_full = data[data['brand_id'].notnull()]
data_null = data[data['brand_id'].isnull()]
data_full.reset_index(inplace=True,drop=True)
data_null.reset_index(inplace=True,drop=True)

In [6]:
# for category Beauty the title worked best as a predictor for the brand_id
col_names_y = ['brand_id']
col_names_X = ['title']

In [7]:
from sklearn.cross_validation import StratifiedKFold
kf = StratifiedKFold(data_full['brand_id'],n_folds=5,shuffle=True,random_state=42)



In [8]:
train_indices, test_indices = next(iter(kf))

In [9]:
X_train, y_train = data_full[col_names_X].loc[train_indices], data_full[col_names_y].loc[train_indices]
X_test, y_test = data_full[col_names_X].loc[test_indices], data_full[col_names_y].loc[test_indices]

In [11]:
# SGDClassifier gives way better results than Multinomial Bayes

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier

pipeline = Pipeline([
        ('tf_idf', TfidfVectorizer(strip_accents='unicode',ngram_range=(1,2),sublinear_tf=True)),
        ('sgd', SGDClassifier(loss='modified_huber', verbose=0, n_iter=50))
    ])

In [12]:
pipeline.fit(X_train.values.ravel(),y_train.values.ravel())

Pipeline(steps=[('tf_idf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
...   penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False))])

In [13]:
# accuracy on test set is almost 95%

from sklearn.metrics import accuracy_score

print(accuracy_score(y_train.values.ravel(), pipeline.predict(X_train.values.ravel())))
print(accuracy_score(y_test.values.ravel(), pipeline.predict(X_test.values.ravel())))

0.995693051106
0.948275862069


In [15]:
# with this we have a model to be used in the webapp

from sklearn.externals import joblib

joblib.dump(pipeline, 'pipeline.pkl')

['pipeline.pkl',
 'pipeline.pkl_01.npy',
 'pipeline.pkl_02.npy',
 'pipeline.pkl_03.npy',
 'pipeline.pkl_04.npy',
 'pipeline.pkl_05.npy',
 'pipeline.pkl_06.npy']

In [16]:
pipeline = joblib.load('pipeline.pkl')

In [18]:
# this creates a brand translator also for use in the webapp
# the model predicts brand_id's, so i need a dictionary to see which brand_names belong to the brand id

brand_id_list = data_full['brand_id'].tolist()
brand_name_list = data_full['brand_name'].tolist()

brand_translator = {}
for i in range(len(brand_id_list)):
    if brand_translator.get(brand_id_list[i],None) is None:
        brand_translator[brand_id_list[i]] = brand_name_list[i]
        
import pickle

pickle.dump(brand_translator, open( "brand_translator.pkl", "wb" ) )

In [21]:
# brand_translator = pickle.load( open( "brand_translator.pkl", "rb" ) )