# Training a model to predict the brand of a health product from just the title of that product

In [1]:
# import necessary libraries

# data wrangling
import numpy as np
import pandas as pd

# machine learning
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import accuracy_score

from joblib import dump, load

In [2]:
# Load the tab-separated product export. header=None makes pandas treat the
# first line as data, so columns 0, 1, 3 and 4 are selected by position and
# given explicit names.
read_options = dict(
    sep='\t',
    encoding='utf-8',
    header=None,
    usecols=[0, 1, 3, 4],
    names=['global_id', 'title', 'brand_id', 'brand_name'],
)
data = pd.read_csv('beauty_leverbaar_20160729.txt', **read_options)
data.head(3)

Unnamed: 0,global_id,title,brand_id,brand_name
0,1004004012991345,Create The Look Lipgloss Set,,
1,9000000012514720,Beurer BS69 Spiegel met Verlichting,4493486.0,Beurer
2,9000000012514722,Beurer BS49 Spiegel met Verlichting,4493486.0,Beurer


In [3]:
# Rows with a brand_id form the labelled set used for train and test;
# their brand_id is cast to int.
has_brand = data['brand_id'].notnull()
df_brand = (
    data[has_brand]
    .reset_index(drop=True)
    .assign(brand_id=lambda frame: frame['brand_id'].astype(int))
)

# Rows without a brand_id are kept apart as the "real validation" data.
df_no_brand = data[~has_brand].reset_index(drop=True)

In [4]:
# all data with brand_id filled
df_brand.head(3)

Unnamed: 0,global_id,title,brand_id,brand_name
0,9000000012514720,Beurer BS69 Spiegel met Verlichting,4493486,Beurer
1,9000000012514722,Beurer BS49 Spiegel met Verlichting,4493486,Beurer
2,9200000002890724,Chicco - Veilige Wattenstaafjes Sicurnet,5329407,Chicco


In [5]:
# Shuffle the index labels of df_brand and hold out 20% of them for testing;
# the fixed random_state keeps the split identical between runs.
split_options = {'test_size': 0.2, 'shuffle': True, 'random_state': 42}
train_index, test_index = train_test_split(df_brand.index, **split_options)

In [6]:
# create train and test set
# Features are the product titles, targets are the brand ids.
X = df_brand['title']
y = df_brand['brand_id']

# train_index / test_index hold index *labels* drawn from df_brand.index, so
# select with label-based .loc. Positional .iloc only gives the same rows
# while the index happens to be exactly 0..n-1.
X_train, y_train = X.loc[train_index], y.loc[train_index]
X_test, y_test = X.loc[test_index], y.loc[test_index]

In [7]:
# set up pipeline for training model: first vectorizing title and then using a linear classifier
pipeline = Pipeline([
    # Word uni- and bi-grams of the title, accents stripped, sublinear tf
    # scaling, vocabulary capped at 10000 features.
    ('tf_idf', TfidfVectorizer(
        strip_accents='unicode',
        ngram_range=(1, 2),
        sublinear_tf=True,
        max_features=10000,
    )),
    # The 'modified_huber' loss is one of the SGD losses that exposes
    # predict_proba, which the top-3 brand lookup relies on.
    # SGD shuffles the training data every epoch; without a fixed random_state
    # each run fits a different model, so the reported accuracies could not be
    # reproduced even though the train/test split itself is seeded.
    ('sgd', SGDClassifier(
        loss='modified_huber',
        verbose=0,
        max_iter=50,
        tol=1e-3,
        n_iter_no_change=5,
        random_state=42,
    )),
])

In [8]:
# Fit the whole pipeline (vectorizer, then classifier) on the training titles
# and keep the fitted result under a descriptive name.
beauty_brand_model = pipeline.fit(X=X_train, y=y_train)

In [9]:
# Score the fitted model on the data it was trained on and on the held-out data.
train_accuracy = accuracy_score(y_train, beauty_brand_model.predict(X_train))
test_accuracy = accuracy_score(y_test, beauty_brand_model.predict(X_test))

print(f'Train accuracy: {train_accuracy}')
print(f'Test accuracy:  {test_accuracy}')

Train accuracy: 0.986681234066906
Test accuracy:  0.9357053682896379


In [10]:
# Lookup table that turns a predicted brand_id into its brand_name; when a
# brand_id occurs more than once, the first row seen supplies the name.
unique_brands = df_brand[['brand_id', 'brand_name']].drop_duplicates(subset=['brand_id'])
brand_id_to_brandname = unique_brands.set_index('brand_id')['brand_name'].to_dict()

In [11]:
# for a given title return the top 3 brands + probabilities
def predict_top3_brands_for_title(title):
    """Print the three most probable brands for a single product title.

    Parameters
    ----------
    title : str or iterable of str
        Either the title itself or an iterable holding exactly one title
        (e.g. ``['Rimmel']``).

    Returns
    -------
    None
        For each of the top three brands, the brand id, brand name and
        probability (rounded to 3 decimals) are printed, most probable first.

    Raises
    ------
    ValueError
        If the iterable does not contain exactly one title.
    """
    # A bare string must be wrapped: the pipeline expects an iterable of
    # documents, not a single document.
    titles = [title] if isinstance(title, str) else list(title)
    if len(titles) != 1:
        # Flattening the probabilities of several titles into one vector would
        # produce indices that no longer line up with the classifier's classes.
        raise ValueError(
            f'expected exactly one title, got {len(titles)}'
        )

    # predict_proba returns one row per title; take the only row.
    probas = beauty_brand_model.predict_proba(titles)[0]
    # Indices of the three highest probabilities, highest first.
    top3_index = np.argsort(probas)[::-1][:3]
    classes = beauty_brand_model.named_steps['sgd'].classes_

    for class_index in top3_index:
        brand_id = classes[class_index]
        probability = probas[class_index].round(3)
        print(f'brandid: {brand_id}\n'
              f'brandname: {brand_id_to_brandname[brand_id]}\n'
              f'probability: {probability}\n'
              f'------------------------\n'
        )

In [12]:
# Try the helper on a one-item list holding a single short title.
sample_title = ['Rimmel']
predict_top3_brands_for_title(sample_title)

brandid: 8092907
brandname: Rimmel London
probability: 0.997
------------------------

brandid: 14538509
brandname: MijnBaard
probability: 0.002
------------------------

brandid: 14865608
brandname: Courrèges
probability: 0.001
------------------------



In [13]:
# Write the fitted pipeline to disk with joblib, using compression.
model_path = 'beauty_brand_model.joblib'
dump(beauty_brand_model, model_path, compress=True)

['beauty_brand_model.joblib']

In [14]:
# Write the brand_id -> brand_name lookup dictionary to disk with joblib.
lookup_path = 'brand_id_to_brandname.joblib'
dump(brand_id_to_brandname, lookup_path)

['brand_id_to_brandname.joblib']