## Setup

In [1]:
!pip install datasets transformers -q

[K     |████████████████████████████████| 194kB 24.5MB/s 
[K     |████████████████████████████████| 2.2MB 41.9MB/s 
[K     |████████████████████████████████| 245kB 57.4MB/s 
[K     |████████████████████████████████| 112kB 42.4MB/s 
[K     |████████████████████████████████| 3.3MB 46.5MB/s 
[K     |████████████████████████████████| 870kB 52.0MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [2]:
# Used to save the model to drive
# from google.colab import drive
# drive.mount('/content/drive')

The 2nd model is a pretrained model which takes a bit of time to train,<br> if If `INFER` is True, It will load the 2nd model from drive.<br>
If `TRAIN` is True, It will train the 2nd model here.

In [3]:
TRAIN = False
INFER = True

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import json
import collections
import transformers
import datasets
import torch
import nltk
import re

from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from collections import defaultdict
from IPython.display import HTML


nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
# Download dataset
!gdown https://drive.google.com/uc?id=1YdCJU5ZxjAmNY3mbMTGf4-qo_I8evd5h
!gdown https://drive.google.com/uc?id=1Kat_WLKADObIUX4OMGoOelJsTYkL4Fx_
!gdown https://drive.google.com/uc?id=1eee8YItHSUoy6alZxp92ywlLZy-d2k8k

Downloading...
From: https://drive.google.com/uc?id=1YdCJU5ZxjAmNY3mbMTGf4-qo_I8evd5h
To: /content/processed_data.csv
11.3MB [00:00, 24.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Kat_WLKADObIUX4OMGoOelJsTYkL4Fx_
To: /content/data.csv
38.1MB [00:00, 53.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1eee8YItHSUoy6alZxp92ywlLZy-d2k8k
To: /content/below_thresh_index.txt
100% 2.00k/2.00k [00:00<00:00, 3.04MB/s]


In [6]:
df = pd.read_csv('processed_data.csv', index_col = 0)
df.head()

Unnamed: 0,description,category_1,name,product_spec
0,Wash Water Style Alisha Bleach Lycra Solid Luk...,Clothing,Alisha Solid Women Cycling Shorts,Number of Contents in Sales Package is Pack of...
1,Covered Price effective Leatherette Seater bed...,Furniture,FabHomeDecor Fabric Double Sofa Bed,Installation & Demo Details is Installation an...
2,stains Price bags One de Lifestyle Bellies dir...,Footwear,AW Bellies,"Ideal For is Women, Occasion is Casual, Color ..."
3,Wash Water Style Alisha Bleach Lycra Solid Luk...,Clothing,Alisha Solid Women Cycling Shorts,Number of Contents in Sales Package is Pack of...
4,Fashion All Liquid Pet Type Factor Arnica Box ...,Pet Supplies,Sicons All Purpose Arnica Dog Shampoo,"Pet Type is Dog, Brand is Sicons, Quantity is ..."


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19055 entries, 0 to 19054
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   description   19055 non-null  object
 1   category_1    19055 non-null  object
 2   name          19055 non-null  object
 3   product_spec  19055 non-null  object
dtypes: object(4)
memory usage: 744.3+ KB


In [8]:
SEED = 42

In [9]:
df = df.sample(frac = 1, random_state = SEED)

In [10]:
df.category_1.value_counts()

Clothing                           5905
Jewellery                          3526
Footwear                           1186
Mobiles Accessories                1098
Automotive                         1012
Home Decor Festive Needs            878
Beauty And Personal Care            698
Home Furnishing                     690
Computers                           572
Kitchen Dining                      554
Watches                             530
Baby Care                           393
Tools Hardware                      391
Toys School Supplies                328
Pens Stationery                     313
Bags Wallets Belts                  264
Furniture                           179
Sports Fitness                      164
Home Improvement                     81
Cameras Accessories                  80
Health Personal Care Appliances      43
Sunglasses                           40
Gaming                               32
Pet Supplies                         30
Home Kitchen                         24


# Modelling

In [11]:
le = LabelEncoder()
df['category_1'] = le.fit_transform(df['category_1'])

Since we have an imbalanced dataset, we need to startify and split

In [12]:
train, test_and_val = train_test_split(df, test_size = 0.2, random_state = SEED, stratify = df['category_1'])
test, val = train_test_split(test_and_val, test_size = 0.5, random_state = SEED, stratify = test_and_val['category_1'])

In [13]:
# Separate input & target
train_x, train_y = train.drop(columns = ['category_1']), train['category_1']
val_x, val_y = val.drop(columns = ['category_1']), val['category_1']
test_x, test_y = test.drop(columns = ['category_1']), test['category_1']

## TfidfVectorizer


In [14]:
# Create features using Tfidf
transformer = FeatureUnion([
                ('description_tfidf', 
                  Pipeline([('extract_field',
                              FunctionTransformer(lambda x: x['description'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())])),
                ('name_tfidf', 
                  Pipeline([('extract_field', 
                              FunctionTransformer(lambda x: x['name'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())])),
                ('product_spec_tfidf', 
                  Pipeline([('extract_field', 
                              FunctionTransformer(lambda x: x['product_spec'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())]))]) 

In [15]:
train_vectorized_x = transformer.fit_transform(train_x)
val_vectorized_x = transformer.transform(val_x)
test_vectorized_x = transformer.transform(test_x)

In [16]:
train_vectorized_x.shape

(15244, 47109)

In [17]:
clf = RandomForestClassifier(random_state=SEED, class_weight = 'balanced')
clf.fit(train_vectorized_x, train_y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [18]:
y_pred = clf.predict(val_vectorized_x)
y_true = val_y
# weighted due to imbalance
print(f1_score(y_true, y_pred, average = 'weighted'))

0.9764039919995291


Seems like a good score, let's predict on a category in the test set having less sample and see how it performs.

In [19]:
for i, class_ in enumerate(le.classes_):
    print(f'{i} - {class_}')

0 - Automotive
1 - Baby Care
2 - Bags Wallets Belts
3 - Beauty And Personal Care
4 - Cameras Accessories
5 - Clothing
6 - Computers
7 - Ebooks
8 - Eyewear
9 - Footwear
10 - Furniture
11 - Gaming
12 - Health Personal Care Appliances
13 - Home Decor Festive Needs
14 - Home Entertainment
15 - Home Furnishing
16 - Home Improvement
17 - Home Kitchen
18 - Jewellery
19 - Kitchen Dining
20 - Mobiles Accessories
21 - Pens Stationery
22 - Pet Supplies
23 - Sports Fitness
24 - Sunglasses
25 - Tools Hardware
26 - Toys School Supplies
27 - Watches


In [20]:
train.category_1.value_counts()

5     4724
18    2821
9      949
20     878
0      810
13     702
3      558
15     552
6      458
19     443
27     424
1      314
25     313
26     262
21     251
2      211
10     143
23     131
16      65
4       64
12      35
24      32
11      26
22      24
17      19
14      15
7       12
8        8
Name: category_1, dtype: int64

In [21]:
# Ebooks
EXAMPLE_CATEGORY = 7
test[test['category_1'] == EXAMPLE_CATEGORY].head(1)

Unnamed: 0,description,category_1,name,product_spec
17453,cash Replacement Guarantee Only Coursebook Buy...,7,Fast Italian Elisabeth Smith Coursebook,"Publisher is Hodder and Stoughton, Publication..."


In [22]:
ebook_sample = val[val['category_1'] == EXAMPLE_CATEGORY].head(1).drop(columns = 'category_1')
clf.predict(transformer.transform(ebook_sample))

array([7])

It predicted the correct category.<br> It did pretty well considering it had only 12 examples to train with for that particular category.

## PreTrained Model<br>
We will only use the `product_spec` feature since all other features don't have a semantic order to perform a sequence classification.

For training we have a lot of examples 15k, it will take very long to train. To cap the dataset, we will sample across the target variable using the `cap` parameter to determine how many samples will each target variable have.

In [23]:
def to_cat(df, cap = 10, return_cls_weights = False):
    """
    Function which caps the dataset according to the `cap` parameter.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        Dataframe to be capped
    cap : int
        Number of samples to sample for each category.
        min(available_samples, cap) is sampled for each category.
    return_cls_weights : boolean
        Class weights according to distribution to update weights(default - `False`)

    Returns
    -------
    if `return_cls_weights == False`:
        capped_df : pandas.core.frame.DataFrame
            Capped dataframe
    else:
        capped_df : pandas.core.frame.DataFrame
             Capped dataframe
        class_weights : collections.OrderedDict
            weight of class i

    """
    all_categories = df.category_1.unique()
    capped_df = pd.DataFrame(columns = df.columns)
    
    class_weights = dict()
    total_samples = len(df)
    total_classes = len(all_categories)

    for cat in all_categories:
        all_cat_samples = df[df.category_1 == cat]
        total_cat_samples = len(all_cat_samples)
        capped_cat = all_cat_samples.sample(min(total_cat_samples, cap),  random_state = SEED)
        # total samples of that category in the new data
        sample_len = len(capped_cat)
        capped_df = capped_df.append(capped_cat, ignore_index = True)
        class_weights[cat] = total_samples/(total_classes*sample_len)

    class_weights = collections.OrderedDict(sorted(class_weights.items()))
    if return_cls_weights:
        return capped_df, class_weights
    return capped_df

# Max 20 samples per category
CAP = 20
capped_train, class_weights = to_cat(train, cap = CAP, return_cls_weights = True)
capped_val = to_cat(val, cap = CAP)

Intialize some variables.

In [24]:
MODEL_CHECKPOINT = "distilbert-base-uncased"
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8
EPOCHS = 30
TOKENIZER = transformers.AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
MAX_LEN = 512
CHECKPOINT_DIR = 'pretrained-model/'
LEARNING_RATE = 2e-5
LOGGING_STEPS = 10
SAVE_TOTAL_LIMIT = 3
NUM_LABELS = len(le.classes_)
METRIC = 'f1'
PATIENCE = 2
metric = datasets.load_metric('f1')
# Save & Load path
MODEL2_PATH = '/content/drive/MyDrive/MIDAS/Models/Pretrained-bert/'

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1905.0, style=ProgressStyle(description…




## Dataset Tokenization

In [26]:
def make_dataset(df, val = False):
    """
    Converts a dataset to input taken by 🤗 Transformers

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        Dataframe to be capped
    val : boolean
        Dataset for evaluation/training (default False)
    Returns
    -------
    dataset : 🤗 Dataset
        Tokenized 🤗 Dataset

    """
    def tokenize(examples):
        """Tokenized a Dataset"""
        return TOKENIZER(examples['sentence'], truncation=True)

    sentences = df.product_spec
    if not val:
        labels = df.category_1
        data_dict = {
            'id' : list(range(len(df))),
            'sentence' : sentences,
            'label' : labels
        }
    else:
        data_dict = {
            'id' : list(range(len(df))),
            'sentence' : sentences,
        }
    dataset = datasets.Dataset.from_dict(data_dict)
    return dataset.map(tokenize, batched = True)

encoded_train = make_dataset(capped_train)
encoded_val = make_dataset(capped_val)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




We need to code up a `CustomTrainer` to take class distribution into account.<br>
For the metric we need to use weighted F1 score, to account for imbalance.

In [27]:
print(f'Total samples used for training - {len(encoded_train)} with a cap of {CAP}.')

Total samples used for training - 534 with a cap of 20.


In [32]:
model = transformers.AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=NUM_LABELS)

class CustomTrainer(transformers.Trainer):
    def compute_loss(self, model, inputs, return_outputs = False):
        """Overide with custom weighted loss"""
        labels = inputs.pop("labels")
        weights = torch.tensor(list(class_weights.values())).cuda()

        outputs = model(**inputs)
        logits = outputs.logits

        loss_func = torch.nn.CrossEntropyLoss(weight=weights)
        loss = loss_func(logits, labels)
        
        return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_pred):
    """Helper metrics function"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels, average = 'weighted')

args = transformers.TrainingArguments(
        CHECKPOINT_DIR,
        overwrite_output_dir = True,
        evaluation_strategy = "epoch",
        learning_rate= LEARNING_RATE,
        per_device_train_batch_size = TRAIN_BATCH_SIZE,
        per_device_eval_batch_size = VALID_BATCH_SIZE,
        num_train_epochs= EPOCHS,
        seed = SEED,
        load_best_model_at_end = True,
        logging_steps = LOGGING_STEPS, 
        save_total_limit = SAVE_TOTAL_LIMIT,
        metric_for_best_model = METRIC
    )
trainer = CustomTrainer(
        model,
        args,
        train_dataset= encoded_train,
        eval_dataset= encoded_val,
        tokenizer= TOKENIZER,
        compute_metrics= compute_metrics,
        callbacks = [transformers.EarlyStoppingCallback(PATIENCE)]
    )

if TRAIN:
    trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [33]:
# To save the model
# model.save_pretrained(MODEL_PATH_2)

In [34]:
ebook_sample

Unnamed: 0,description,name,product_spec
17462,cash Replacement Kernphysik Guarantee die hrun...,Einf hrung die Kernphysik,"Publisher is Wiley, Publication Date is 2015-0..."


In [35]:
if TRAIN:
    out = trainer.predict(sample)
    print(le.inverse_transform(np.argmax(out.predictions, axis = 1)))

In [39]:
# Tokenize the ebook sample
sample = make_dataset(ebook_sample, val = True)

if INFER:
    # Download model from drive
    import gdown
    import os
    url = 'https://drive.google.com/uc?id=1-3BVfpYTmMNq4PWIf0-RoqFEa7N0CO5-'
    path = '/content/model/pretrained-model/'
    if not os.path.exists(path):
        os.makedirs(path)
    gdown.download(url, path + 'pytorch_model.bin', quiet=True)
    gdown.download('https://drive.google.com/uc?id=1-HeZWu_-fsGD4Di6gXrr6NygJCmlqlqi', path + 'config.json', quiet=True)

    model2 = transformers.DistilBertForSequenceClassification.from_pretrained(path, local_files_only=True)
    collator = transformers.DataCollatorWithPadding(TOKENIZER)
    trainer2 = transformers.Trainer(model2, data_collator  = collator, compute_metrics= compute_metrics)
    # predict on the ebook sample
    out = trainer2.predict(sample)
    print(le.inverse_transform(np.argmax(out.predictions, axis = 1)))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




['Ebooks']


In [41]:
trainer2.evaluate(encoded_val)

{'eval_f1': 0.8970872704281935,
 'eval_loss': 0.493320494890213,
 'eval_mem_cpu_alloc_delta': 380928,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 289526784,
 'eval_runtime': 2.3227,
 'eval_samples_per_second': 167.905}

## Inference on Data without targets

Let's use both of our models to predict on the samples which we removed in the preproc notebook which didn't have a feasible target variable and see what the model outputs.

In [42]:
raw_data = pd.read_csv('data.csv')
with open("below_thresh_index.txt", "r") as fp:
        below_thresh_index = json.load(fp)
REQ_COLS = ['description', 'product_name', 'product_specifications']
below_thresh = raw_data.iloc[below_thresh_index][REQ_COLS]
below_thresh.head()

Unnamed: 0,description,product_name,product_specifications
5,Key Features of Eternal Gandhi Super Series Cr...,Eternal Gandhi Super Series Crystal Paper Weig...,"{""product_specification""=>[{""key""=>""Model Name..."
30,Key Features of Bengal Blooms Rose Artificial ...,Bengal Blooms Rose Artificial Plant with Pot,"{""product_specification""=>[{""key""=>""Brand"", ""v..."
44,Specifications of Vishudh Printed Women's Stra...,Vishudh Printed Women's Straight Kurta,"{""product_specification""=>[{""key""=>""Sleeve"", ""..."
45,Specifications of Vishudh Printed Women's Stra...,Vishudh Printed Women's Straight Kurta,"{""product_specification""=>[{""key""=>""Sleeve"", ""..."
46,Specifications of Vishudh Printed Women's Anar...,Vishudh Printed Women's Anarkali Kurta,"{""product_specification""=>[{""key""=>""Sleeve"", ""..."


Apply the same steps from the previous notebook to all these feature variables.

In [43]:
all_words = defaultdict(lambda:1)
stopwords =  set(nltk.corpus.stopwords.words('english'))
for word in stopwords:
    all_words[word] = 0

def clean_desc(s):
    """
    Cleans the description
    & removes the stopwords

    Parameters
    ----------
    s : string
        A single string
    
    Returns
    -------
    s : string
        Joined tokens 
    """
    s = re.sub(r'[^a-zA-Z\s]+',' ', s)
    # remove one letter words
    s = re.sub(r'\b\w\b',' ', s)
    # clean
    s = re.sub(r'\s+',' ', s).strip()
    word_tokens = {token for token in s.split(' ') if all_words[token]}
    if not word_tokens:
        return s
    return ' '.join(word_tokens)

tokens = []
for s in below_thresh['product_name'].values:
    # keep only letters & spaces
    s = re.sub(r'[^a-zA-Z\s]+',' ', s)
    # remove one letter words
    s = re.sub(r'\b\w\b',' ', s)
    # clean
    s = re.sub(r'\s+',' ', s).strip()
    word_tokens = [token for token in s.split(' ') if all_words[token]]
    if not word_tokens:
        tokens.append(s)
    else:
        tokens.append(' '.join(word_tokens))

below_thresh.loc[:, ['name']] = tokens
below_thresh.drop(columns = ['product_name'], inplace = True)

def clean_product_spec(s):
    """Cleans the variable 
    & joins to have a semantic meaning

    Parameters
    ----------
    s : string
        A single product spec
    
    Returns
    -------
    parsed : string
        parsed string
    """
    if pd.isna(s):
        return np.nan
    pattern = re.compile(r'"key"=>"(.*?)".*?"value"=>"(.*?)"')
    matches = pattern.finditer(s)
    parsed = ""
    for match in matches:
        key = match.group(1)
        value = match.group(2)
        parsed += f"{key} is {value}, "
    parsed = parsed.strip()[:-1]
    if parsed:
        return parsed
    return np.nan

below_thresh['product_spec'] = below_thresh['product_specifications'].apply(clean_product_spec)
below_thresh.drop(columns = ['product_specifications'], inplace = True)
below_thresh = below_thresh[below_thresh['product_spec'].notna()].copy(deep = True)
below_thresh.reset_index(inplace = True, drop = True)

below_thresh.head()

Unnamed: 0,description,name,product_spec
0,Key Features of Eternal Gandhi Super Series Cr...,Eternal Gandhi Super Series Crystal Paper Weig...,"Model Name is Gandhi Paper Weight Mark V, Weig..."
1,Key Features of Bengal Blooms Rose Artificial ...,Bengal Blooms Rose Artificial Plant Pot,"Brand is Bengal Blooms, Model Number is BBAJC2..."
2,Specifications of Vishudh Printed Women's Stra...,Vishudh Printed Women Straight Kurta,"Sleeve is Sleeveless, Number of Contents in Sa..."
3,Specifications of Vishudh Printed Women's Stra...,Vishudh Printed Women Straight Kurta,"Sleeve is Sleeveless, Number of Contents in Sa..."
4,Specifications of Vishudh Printed Women's Anar...,Vishudh Printed Women Anarkali Kurta,"Sleeve is Half Sleeve, Number of Contents in S..."


In [44]:
rf_clf_predictions = clf.predict(transformer.transform(below_thresh))
bert_pred = np.argmax(trainer2.predict(make_dataset(below_thresh.reset_index(drop = True), val = True)).predictions, axis = 1)

rf_bert_df = pd.DataFrame(columns = ['product_name', 'predicted_category_rf_clf', 'predicted_category_bert'])
for idx in below_thresh.index:
    rf_bert_df = rf_bert_df.append({'product_name' : below_thresh.iloc[idx]['name'],
                                'predicted_category_rf_clf' : le.inverse_transform([rf_clf_predictions[idx]])[0],
                                'predicted_category_bert' : le.inverse_transform([bert_pred[idx]])[0]}, ignore_index = True)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




It would be interesting to compare the predictions of Random Forest and the Bert Model.

In [45]:
# Result is not truncated so we can compare
# Check out the first 10 predictions
HTML(rf_bert_df.head(10).to_html())

Unnamed: 0,product_name,predicted_category_rf_clf,predicted_category_bert
0,Eternal Gandhi Super Series Crystal Paper Weights Silver Finish,Pens Stationery,Pens Stationery
1,Bengal Blooms Rose Artificial Plant Pot,Home Decor Festive Needs,Home Decor Festive Needs
2,Vishudh Printed Women Straight Kurta,Clothing,Clothing
3,Vishudh Printed Women Straight Kurta,Clothing,Clothing
4,Vishudh Printed Women Anarkali Kurta,Clothing,Clothing
5,BuildTrack PIR Wireless Motion Sensor One Switch Control Wireless Sensor Security System,Automotive,Home Improvement
6,Skayvon SUMMERSIBLE SINGLE PHASE PUMP CONTROLLER Wired Sensor Security System,Automotive,Home Improvement
7,MASARA Solid Women Straight Kurta,Clothing,Clothing
8,Vishudh Printed Women Straight Kurta,Clothing,Clothing
9,Skayvon SUBMERSIBBLE THREE PHASE PUMP CONTROLLER Wired Sensor Security System,Automotive,Home Improvement


It is worth noting that the Random Forest was trained on ~15k examples and Bert model just on 20% of that.

In [46]:
print(f'The two models don"t agree on {int((sum(rf_bert_df.predicted_category_rf_clf != rf_bert_df.predicted_category_bert)/len(rf_bert_df)) * 100)}% samples')

The two models don"t agree on 24% samples
