In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ecommerce-text-classification/ecommerceDataset.csv


In [2]:
! pip install mlflow

Collecting mlflow
  Obtaining dependency information for mlflow from https://files.pythonhosted.org/packages/ae/dc/406977e8cd30d970af90612a49b756d5701465ef93b998b0337fd77bf7c5/mlflow-2.9.2-py3-none-any.whl.metadata
  Downloading mlflow-2.9.2-py3-none-any.whl.metadata (13 kB)
Collecting databricks-cli<1,>=0.8.7 (from mlflow)
  Obtaining dependency information for databricks-cli<1,>=0.8.7 from https://files.pythonhosted.org/packages/ae/a3/d56f8382c40899301f327d1c881278b09c9b8bc301c2c111633a0346d06e/databricks_cli-0.18.0-py2.py3-none-any.whl.metadata
  Downloading databricks_cli-0.18.0-py2.py3-none-any.whl.metadata (4.0 kB)
Collecting querystring-parser<2 (from mlflow)
  Downloading querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)
Collecting gunicorn<22 (from mlflow)
  Obtaining dependency information for gunicorn<22 from https://files.pythonhosted.org/packages/0e/2a/c3a878eccb100ccddf45c50b6b8db8cf3301a6adede6e31d48e8531cab13/gunicorn-21.2.0-py3-none-any.whl.metadata
  Do

In [3]:
import pandas as pd
import numpy as np
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from datasets import Dataset, DatasetDict
from datasets.features import Value, ClassLabel
from datasets import Features
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments
from transformers import Trainer
from transformers import DataCollatorWithPadding



In [4]:
data_path = "/kaggle/input/ecommerce-text-classification/ecommerceDataset.csv"

In [5]:
# Load the CSV file. The file has no headers so pass header=None
df = pd.read_csv(data_path, header=None)
# Add column name to data frame
df.columns = ['category', 'text']

In [6]:
# Create label map
label2idx = {label:i for i, label in enumerate(df.category.unique().tolist())}
label2idx

{'Household': 0, 'Books': 1, 'Clothing & Accessories': 2, 'Electronics': 3}

In [7]:
# Reverse label map
idx2label = {v:k for k,v in label2idx.items()}
idx2label

{0: 'Household', 1: 'Books', 2: 'Clothing & Accessories', 3: 'Electronics'}

In [8]:
# Create a new column with integer mapping to classes.
df['label'] = df.category.map(label2idx)

In [9]:
# Drop the NaN row.
df.dropna(inplace=True)
# Reset index
df.reset_index(drop=True, inplace=True)
# Drop the category column as we already have label map with us.
df.drop(columns=['category'], inplace=True)

In [10]:
train_df, test_df = train_test_split(df, test_size=0.2, shuffle=True, stratify=df['label'], random_state=42)
train_df, eval_df = train_test_split(train_df, test_size=0.2, shuffle=True, stratify=train_df['label'], random_state=42)

In [11]:
# First create Features for the dataset. It is optional though, but is best for documentation and serves as metadata for your dataset.
# We have two features, first is "text" which is type string and the other is our target "label".
features=Features({"text": Value(dtype='string', id=None), 
                "label": ClassLabel(num_classes=4, 
                                    names=['Household', 'Books', 'Clothing & Accessories', 'Electronics'], id=None)})

In [12]:
# Use from pandas to direclty convert from pandas dataframe to dataset.
train_dataset = Dataset.from_pandas(train_df, features=features)
test_dataset = Dataset.from_pandas(test_df, features=features)
eval_dataset = Dataset.from_pandas(eval_df, features=features)

In [13]:
# Create a dataset dict combining all the datasets under one.
dataset = DatasetDict({"train": train_dataset, "test": test_dataset, "validation": eval_dataset})

In [14]:
# Model checkpoint that we will be using
model_checkpoint = "distilbert-base-uncased"
# Load tokenizer from huggingface
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [15]:
def tokenize_function(example):
    return tokenizer(example['text'], truncation=True)

In [16]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets

  0%|          | 0/33 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 32271
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 10085
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 8068
    })
})

In [17]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Models

#### 1. Using Hugging face standard classifier by freezing base model 

In [18]:
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=4)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
## freezing base model
for param in model.base_model.parameters():
    param.requires_grad = False

In [20]:
training_args = TrainingArguments(output_dir="/kaggle/working/outputs",
                                 per_device_train_batch_size=32,
                                 per_device_eval_batch_size=16,
                                 learning_rate=5e-5,
                                 num_train_epochs=5,
                                 optim='adamw_torch_fused',
                                 logging_dir="/kaggle/working/logs/",
                                 logging_strategy='steps',
                                 logging_steps=200,
                                 evaluation_strategy='epoch',
                                 save_strategy='epoch',
                                 save_total_limit=2,
                                 load_best_model_at_end=True,
                                 report_to="mlflow")

In [21]:
trainer = Trainer(model=model,
                 args=training_args,
                 train_dataset=tokenized_datasets['train'],
                 eval_dataset=tokenized_datasets['validation'],
                 data_collator=data_collator,
                 tokenizer=tokenizer)

In [22]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.4155,0.323789
2,0.2939,0.285148
3,0.2815,0.269265
4,0.2752,0.263426
5,0.2694,0.262464




TrainOutput(global_step=2525, training_loss=0.3418268743836054, metrics={'train_runtime': 2020.0104, 'train_samples_per_second': 79.878, 'train_steps_per_second': 1.25, 'total_flos': 2.1065266604829384e+16, 'train_loss': 0.3418268743836054, 'epoch': 5.0})

In [23]:
predictions = trainer.predict(tokenized_datasets["test"])
predictions.predictions.shape



(10085, 4)

In [24]:
preds = np.argmax(predictions.predictions, axis=-1)
test_df['preds'] = preds

In [25]:
print(classification_report(test_df.label, test_df.preds, target_names=list(idx2label.values())))

                        precision    recall  f1-score   support

             Household       0.91      0.93      0.92      3863
                 Books       0.96      0.93      0.95      2364
Clothing & Accessories       0.93      0.94      0.94      1734
           Electronics       0.91      0.90      0.91      2124

              accuracy                           0.93     10085
             macro avg       0.93      0.93      0.93     10085
          weighted avg       0.93      0.93      0.93     10085



#### 2. Using Pretrained base model and custom classifier

In [26]:
! pip install mlflow

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [27]:
new_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [28]:
## Load model configuration
config = AutoConfig.from_pretrained(model_checkpoint)

## adding custom classifier with 2 linear layers
num_classes = 4
config.num_labels = num_classes
config.hidden_dropout_prob = 0.2

model1 = AutoModelForSequenceClassification.from_pretrained(model_checkpoint,config=config)

## Replacing the classifier head with custom clasifier
custom_classifier = torch.nn.Sequential(
        torch.nn.Linear(config.hidden_size, config.hidden_size),
        torch.nn.ReLU(),
        torch.nn.Dropout(0.2),
        #torch.nn.Linear(config.hidden_size, config.hidden_size),
        #torch.nn.ReLU(),
        #torch.nn.Dropout(0.2),
        torch.nn.Linear(config.hidden_size, num_classes)
        )

model1.classifier = custom_classifier


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
trainer1 = Trainer(model=model1,
                 args=training_args,
                 train_dataset=tokenized_datasets['train'],
                 eval_dataset=tokenized_datasets['validation'],
                 data_collator=data_collator,
                 tokenizer=new_tokenizer)

In [30]:
trainer1.train()



Epoch,Training Loss,Validation Loss
1,0.1592,0.144232
2,0.0775,0.104102
3,0.0403,0.097647
4,0.0186,0.0928
5,0.012,0.10179




TrainOutput(global_step=2525, training_loss=0.0759739323889855, metrics={'train_runtime': 4338.5763, 'train_samples_per_second': 37.191, 'train_steps_per_second': 0.582, 'total_flos': 2.135377020269383e+16, 'train_loss': 0.0759739323889855, 'epoch': 5.0})

In [31]:
predictions1 = trainer1.predict(tokenized_datasets["test"])
predictions1.predictions.shape



(10085, 4)

In [32]:
preds1 = np.argmax(predictions.predictions, axis=-1)
test_df['new_preds'] = preds1

In [33]:
print(classification_report(test_df.label, test_df.new_preds, target_names=list(idx2label.values())))

                        precision    recall  f1-score   support

             Household       0.91      0.93      0.92      3863
                 Books       0.96      0.93      0.95      2364
Clothing & Accessories       0.93      0.94      0.94      1734
           Electronics       0.91      0.90      0.91      2124

              accuracy                           0.93     10085
             macro avg       0.93      0.93      0.93     10085
          weighted avg       0.93      0.93      0.93     10085

