## IMPORTING THE LIBRARIES

In [None]:
import numpy as np
import pandas as pd
# IMPORTING THE METRICS
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# IMPORTING THE TRANSFORMER
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
# IMPORTING THE FRAMEWORK
import torch
from tqdm import tqdm
import os
os.environ["WANDB_DISABLED"] = "true"

## IMPORTING THE DATASET

In [None]:
# The CSV is read with explicit column names: category label first,
# product description second.
df = pd.read_csv(
    "../input/ecommerce-text-classification/ecommerceDataset.csv",
    names=["labels", "descriptions"],
)
# Preview the first rows.
df.head()


## DATA PREPROCESSING

In [None]:
# Force every description to a plain Python string so the tokenizer only
# ever sees text. NOTE: a missing value (NaN) becomes the literal "nan".
descriptions = [str(text) for text in df["descriptions"]]

In [None]:
# Encode the category names as integer class ids. The fitted encoder `le`
# is kept so the ids can be mapped back to names for reporting.
le = LabelEncoder()
labels = le.fit_transform(df["labels"].tolist()).tolist()

## IMPORTING THE PRETRAINED MODEL
### (BERT-BASE-UNCASED)

In [None]:
# Load pretrained BERT with a sequence-classification head. The number of
# output classes is taken from the fitted label encoder instead of being
# hard-coded, so the head always matches the categories found in the data.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(le.classes_))
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

## IMPORTING THE TOKENIZER
### (BERT-BASE-UNCASED)

In [None]:
# Tokenizer matching the "bert-base-uncased" checkpoint; input text is
# lower-cased before tokenization.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

## SPLITTING THE DATASET

In [None]:
# 60% train / 20% validation / 20% test.
# First carve off 40% as a hold-out set, keeping the class balance.
x_train, x_rest, y_train, y_rest = train_test_split(
    descriptions, labels, test_size=0.4, stratify=labels, random_state=42)
# Then halve the hold-out set into validation and test. This split is
# stratified too, so both halves keep the same class proportions as the
# training data (the original second split was not stratified).
x_valid, x_test, y_valid, y_test = train_test_split(
    x_rest, y_rest, test_size=0.5, stratify=y_rest, random_state=42)

## CONVERTING THE DATA INTO BATCHES

In [None]:
def batch_token(tokenizer, dataset, *, max_length=256):
    """Tokenize a whole collection of texts in one call.

    Args:
        tokenizer: Object exposing ``batch_encode_plus`` (here a
            ``BertTokenizer``).
        dataset: The texts to encode, passed straight to the tokenizer.
        max_length: Upper bound on the encoded sequence length; longer
            inputs are truncated. Defaults to 256, the value that was
            previously hard-coded.

    Returns:
        Whatever ``tokenizer.batch_encode_plus`` returns for these settings:
        padded, truncated encodings with attention masks, requested as
        PyTorch tensors.
    """
    return tokenizer.batch_encode_plus(
        dataset,
        max_length=max_length,
        padding=True,
        truncation=True,
        add_special_tokens=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

In [None]:
# Tokenize the three splits in the same order as before: train, valid, test.
x_train_tokens, x_valid_tokens, x_test_tokens = [
    batch_token(tokenizer, split) for split in (x_train, x_valid, x_test)
]

## DEFINING THE ENCODINGS AND LABELS

In [None]:
class Data(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodes: Mapping from field name to an indexable collection with one
            entry per sample (tensors or plain sequences).
        labels: Sequence of labels, one per sample.
    """

    def __init__(self, encodes, labels):
        self.encodings = encodes
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as a new tensor, copying if it already is one."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) emits a UserWarning on every
            # call; clone().detach() is the recommended way to copy.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors, including ``'labels'``."""
        item = {key: self._to_tensor(val[idx])
                for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

In [None]:
# Wrap each split's encodings and labels in a Data dataset.
train_dataset = Data(encodes=x_train_tokens, labels=y_train)
valid_dataset = Data(encodes=x_valid_tokens, labels=y_valid)
test_dataset = Data(encodes=x_test_tokens, labels=y_test)

## DEFINING THE TRAINING PARAMETERS

In [None]:
# Evaluate and checkpoint once per epoch so that the checkpoint with the
# best validation "f1" can be restored when training finishes.
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

## COMPUTING THE METRICS

In [None]:
def determine_metrics(p):
    """Compute the macro-averaged F1 score for an evaluation pass.

    Args:
        p: A pair that unpacks into ``(predictions, labels)``; predictions
            hold one row of class scores per sample.

    Returns:
        A dict with a single key, ``"f1"``.
    """
    scores, gold = p
    # The highest-scoring class in each row is the predicted label.
    predicted = np.argmax(scores, axis=1).ravel()
    macro_f1 = f1_score(gold.ravel(), predicted, average='macro')
    return {"f1": macro_f1}

In [None]:
# Assemble the trainer: model, training arguments, train/validation data,
# the F1 metric function, and early stopping.
# NOTE(review): args requests 3 epochs with one evaluation per epoch, so a
# patience of 3 looks like it can never trigger - confirm this is intended.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=determine_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

## TRAINING THE MODEL ON THE PROVIDED TRAINING DATASET

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

## MODEL BUILT SUCCESSFULLY

## PREDICTING THE OUTPUT

In [None]:
# Predict with the trainer that was used for training rather than building
# a bare Trainer(model=model). A fresh Trainer would discard the configured
# TrainingArguments and compute_metrics, so no F1 would be reported
# alongside the test-set predictions.
predictions = trainer.predict(test_dataset)

## CLASSIFICATION REPORT 

In [None]:
# Gold labels returned by the prediction run.
true_vals = predictions.label_ids
# Predicted class = index of the highest score in each row.
preds = np.argmax(predictions.predictions, axis=1).reshape(-1)

In [None]:
# Label the report rows with the original category names from the encoder.
class_names = list(le.classes_)
print(classification_report(true_vals, preds, target_names=class_names))