In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
df =  pd.read_csv('/company-document-text.csv')
df

Unnamed: 0,text,label,word_count
0,order id 10718 shipping details ship name k...,ShippingOrder,120
1,invoice order id 10707 customer id arout ord...,invoice,66
2,order id 10448 shipping details ship name r...,ShippingOrder,96
3,invoice order id 11068 customer id queen ord...,invoice,68
4,order id 10656 shipping details ship name g...,ShippingOrder,109
...,...,...,...
2671,order id 10326 shipping details ship name b...,ShippingOrder,111
2672,purchase orders order id order date customer n...,purchase Order,39
2673,invoice order id 10460 customer id folko ord...,invoice,59
2674,stock report for 2018-01 category meat poult...,report,46


In [3]:
nltk.download("stopwords")
stop_words =  set(stopwords.words("english"))
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)#remove special charactrers
    text = re.sub(r"\s+", " ",text)#remove extra whitespace
    text = " ".join([stemmer.stem(word) for word in text.split() if word not in stop_words])
    return text

In [5]:
df['Cleaned Text'] = df["text"].apply(clean_text)

In [6]:
df

Unnamed: 0,text,label,word_count,Cleaned Text
0,order id 10718 shipping details ship name k...,ShippingOrder,120,order id 10718 ship detail ship name königlich...
1,invoice order id 10707 customer id arout ord...,invoice,66,invoic order id 10707 custom id arout order da...
2,order id 10448 shipping details ship name r...,ShippingOrder,96,order id 10448 ship detail ship name rancho gr...
3,invoice order id 11068 customer id queen ord...,invoice,68,invoic order id 11068 custom id queen order da...
4,order id 10656 shipping details ship name g...,ShippingOrder,109,order id 10656 ship detail ship name great lak...
...,...,...,...,...
2671,order id 10326 shipping details ship name b...,ShippingOrder,111,order id 10326 ship detail ship name bólido co...
2672,purchase orders order id order date customer n...,purchase Order,39,purchas order order id order date custom name ...
2673,invoice order id 10460 customer id folko ord...,invoice,59,invoic order id 10460 custom id folko order da...
2674,stock report for 2018-01 category meat poult...,report,46,stock report 201801 categori meat poultri id c...


In [8]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

LABEL ENCODING

In [9]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['label']= label_encoder.fit_transform(df['label'])


from datasets import Dataset

dataset = Dataset.from_pandas(df[["Cleaned Text", "label"]])

In [10]:
dataset

Dataset({
    features: ['Cleaned Text', 'label'],
    num_rows: 2676
})

In [11]:
split_dataset = dataset.train_test_split(test_size=0.2, seed = 42)
train_texts = split_dataset['train']
test_texts = split_dataset['test']

Tokenization With Bert

In [12]:
from transformers import AutoTokenizer

# Initialize tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define tokenization function
def tokenize_function(examples):
    return tokenizer(examples["Cleaned Text"], padding="max_length", truncation=True, max_length=128)

# Apply tokenization
train_dataset = train_texts.map(tokenize_function, batched=True)
test_dataset = test_texts.map(tokenize_function, batched=True)


# Set format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/2140 [00:00<?, ? examples/s]

Map:   0%|          | 0/536 [00:00<?, ? examples/s]

In [13]:
train_dataset[0]

{'label': tensor(2),
 'input_ids': tensor([  101, 16405, 11140,  3022,  2344,  2344,  8909,  2344,  3058,  7661,
          2171,  8746, 11387,  2418,  2692, 20958,  2683,  6285,  2050, 15214,
         21007,  5054,  4031,  4031,  8909,  4031, 24110,  3775,  3775,  3131,
          3976,  2484, 19739, 20486,  2050, 10392,  2050,  1022,  1018,  1019,
          5187,  9300,  2627,  2072,  1019,  3590,  1022,  3931,  1015,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,   

In [15]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load pre-trained BERT model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

# Define metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.005002,1.0,1.0,1.0,1.0
2,No log,0.002418,1.0,1.0,1.0,1.0
3,No log,0.001998,1.0,1.0,1.0,1.0


TrainOutput(global_step=402, training_loss=0.07239803390123357, metrics={'train_runtime': 255.5829, 'train_samples_per_second': 25.119, 'train_steps_per_second': 1.573, 'total_flos': 422300827054080.0, 'train_loss': 0.07239803390123357, 'epoch': 3.0})

In [16]:
from google.colab import drive
import pickle


# Save the model and tokenizer
model_save_path = "/content/drive/My Drive/bert_company_model"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

('/content/drive/My Drive/bert_company_model/tokenizer_config.json',
 '/content/drive/My Drive/bert_company_model/special_tokens_map.json',
 '/content/drive/My Drive/bert_company_model/vocab.txt',
 '/content/drive/My Drive/bert_company_model/added_tokens.json',
 '/content/drive/My Drive/bert_company_model/tokenizer.json')

In [17]:
# Save the label encoder
label_encoder_path = "/content/drive/My Drive/label_encoder.pkl"
pickle.dump(label_encoder, open(label_encoder_path, 'wb'))

print("Model, tokenizer, and label encoder saved to Google Drive.")

Model, tokenizer, and label encoder saved to Google Drive.


In [18]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np

# Load fine-tuned model and tokenizer
model_name = "/content/drive/My Drive/bert_company_model"  # Update with your saved model path if necessary
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


# Function for prediction
def predict(text):
    # Clean the input text (if you have a cleaning function)
    text = clean_text(text)

    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)

    # Make prediction
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()

    # Convert scalar to array-like and return inverse transform
    return label_encoder.inverse_transform([predicted_class])[0]  # Return the predicted class label

# Example usage
input_text = "invoice order id  10707 customer id  arout order date  2017-10-16 customer details  contact name  thomas hardy address  120 hanover sq. city  london postal code  wa1 1dp country  uk phone   171  555-7788 fax   171  555-6750 product details  product id product name quantity unit price 55 pâté chinois 21 24 0 57 ravioli angelo 40 19 5 70 outback lager 28 15 0 totalprice 1704 0 page 1"
print(predict(input_text))

1


In [19]:
# Example usage
input_text = "order id  10345 shipping details  ship name  quick-stop ship address  taucherstraße 10 ship city  cunewalde ship region  western europe ship postal code  1307 ship country  germany customer details  customer id  quick customer name  quick-stop employee details  employee name  andrew fuller shipper details  shipper id  2 shipper name  united package order details  order date  2016-11-04 shipped date  2016-11-11 products  -------------------------------------------------------------------------------------------------- product  northwoods cranberry sauce quantity  70 unit price  32 0 total  2240 0 -------------------------------------------------------------------------------------------------- product  teatime chocolate biscuits quantity  80 unit price  7 3 total  584 0 -------------------------------------------------------------------------------------------------- product  singaporean hokkien fried mee quantity  9 unit price  11 2 total  100 8 total price  total price  2924 8"
print(predict(input_text))

0


In [20]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
