In [48]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [49]:
# Load the labelled document corpus from a CSV file in the working directory.
# NOTE(review): the "Unnamed: 0" column in the output looks like a row index
# that was saved into the CSV — confirm, and consider index_col=0 if so.
df =  pd.read_csv('company-document-text.csv')
# Bare expression: render the frame as the cell output.
df

Unnamed: 0,text,label,word_count
0,order id 10718 shipping details ship name k...,ShippingOrder,120
1,invoice order id 10707 customer id arout ord...,invoice,66
2,order id 10448 shipping details ship name r...,ShippingOrder,96
3,invoice order id 11068 customer id queen ord...,invoice,68
4,order id 10656 shipping details ship name g...,ShippingOrder,109
...,...,...,...
2671,order id 10326 shipping details ship name b...,ShippingOrder,111
2672,purchase orders order id order date customer n...,purchase Order,39
2673,invoice order id 10460 customer id folko ord...,invoice,59
2674,stock report for 2018-01 category meat poult...,report,46


In [50]:
# Make sure the NLTK stop-word corpus is available locally before reading it.
nltk.download("stopwords")
# Stored as a set because clean_text() tests every token for membership.
stop_words =  set(stopwords.words("english"))
# Single shared stemmer instance, used by clean_text().
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [51]:
def clean_text(text):
    """Normalise one raw document string.

    The text is lower-cased, stripped of punctuation, whitespace-collapsed,
    filtered against the module-level ``stop_words`` set, and each remaining
    token is stemmed with the module-level ``stemmer``.

    Args:
        text: Raw document text. Missing values (``None``, NaN, ``pd.NA``)
            produce an empty string; any other non-string value is
            converted with ``str`` before cleaning.

    Returns:
        str: The stemmed, stop-word-free tokens joined by single spaces
        (an empty string when no tokens remain).
    """
    # Empty CSV cells are read by pandas as float NaN, which has no .lower();
    # handle them explicitly instead of failing inside DataFrame.apply.
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        text = str(text)
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # remove special characters
    text = re.sub(r"\s+", " ", text)  # remove extra whitespace
    text = " ".join([stemmer.stem(word) for word in text.split() if word not in stop_words])
    return text

In [52]:
# Run clean_text on every row of "text" and store the result in a new column,
# leaving the raw text column untouched.
df['Cleaned Text'] = df["text"].apply(clean_text)

In [53]:
# Display the frame to inspect the new "Cleaned Text" column next to the raw text.
df

Unnamed: 0,text,label,word_count,Cleaned Text
0,order id 10718 shipping details ship name k...,ShippingOrder,120,order id 10718 ship detail ship name königlich...
1,invoice order id 10707 customer id arout ord...,invoice,66,invoic order id 10707 custom id arout order da...
2,order id 10448 shipping details ship name r...,ShippingOrder,96,order id 10448 ship detail ship name rancho gr...
3,invoice order id 11068 customer id queen ord...,invoice,68,invoic order id 11068 custom id queen order da...
4,order id 10656 shipping details ship name g...,ShippingOrder,109,order id 10656 ship detail ship name great lak...
...,...,...,...,...
2671,order id 10326 shipping details ship name b...,ShippingOrder,111,order id 10326 ship detail ship name bólido co...
2672,purchase orders order id order date customer n...,purchase Order,39,purchas order order id order date custom name ...
2673,invoice order id 10460 customer id folko ord...,invoice,59,invoic order id 10460 custom id folko order da...
2674,stock report for 2018-01 category meat poult...,report,46,stock report 201801 categori meat poultri id c...


Label Encoding

In [55]:
from sklearn.preprocessing import LabelEncoder
# Map the string class names in "label" to integer ids; the fitted encoder is
# kept so label_encoder.classes_ can be used later to size the model head.
label_encoder = LabelEncoder()
# NOTE(review): this overwrites the original string labels in place, so
# re-running the cell refits the encoder on integers and classes_ no longer
# holds the class names — consider writing to a separate column.
df['label']= label_encoder.fit_transform(df['label'])


from datasets import Dataset

# Convert only the model inputs (cleaned text and encoded label) into a
# Hugging Face Dataset; the other DataFrame columns are left out.
dataset = Dataset.from_pandas(df[["Cleaned Text", "label"]])

In [56]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['Cleaned Text', 'label'],
    num_rows: 2676
})

In [57]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
split_dataset = dataset.train_test_split(test_size=0.2, seed = 42)
# Despite the *_texts names, these are full Dataset splits containing both
# the "Cleaned Text" and "label" columns.
train_texts = split_dataset['train']
test_texts = split_dataset['test']

Tokenization with BERT

In [73]:

from transformers import AutoTokenizer

# Initialize the tokenizer for the pretrained checkpoint; model_name is reused
# below when loading the classification model so the two stay in sync.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization callback handed to Dataset.map
def tokenize_function(examples):
    """Tokenize the "Cleaned Text" field of ``examples`` with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: Mapping that provides the text under the "Cleaned Text" key.

    Returns:
        The tokenizer's output for those texts.
    """
    cleaned_texts = examples["Cleaned Text"]
    encoded = tokenizer(
        cleaned_texts,
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    return encoded

# Apply tokenization to both splits; batched=True hands tokenize_function
# batches of rows rather than one row at a time.
train_dataset = train_texts.map(tokenize_function, batched=True)
test_dataset = test_texts.map(tokenize_function, batched=True)


# Set format for PyTorch: indexing the datasets now returns torch tensors,
# restricted to the three listed columns (the raw "Cleaned Text" string is
# not included in the formatted output).
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


Map:   0%|          | 0/2140 [00:00<?, ? examples/s]

Map:   0%|          | 0/536 [00:00<?, ? examples/s]

In [75]:
# Inspect one formatted training example to check the tensors and padding.
train_dataset[0]

{'label': tensor(2),
 'input_ids': tensor([  101, 16405, 11140,  3022,  2344,  2344,  8909,  2344,  3058,  7661,
          2171,  8746, 11387,  2418,  2692, 20958,  2683,  6285,  2050, 15214,
         21007,  5054,  4031,  4031,  8909,  4031, 24110,  3775,  3775,  3131,
          3976,  2484, 19739, 20486,  2050, 10392,  2050,  1022,  1018,  1019,
          5187,  9300,  2627,  2072,  1019,  3590,  1022,  3931,  1015,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,   

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
# Load the same checkpoint the tokenizer was built from, with the number of
# output labels taken from the classes seen by the fitted label encoder.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))