### Adding scripts to the path of the notebook

In [1]:
import os, sys

# Make the project root and its `scripts` folder importable from this notebook.
# The parent of the working directory is treated as the project root, which assumes
# the kernel was started inside the `notebooks` folder — TODO confirm for other setups.
current_dir = os.getcwd()
print(current_dir)

# Project root: one level above the notebook's working directory.
parent_dir = os.path.dirname(current_dir)

# Folder that holds the helper modules imported as `scripts.<module>` further down.
scripts_path = os.path.join(parent_dir, 'scripts')

# Register each location exactly once. The membership check keeps the cell idempotent
# (re-running it no longer stacks duplicate entries on sys.path), and it replaces the
# former extra `sys.path.append(os.path.abspath('..'))`, which resolved to the same
# directory as `parent_dir`. Inserting at index 0 in this order leaves `scripts_path`
# ahead of `parent_dir`, as before.
for extra_path in (parent_dir, scripts_path):
    if extra_path not in sys.path:
        sys.path.insert(0, extra_path)

d:\KifiyaAIM-Course\Week - 5\EthioMart_E-Commerce_NER\notebooks


### Import Statements

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

### Load the CoNLL dataset

In [3]:
from scripts.util import read_conll

In [4]:
# Location of the CoNLL-formatted annotation file, relative to the notebook's working directory.
PATH = "../data/conll.txt"

# Parse the file with the project helper `read_conll`. Judging by the preview below, the
# result is a DataFrame with a `tokens` and a `lables` column holding one list per row
# (presumably one row per sentence/message — TODO confirm against scripts/util.py).
data = read_conll(PATH)
# Preview the first rows as a quick sanity check of the parsed result.
data.head()

Unnamed: 0,tokens,lables
0,"[ቴሌግራምtmemodernshoppingcenter, በአዲስ, ነገረ, ሁሌም,...","[O, O, O, O, O, O, O, O, O, B-Product, O, O, O..."
1,"[ቴሌግራምtmemodernshoppingcenter, በአዲስ, ነገረ, ሁሌም,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[ቴሌግራምtmemodernshoppingcenter, በአዲስ, ነገረ, ሁሌም,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[ቴሌግራምtmemodernshoppingcenter, በአዲስ, ነገረ, ሁሌም,...","[O, O, O, O, O, O, O, O, O, O, O, B-Product, O..."
4,"[ቴሌግራምtmemodernshoppingcenter, በአዲስ, ነገረ, ሁሌም,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


### Prepare the data

- Encode the NER labels

In [5]:
# Tag set used for encoding; a tag's position in this list is its integer id.
unique_labels = [
    'I-PRICE',
    'B-PRICE',
    'B-LOCATION',
    'B-Product',
    'O',
]

# Two lookup tables derived from the list: id -> tag, and its inverse tag -> id.
enc_to_str = dict(enumerate(unique_labels))
str_to_enc = {tag: idx for idx, tag in enc_to_str.items()}

In [10]:
# Replace every row's string tags with their integer ids from `str_to_enc`.
# The column is overwritten in place, so a second run of this cell sees integers rather
# than strings; those are passed through unchanged to keep the cell safe to re-run.
# Any other value (e.g. a tag missing from `unique_labels`) still raises KeyError.
data['lables'] = data['lables'].apply(
    lambda tags: [tag if isinstance(tag, int) else str_to_enc[tag] for tag in tags]
)

- Convert the DataFrame into a Hugging Face `Dataset`

In [6]:
from datasets import Dataset, Features, Sequence, Value

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# define the features/internal structure of the dataset 
feats = Features({
    'tokens': Sequence(Value('string')),
    'lables': Sequence(Value('int32'))
})

# now convert the dataframe into a huggingface dataset  
dataset = Dataset.from_pandas(data[['tokens', 'lables']], features=feats)

- Tokenize and align the labels for each of the models

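    The models are **mBERT** (`bert-base-multilingual-cased`), **bert-tiny-amharic** (`rasyosef/bert-tiny-amharic`), and **DistilBERT** (`distilbert-base-multilingual-cased`).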

In [13]:
from transformers import AutoTokenizer, Trainer, AutoModelForTokenClassification, PreTrainedTokenizer
from scripts.tokenizer import Tokenizer

- Load the tokenizers using the custom `Tokenizer` class

In [14]:
# Wrap the `rasyosef/bert-tiny-amharic` checkpoint in the project's `Tokenizer` helper.
bert_tiny_tokenizer = Tokenizer(model_name='rasyosef/bert-tiny-amharic')
# Presumably loads the pretrained tokenizer for `model_name` and keeps it on the wrapper
# for `tokenize_and_align` to use — TODO confirm against scripts/tokenizer.py.
bert_tiny_tokenizer.load_tokenizer()

In [15]:
# Wrap the multilingual BERT checkpoint (`bert-base-multilingual-cased`) in the project's `Tokenizer` helper.
mbert_tokenizer = Tokenizer(model_name='bert-base-multilingual-cased')
# Presumably loads the pretrained tokenizer for `model_name` and keeps it on the wrapper
# for `tokenize_and_align` to use — TODO confirm against scripts/tokenizer.py.
mbert_tokenizer.load_tokenizer()



In [16]:
# Wrap the multilingual DistilBERT checkpoint (`distilbert-base-multilingual-cased`) in the project's `Tokenizer` helper.
distil_bert_tokenizer = Tokenizer(model_name='distilbert-base-multilingual-cased')
# Presumably loads the pretrained tokenizer for `model_name` and keeps it on the wrapper
# for `tokenize_and_align` to use — TODO confirm against scripts/tokenizer.py.
distil_bert_tokenizer.load_tokenizer()

- Tokenize the dataset

In [17]:
# Run each model's `tokenize_and_align` over the shared dataset in batched mode.
# The generator is consumed in order, so the three map passes run in the same sequence
# as before: bert-tiny, then mBERT, then DistilBERT.
tokenized_bert_tiny, tokenized_mbert, tokenized_distil_bert = (
    dataset.map(model_tokenizer.tokenize_and_align, batched=True)
    for model_tokenizer in (
        bert_tiny_tokenizer,
        mbert_tokenizer,
        distil_bert_tokenizer,
    )
)

Map: 100%|██████████| 2161/2161 [00:00<00:00, 2375.11 examples/s]
Map: 100%|██████████| 2161/2161 [00:00<00:00, 2687.87 examples/s]
Map: 100%|██████████| 2161/2161 [00:00<00:00, 2907.23 examples/s]


- Split the datasets into training and testing sets

In [19]:
# Hold out 10% of each tokenized dataset for evaluation.
# A fixed seed makes the shuffle reproducible across kernel restarts. Since the three
# tokenized datasets are all derived from the same `dataset`, sharing one seed should
# also place the same rows in every model's test split, keeping their scores comparable
# (assumes tokenization preserves row count and order — TODO confirm).
SPLIT_SEED = 42

train_test_bert_tiny = tokenized_bert_tiny.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_test_mbert = tokenized_mbert.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_test_distill_bert = tokenized_distil_bert.train_test_split(test_size=0.1, seed=SPLIT_SEED)