### Adding scripts to the path of the notebook

In [1]:
import os, sys

# Resolve the project layout relative to the notebook's working directory.
current_dir = os.getcwd()
print(current_dir)

# The parent of the working directory is the project root; it has to be
# importable so that `from scripts.<module> import ...` resolves.
parent_dir = os.path.dirname(current_dir)

scripts_path = os.path.join(parent_dir, 'scripts')

# Put both locations at the front of sys.path (scripts first, project root
# second), but only when they are missing: the project root used to be added
# twice, and every re-run of this cell piled up further duplicate entries.
for path_entry in (parent_dir, scripts_path):
    if path_entry not in sys.path:
        sys.path.insert(0, path_entry)

d:\KifiyaAIM-Course\Week - 5\EthioMart_E-Commerce_NER\notebooks


### Import Statements

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

### Load the CoNLL dataset

In [3]:
from scripts.util import read_conll

In [4]:
# Location of the CoNLL-formatted annotation file, relative to the notebook.
PATH = "../data/conll.txt"

# Parse the file with the project helper imported from scripts.util.
data = read_conll(PATH)

# Preview the first five rows (the frame exposes `tokens` and `lables` columns).
data.head(n=5)

Unnamed: 0,tokens,lables
0,"[ቴሌግራምtmemodernshoppingcenter, በአዲስ, ነገረ, ሁሌም,...","[O, O, O, O, O, O, O, O, O, B-Product, O, O, O..."
1,"[ቴሌግራምtmemodernshoppingcenter, በአዲስ, ነገረ, ሁሌም,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[ቴሌግራምtmemodernshoppingcenter, በአዲስ, ነገረ, ሁሌም,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[ቴሌግራምtmemodernshoppingcenter, በአዲስ, ነገረ, ሁሌም,...","[O, O, O, O, O, O, O, O, O, O, O, B-Product, O..."
4,"[ቴሌግራምtmemodernshoppingcenter, በአዲስ, ነገረ, ሁሌም,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


#### Prep the data

- Encode the NER labels

In [5]:
# NER tag inventory; a tag's position in this list is its integer id.
unique_labels = ['I-PRICE', 'B-PRICE', "B-LOCATION", "B-Product", "O"]

# Two-way lookup tables: integer id -> tag string, and tag string -> integer id.
enc_to_str = dict(enumerate(unique_labels))
str_to_enc = {tag: idx for idx, tag in enc_to_str.items()}

In [6]:
# Replace every tag string in the `lables` column with its integer id.
# The column is overwritten in place, so ids that are already integers are
# passed through unchanged: re-running this cell is then a no-op instead of a
# KeyError. Unknown tag strings still fail loudly via the dict lookup.
data['lables'] = data['lables'].apply(
    lambda tags: [tag if isinstance(tag, int) else str_to_enc[tag] for tag in tags]
)

- Convert the data frame into a huggingface dataset

In [7]:
from datasets import Dataset, Features, Sequence, Value

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# define the features/internal structure of the dataset 
feats = Features({
    'tokens': Sequence(Value('string')),
    'lables': Sequence(Value('int32'))
})

# now convert the dataframe into a huggingface dataset  
dataset = Dataset.from_pandas(data[['tokens', 'lables']], features=feats)

- Tokenize and align the labels for each of the models

    The models are **mBERT**, **bert-tiny-amharic**, and **DistilBERT**.

In [9]:
from scripts.tokenizer import Tokenizer

- Load the tokenizers using custom class

In [10]:
# Wrap the bert-tiny-amharic checkpoint in the project's Tokenizer helper
# (imported from scripts.tokenizer).
bert_tiny_tokenizer = Tokenizer(model_name='rasyosef/bert-tiny-amharic')
# NOTE(review): presumably fetches/initialises the underlying tokenizer for
# `model_name`; confirm against scripts/tokenizer.py.
bert_tiny_tokenizer.load_tokenizer()

In [11]:
# Wrap the multilingual BERT (cased) checkpoint in the project's Tokenizer
# helper (imported from scripts.tokenizer).
mbert_tokenizer = Tokenizer(model_name='bert-base-multilingual-cased')
# NOTE(review): presumably fetches/initialises the underlying tokenizer for
# `model_name`; confirm against scripts/tokenizer.py.
mbert_tokenizer.load_tokenizer()



In [12]:
# Wrap the multilingual DistilBERT (cased) checkpoint in the project's
# Tokenizer helper (imported from scripts.tokenizer).
distil_bert_tokenizer = Tokenizer(model_name='distilbert-base-multilingual-cased')
# NOTE(review): presumably fetches/initialises the underlying tokenizer for
# `model_name`; confirm against scripts/tokenizer.py.
distil_bert_tokenizer.load_tokenizer()

- Tokenize the dataset

In [13]:
# Run each model's tokenizer over the shared dataset in batched mode, giving
# one tokenized copy of the data per model.
tokenized_bert_tiny = dataset.map(
    function=bert_tiny_tokenizer.tokenize_and_align,
    batched=True,
)
tokenized_mbert = dataset.map(
    function=mbert_tokenizer.tokenize_and_align,
    batched=True,
)
tokenized_distil_bert = dataset.map(
    function=distil_bert_tokenizer.tokenize_and_align,
    batched=True,
)

Map: 100%|██████████| 2161/2161 [00:00<00:00, 2324.75 examples/s]
Map: 100%|██████████| 2161/2161 [00:00<00:00, 2545.01 examples/s]
Map: 100%|██████████| 2161/2161 [00:00<00:00, 2612.06 examples/s]


- Split the datasets into training and testing sets

In [14]:
# Hold out 10% of the examples for evaluation.
# A fixed seed makes the split reproducible across kernel restarts. Because
# all three tokenized datasets come from the same source rows, sharing one
# seed is also intended to give every model the same held-out examples, so
# their evaluation numbers are comparable.
SPLIT_SEED = 42

train_test_bert_tiny = tokenized_bert_tiny.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_test_mbert = tokenized_mbert.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_test_distill_bert = tokenized_distil_bert.train_test_split(test_size=0.1, seed=SPLIT_SEED)

#### Start Training

In [15]:
from transformers import TrainingArguments

- Set training arguments

In [16]:
# Hyper-parameters and bookkeeping options for the fine-tuning runs.
training_args = TrainingArguments(
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,                   # weight-decay strength
    max_grad_norm=1.0,                   # gradient-clipping threshold
    per_device_train_batch_size=16,      # training batch size
    per_device_eval_batch_size=16,       # evaluation batch size
    use_cpu=True,                        # force training onto the CPU

    # --- evaluation and checkpointing ---
    eval_strategy="epoch",               # evaluate once per epoch
    save_strategy="epoch",               # checkpoint once per epoch, mirroring evaluation
    save_total_limit=1,                  # cap the number of checkpoints kept on disk
    load_best_model_at_end=True,         # restore the best checkpoint when training ends
    metric_for_best_model="eval_loss",   # "best" is judged by evaluation loss
    output_dir='./training_result',      # where checkpoints are written

    # --- logging ---
    logging_strategy="steps",            # log at a fixed step interval
    logging_steps=50,                    # ... every 50 steps
    logging_dir='./logs',                # directory for log files
    report_to="none",                    # no external reporting integrations
)

- Load and finetune the models

In [17]:
from transformers import AutoModelForTokenClassification

In [18]:
# Load the three pretrained checkpoints with a token-classification head
# sized to the tag inventory. The id <-> tag tables are stored in each model's
# config as well, so saved checkpoints and predictions report the real tag
# names rather than generic LABEL_<n> placeholders.
label_config = dict(
    num_labels=len(unique_labels),
    id2label=enc_to_str,
    label2id=str_to_enc,
)

bert_tiny_model = AutoModelForTokenClassification.from_pretrained("rasyosef/bert-tiny-amharic", **label_config)
mbert_model = AutoModelForTokenClassification.from_pretrained("bert-base-multilingual-cased", **label_config)
distil_bert_model = AutoModelForTokenClassification.from_pretrained("distilbert-base-multilingual-cased", **label_config)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForTokenClassification were not initialized from the model checkpoint at rasyosef/bert-tiny-amharic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifie