<a href="https://colab.research.google.com/github/Tesfahun03/Amharic-NLP-for-E-commerce-Integration/blob/main/fine_tune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install transformers datasets seqeval

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m27.6 MB/s[0m eta [3

In [4]:
from google.colab import drive
import os


In [5]:
# Mount Google Drive
drive.mount('/content/drive')


Mounted at /content/drive


In [6]:
working_dir = '/content/drive/My Drive/fine_tune_conll'

In [7]:
os.makedirs(working_dir, exist_ok=True)

In [8]:
def read_conll(file_path):
    sentences = []
    tags = []
    with open(file_path, 'r') as f:
        sentence = []
        tag_sequence = []
        for line in f:
            line = line.strip()
            if not line:  # Sentence boundary
                if sentence:
                    sentences.append(sentence)
                    tags.append(tag_sequence)
                    sentence = []
                    tag_sequence = []
            else:
                word, tag = line.split()
                sentence.append(word)
                tag_sequence.append(tag)
        # Add the last sentence if file doesn't end with a blank line
        if sentence:
            sentences.append(sentence)
            tags.append(tag_sequence)
    return sentences, tags

In [9]:
conll_file_path = os.path.join(working_dir, 'labeled_dataset_final.conll')

In [10]:

sentences, tags = read_conll(conll_file_path)

In [11]:
print(f"Loaded {len(sentences)} sentences.")


Loaded 15777 sentences.


In [12]:
#Preprocess Data
from transformers import AutoTokenizer

In [13]:
model_name = 'bert-base-cased'  # Use a cased model for tasks like NER
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [14]:
label_list = sorted(set(tag for seq in tags for tag in seq))
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

In [15]:
def tokenize_and_align_labels(sentence, tag_sequence):
    tokenized_inputs = tokenizer(
        sentence,
        truncation=True,
        padding="max_length",  # Ensure consistent length
        max_length=128,       # Adjust as per your model/input size
        is_split_into_words=True,
    )
    word_ids = tokenized_inputs.word_ids()  # Map tokens to word IDs
    aligned_labels = []
    previous_word_idx = None

    for word_idx in word_ids:
        if word_idx is None:  # Special tokens
            aligned_labels.append(-100)
        elif word_idx != previous_word_idx:  # New word
            aligned_labels.append(label_to_id[tag_sequence[word_idx]])
        else:  # Subword token
            aligned_labels.append(-100)
        previous_word_idx = word_idx

    tokenized_inputs['labels'] = aligned_labels
    return tokenized_inputs

In [16]:
# Prepare dataset
from datasets import Dataset


In [17]:
data = []
for sentence, tag_sequence in zip(sentences, tags):
    data.append(tokenize_and_align_labels(sentence, tag_sequence))


In [18]:

dataset = Dataset.from_dict({key: [d[key] for d in data] for key in data[0]})

In [19]:
#Fine-Tune the Model
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer


In [20]:
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list)
)


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
training_args = TrainingArguments(
    output_dir=os.path.join(working_dir, 'results'),
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    save_strategy="epoch", # Add this line to match evaluation strategy
    fp16=True,
)



In [22]:
# Define the Trainer
!pip install evaluate
from evaluate import load



Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [23]:

metric = load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)
    true_labels = [[id_to_label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id_to_label[p] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return metric.compute(predictions=true_predictions, references=true_labels)

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,  #
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [25]:
# Train
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Loc,Price,Product,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,0.1186,0.010988,"{'precision': 0.8822826808228268, 'recall': 0.8869913275517012, 'f1': 0.8846307385229542, 'number': 7495}","{'precision': 0.9930638129211257, 'recall': 0.9720659553831231, 'f1': 0.9824527007156161, 'number': 5155}","{'precision': 0.9821475625823451, 'recall': 0.9813071809385902, 'f1': 0.9817271919138709, 'number': 15193}",0.957026,0.954208,0.955615,0.996306
2,0.0135,0.004768,"{'precision': 0.9537602976348658, 'recall': 0.9577051367578385, 'f1': 0.9557286465614807, 'number': 7495}","{'precision': 0.9949504758205476, 'recall': 0.9937924345295829, 'f1': 0.9943711180124222, 'number': 5155}","{'precision': 0.9955283750904189, 'recall': 0.9964457315869151, 'f1': 0.9959868421052632, 'number': 15193}",0.984147,0.985526,0.984836,0.998662
3,0.007,0.00247,"{'precision': 0.9816, 'recall': 0.9822548365577052, 'f1': 0.9819273091030344, 'number': 7495}","{'precision': 0.9970873786407767, 'recall': 0.9961202715809894, 'f1': 0.9966035904900533, 'number': 5155}","{'precision': 0.9987494240768775, 'recall': 0.9987494240768775, 'f1': 0.9987494240768775, 'number': 15193}",0.993823,0.993823,0.993823,0.999372


Trainer is attempting to log a value of "{'precision': 0.8822826808228268, 'recall': 0.8869913275517012, 'f1': 0.8846307385229542, 'number': 7495}" of type <class 'dict'> for key "eval/LOC" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.9930638129211257, 'recall': 0.9720659553831231, 'f1': 0.9824527007156161, 'number': 5155}" of type <class 'dict'> for key "eval/PRICE" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.9821475625823451, 'recall': 0.9813071809385902, 'f1': 0.9817271919138709, 'number': 15193}" of type <class 'dict'> for key "eval/PRODUCT" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.9537602976348658, 'recall': 0.957705136

TrainOutput(global_step=2961, training_loss=0.029686303526837453, metrics={'train_runtime': 633.0948, 'train_samples_per_second': 74.761, 'train_steps_per_second': 4.677, 'total_flos': 3091998964817664.0, 'train_loss': 0.029686303526837453, 'epoch': 3.0})

In [26]:
# Step 5: Save the Model
model.save_pretrained(os.path.join(working_dir, 'fine_tuned_model'))
tokenizer.save_pretrained(os.path.join(working_dir, 'fine_tuned_model'))

print(f"Model saved to {working_dir}/fine_tuned_model")

Model saved to /content/drive/My Drive/fine_tune_conll/fine_tuned_model


In [27]:
!pip install shap lime matplotlib pandas


Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=f8dd8deb3a9237966c226aea489200848a7f22adec4b151992be148e94d1782f
  Stored in directory: /root/.cache/pip/wheels/85/fa/a3/9c2d44c9f3cd77cf4e533b58900b2bf4487f2a17e8ec212a3d
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [28]:
import shap
import lime
from lime.lime_text import LimeTextExplainer
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import pipeline

In [29]:
# Load the Fine-Tuned Model
from transformers import AutoModelForTokenClassification, AutoTokenizer


In [30]:
model_path = os.path.join(working_dir, 'fine_tuned_model')
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)


In [31]:
# Load model into a Hugging Face pipeline for NER
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")


Device set to use cuda:0


In [32]:
def shap_interpret(text):
    explainer = shap.Explainer(ner_pipeline)  # Initialize SHAP explainer
    shap_values = explainer([text])  # Get SHAP values for the text
    shap.plots.text(shap_values[0])  # Visualize SHAP explanation


In [33]:
# Step 4: Define LIME Interpretability
def lime_interpret(text):
    explainer = LimeTextExplainer(class_names=label_list)  # Initialize LIME explainer
    explanation = explainer.explain_instance(
        text,
        lambda x: np.array([e['score'] for e in ner_pipeline(x)]),  # Model prediction probabilities
        num_features=10
    )
    explanation.show_in_notebook()
