In [3]:
from transformers import pipeline

# Load ALBERT for NER
ner_pipeline = pipeline("ner", model="albert-base-v2", grouped_entities=True)

# Test it out
example = "Apple Inc. was founded by Steve Jobs in California."
print(ner_pipeline(example))


config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity_group': 'LABEL_1', 'score': 0.6795958, 'word': 'apple inc. was founded', 'start': 0, 'end': 22}, {'entity_group': 'LABEL_0', 'score': 0.80376387, 'word': 'by', 'start': 23, 'end': 25}, {'entity_group': 'LABEL_1', 'score': 0.68518716, 'word': 'steve jobs in california.', 'start': 26, 'end': 51}]




In [4]:
from transformers import pipeline

# Load MobileBERT for NER
ner_pipeline = pipeline("ner", model="google/mobilebert-uncased", grouped_entities=True)

# Test it out
example = "Apple Inc. was founded by Steve Jobs in California."
print(ner_pipeline(example))


config.json:   0%|          | 0.00/847 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/147M [00:00<?, ?B/s]

Some weights of MobileBertForTokenClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': 'LABEL_0', 'score': 0.8570779, 'word': 'apple inc. was founded by steve jobs in california', 'start': 0, 'end': 50}, {'entity_group': 'LABEL_1', 'score': 1.0, 'word': '.', 'start': 50, 'end': 51}]


# **Albert fine tuning**

**Custom Datset Generation**

In [19]:
import random
import json

# Define product names
mobiles = [f"SonoiKush Series {i}" for i in range(1, 6)]
laptops = [f"SonoiKush Shell Series {i}" for i in range(1, 4)]
products = mobiles + laptops

# Define action verbs
actions = [
    "bought", "buy", "purchased", "costed", "cost",
    "priced", "price", "sold", "sell", "dead", "died",
    "repaired", "repairing", "repair", "break", "broke",
    "fell", "fallen", "fall"
]


# Define sentence templates
templates = [
    "I {action} a {product} last week.",
    "They have {action} the new {product}.",
    "The {product} was {action} yesterday.",
    "We need to {action} our {product} soon.",
    "Have you {action} the latest {product}?",
    "The company {action} several {product}.",
    "Customers often {action} the {product}.",
    "The {product} needs to be {action}.",
    "I plan to {action} another {product}.",
    "They decided to {action} the {product}."
]

def generate_sentence(template, products, actions):
    action = random.choice(actions)
    product = random.choice(products)
    sentence = template.format(action=action, product=product)
    return sentence, action, product

def tokenize_and_label(sentence, action, product):
    tokens = sentence.split()
    labels = ["O"] * len(tokens)

    # Find indices for product
    product_tokens = product.split()
    action_tokens = action.split()

    # Label Product
    for i in range(len(tokens)):
        if tokens[i:i+len(product_tokens)] == product_tokens:
            labels[i] = "B-Product"
            for j in range(1, len(product_tokens)):
                labels[i + j] = "I-Product"
            break

    # Label Action
    for i in range(len(tokens)):
        if tokens[i:i+len(action_tokens)] == action_tokens:
            labels[i] = "B-Action"
            for j in range(1, len(action_tokens)):
                labels[i + j] = "I-Action"
            break

    return tokens, labels

# Initialize lists
data = []

# Generate 1000 examples
for _ in range(1000):
    template = random.choice(templates)
    sentence, action, product = generate_sentence(template, products, actions)
    tokens, labels = tokenize_and_label(sentence, action, product)
    data.append({
        "tokens": tokens,
        "labels": labels
    })

# Shuffle the data
random.shuffle(data)

# Split into training and testing
train_data = data[:800]
test_data = data[800:]

# Save to JSON files
with open('train.json', 'w') as f:
    for entry in train_data:
        json.dump(entry, f)
        f.write('\n')

with open('test.json', 'w') as f:
    for entry in test_data:
        json.dump(entry, f)
        f.write('\n')

print("Dataset generation complete. 'train.json' and 'test.json' files created.")


Dataset generation complete. 'train.json' and 'test.json' files created.


In [20]:
import json
import random
from transformers import pipeline, AlbertTokenizer, AlbertForSequenceClassification

# Load the test data
def load_json(file_path):
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            data.append(json.loads(line))
    return data

test_data = load_json('test.json')

# Take a sample of 10 examples from the test set
sampled_test_data = random.sample(test_data, 10)

# Extract reviews from the sampled data
test_reviews = [item['tokens'] for item in sampled_test_data] # Changed from 'review' to 'tokens' as the key does not exist in the json file
test_labels = [item['labels'] for item in sampled_test_data] # Changed from 'label' to 'labels' as the key does not exist in the json file

# Display the sampled reviews
for idx, review in enumerate(test_reviews):
    print(f"Sample {idx+1}: {review}")

Sample 1: ['They', 'have', 'sold', 'the', 'new', 'SonoiKush', 'Series', '2.']
Sample 2: ['The', 'company', 'repaired', 'several', 'SonoiKush', 'Series', '5.']
Sample 3: ['I', 'plan', 'to', 'buy', 'another', 'SonoiKush', 'Series', '2.']
Sample 4: ['Customers', 'often', 'broke', 'the', 'SonoiKush', 'Series', '1.']
Sample 5: ['The', 'SonoiKush', 'Shell', 'Series', '3', 'was', 'died', 'yesterday.']
Sample 6: ['The', 'company', 'purchased', 'several', 'SonoiKush', 'Series', '3.']
Sample 7: ['I', 'broke', 'a', 'SonoiKush', 'Series', '5', 'last', 'week.']
Sample 8: ['The', 'SonoiKush', 'Shell', 'Series', '3', 'needs', 'to', 'be', 'purchased.']
Sample 9: ['We', 'need', 'to', 'repairing', 'our', 'SonoiKush', 'Shell', 'Series', '1', 'soon.']
Sample 10: ['The', 'company', 'fall', 'several', 'SonoiKush', 'Shell', 'Series', '3.']


**base model evaluation 1**

In [21]:
# Initialize the pipeline for text classification
model_name = "albert-base-v2"
tokenizer = AlbertTokenizer.from_pretrained(model_name)
model = AlbertForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Adjust num_labels based on your data

# Load the pipeline for text classification
nlp_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Get predictions for the sampled reviews
predictions = nlp_pipeline([" ".join(review) for review in test_reviews]) # Join the tokens back into a string for each review

# Display the predictions along with the true labels
for idx, (review, pred, true_label) in enumerate(zip(test_reviews, predictions, test_labels)):
    print(f"Review {idx+1}: {review}")
    print(f"Predicted: {pred['label']} | True Label: {true_label}\n")

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Review 1: ['They', 'have', 'sold', 'the', 'new', 'SonoiKush', 'Series', '2.']
Predicted: LABEL_1 | True Label: ['O', 'O', 'B-Action', 'O', 'O', 'O', 'O', 'O']

Review 2: ['The', 'company', 'repaired', 'several', 'SonoiKush', 'Series', '5.']
Predicted: LABEL_1 | True Label: ['O', 'O', 'B-Action', 'O', 'O', 'O', 'O']

Review 3: ['I', 'plan', 'to', 'buy', 'another', 'SonoiKush', 'Series', '2.']
Predicted: LABEL_1 | True Label: ['O', 'O', 'O', 'B-Action', 'O', 'O', 'O', 'O']

Review 4: ['Customers', 'often', 'broke', 'the', 'SonoiKush', 'Series', '1.']
Predicted: LABEL_1 | True Label: ['O', 'O', 'B-Action', 'O', 'O', 'O', 'O']

Review 5: ['The', 'SonoiKush', 'Shell', 'Series', '3', 'was', 'died', 'yesterday.']
Predicted: LABEL_1 | True Label: ['O', 'B-Product', 'I-Product', 'I-Product', 'I-Product', 'O', 'B-Action', 'O']

Review 6: ['The', 'company', 'purchased', 'several', 'SonoiKush', 'Series', '3.']
Predicted: LABEL_1 | True Label: ['O', 'O', 'B-Action', 'O', 'O', 'O', 'O']

Review 7: [

**Base model eval 2 on samples from generated data**

In [22]:
from transformers import pipeline

# Load the base ALBERT model for NER
base_ner = pipeline("ner", model="albert-base-v2", tokenizer="albert-base-v2", grouped_entities=True)

# Example sentence
example_sentence = "I bought a SonoiKush Series 3 last week."

# Run the base model
base_results = base_ner(example_sentence)
print("Base Model Results:")
print(base_results)


Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Base Model Results:
[{'entity_group': 'LABEL_1', 'score': 0.5803801, 'word': 'i bought', 'start': 0, 'end': 8}, {'entity_group': 'LABEL_0', 'score': 0.53740066, 'word': 'a', 'start': 9, 'end': 10}, {'entity_group': 'LABEL_1', 'score': 0.6413606, 'word': 'so', 'start': 11, 'end': 13}, {'entity_group': 'LABEL_0', 'score': 0.6197999, 'word': 'noikush', 'start': 13, 'end': 20}, {'entity_group': 'LABEL_1', 'score': 0.57229215, 'word': 'series', 'start': 21, 'end': 27}, {'entity_group': 'LABEL_0', 'score': 0.6045866, 'word': '3 last week.', 'start': 28, 'end': 40}]


In [12]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K 

**Tokenized data**

In [51]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoTokenizer

# Load the dataset
dataset = load_dataset('json', data_files={'train': 'train.json', 'test': 'test.json'})

# Define label list
label_list = ["O", "B-Product", "I-Product", "B-Action", "I-Action"]

# Create a mapping from labels to IDs
label_to_id = {label: idx for idx, label in enumerate(label_list)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label_to_id[label[word_idx]])
            else:
                label_ids.append(label_to_id[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

label_all_tokens = False  # Only label the first token of each word

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)




In [52]:
from transformers import AlbertForTokenClassification

# Initialize the model
model = AlbertForTokenClassification.from_pretrained(
    "albert-base-v2",
    num_labels=len(label_list)
)


Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Freezing model layers except the last 6**

In [53]:
# Freeze all layers
for param in model.albert.parameters():
    param.requires_grad = False

# Unfreeze the last 2 transformer layers
for param in model.albert.encoder.albert_layer_groups[-6:].parameters(): # Access layers through albert_layer_groups
    param.requires_grad = True

In [4]:
!pip install datasets --upgrade



In [5]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [6]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=dcca0fbc1e2af2af3d217f059b92cbbd5d365f5ad18f8eaf3552808f367ed17a
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


**metrics to display while training**

In [54]:
import numpy as np
from evaluate import load

# Load the metric
metric = load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Convert IDs to labels
    true_predictions = [
        [id_to_label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id_to_label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Compute metrics
    results = metric.compute(predictions=true_predictions, references=true_labels)

    # Check if 'overall_precision' exists in results
    if 'overall_precision' in results:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }
    else:
        # Handle the case where 'overall_precision' is not found
        # This could involve returning default values, raising an exception, or investigating further
        return {
            "precision": 0.0,
            "recall": 0.0,
            "f1": 0.0,
            "accuracy": 0.0,
        }

**Training args**

In [55]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    fp16=True,  # Enable mixed precision
    gradient_accumulation_steps=4,  # Accumulate gradients
)



# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)
)

# Fine-tune the model
trainer.train()


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.007102,1.0,1.0,1.0,1.0
2,No log,0.000366,1.0,1.0,1.0,1.0
3,No log,0.000188,1.0,1.0,1.0,1.0
4,No log,0.000143,1.0,1.0,1.0,1.0
5,No log,0.00012,1.0,1.0,1.0,1.0
6,No log,0.000105,1.0,1.0,1.0,1.0
7,No log,9.6e-05,1.0,1.0,1.0,1.0
8,No log,9e-05,1.0,1.0,1.0,1.0
9,No log,8.6e-05,1.0,1.0,1.0,1.0
10,No log,8.5e-05,1.0,1.0,1.0,1.0


TrainOutput(global_step=250, training_loss=0.031442253112792966, metrics={'train_runtime': 43.2764, 'train_samples_per_second': 184.858, 'train_steps_per_second': 5.777, 'total_flos': 5362563605760.0, 'train_loss': 0.031442253112792966, 'epoch': 10.0})

**testing fine tuned model on sample**

In [56]:
# Load the fine-tuned model
from transformers import pipeline
fine_tuned_model = trainer.model
fine_tuned_ner = pipeline("ner", model=fine_tuned_model, tokenizer=tokenizer, grouped_entities=True)

example_sentence1 = "I bought a SonoiKush Series 3 last week."

# Run the fine-tuned model
fine_tuned_results = fine_tuned_ner(example_sentence1)
print("\nFine-Tuned Model Results:")
print(fine_tuned_results)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.



Fine-Tuned Model Results:
[{'entity_group': 'LABEL_0', 'score': 0.9999305, 'word': 'i', 'start': 0, 'end': 1}, {'entity_group': 'LABEL_3', 'score': 0.99988437, 'word': 'bought', 'start': 2, 'end': 8}, {'entity_group': 'LABEL_0', 'score': 0.9998876, 'word': 'a', 'start': 9, 'end': 10}, {'entity_group': 'LABEL_1', 'score': 0.97231996, 'word': 'sono', 'start': 11, 'end': 15}, {'entity_group': 'LABEL_2', 'score': 0.8382233, 'word': 'ikush series 3', 'start': 15, 'end': 29}, {'entity_group': 'LABEL_0', 'score': 0.999057, 'word': 'last week.', 'start': 30, 'end': 40}]




In [57]:
# Load the test data
import json # import the json module

def load_json(file_path):
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            data.append(json.loads(line))
    return data

test_data = load_json('test.json')

**test the model on all 200 test sample**

In [58]:
from transformers import pipeline

# Initialize the fine-tuned NER pipeline, ensure you're using GPU if available
fine_tuned_ner = pipeline("ner", model=fine_tuned_model, tokenizer=tokenizer, grouped_entities=True)


# Extract reviews from the test data
test_reviews = [item['tokens'] for item in test_data]

# Process the reviews in batches (adjust batch size as needed)
batch_size = 32  # You can adjust this depending on your system's memory
fine_tuned_results = []

for i in range(0, len(test_reviews), batch_size):
    batch = test_reviews[i:i + batch_size]
    # Join the tokens back into sentences for the NER pipeline
    batch_sentences = [" ".join(tokens) for tokens in batch]
    fine_tuned_results.extend(fine_tuned_ner(batch_sentences))

# Now `fine_tuned_results` will contain the predictions for the entire test set

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [59]:
# Define your label mapping
label_mapping = {
    'LABEL_0': 'Product',
    'LABEL_1': 'Other',
    'LABEL_2': 'Date',  # Example for a new label if needed
    'LABEL_3': 'Action',  # Add or adjust as necessary
    # Include other label mappings as required
}

# Convert model output to human-readable labels
def map_entities(entities, label_mapping):
    return [{'word': ent['word'], 'entity': label_mapping.get(ent['entity_group'], 'Unknown')} for ent in entities]

# Prepare predictions
predicted_labels = [map_entities(result, label_mapping) for result in fine_tuned_results]


**exporting the predicted output along with test samples and ground truth**

In [60]:
import pandas as pd

# Prepare data for DataFrame
data_for_csv = {
    'Review': test_reviews,
    'True Label': [item['labels'] for item in test_data],
    'Model Prediction': [', '.join([f"{ent['word']} ({ent['entity']})" for ent in pred]) for pred in predicted_labels]
}

# Create DataFrame
df = pd.DataFrame(data_for_csv)

# Save DataFrame to CSV
df.to_csv('test_results_with_predictions4C.csv', index=False)

print("CSV file created successfully!")


CSV file created successfully!
