In [None]:
!pip install simpletransformers

In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from simpletransformers.ner import NERModel, NERArgs

# Read the dataset
data = pd.read_csv("ner_dataset.csv", encoding="latin1")
data = data.fillna(method="ffill")

# Encode the sentence IDs and rename the columns
data["Sentence #"] = LabelEncoder().fit_transform(data["Sentence #"])
data.rename(columns={"Sentence #": "sentence_id", "Word": "words", "Tag": "labels"}, inplace=True)

# Convert labels to uppercase
data["labels"] = data["labels"].str.upper()

# Split the data into training and testing sets
X = data[["sentence_id", "words"]]
Y = data["labels"]
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

# Create the train and test dataframes
train_data = pd.DataFrame({"sentence_id": x_train["sentence_id"], "words": x_train["words"], "labels": y_train})
test_data = pd.DataFrame({"sentence_id": x_test["sentence_id"], "words": x_test["words"], "labels": y_test})

# Get the unique labels
label = data["labels"].unique().tolist()

# Set the NER model training arguments
args = NERArgs()
args.num_train_epochs = 3
args.learning_rate = 1e-5
args.overwrite_output_dir = True
args.train_batch_size = 64
args.eval_batch_size = 64

# Create and train the NER model
model = NERModel('bert', 'bert-base-cased',labels=label,args =args)
model.train_model(train_data, eval_data=test_data, acc=accuracy_score)

# Evaluate the model on the test data
result, model_outputs, preds_list = model.eval_model(test_data)

# Predict NER labels for a sample sentence
prediction, model_output = model.predict(["Apple Inc. is an American multinational technology company headquartered in Cupertino, California. Apple is the world's largest technology company by revenue, with US$394.3 billion in 2022 revenue. As of March 2023, Apple is the world's biggest company by market capitalization. As of June 2022, Apple is the fourth-largest personal computer vendor by unit sales and the second-largest mobile phone manufacturer in the world. It is often considered as one of the Big Five American information technology companies, alongside Alphabet (parent company of Google), Amazon, Meta Platforms, and Microsoft. Apple was founded as Apple Computer Company on April 1, 1976, by Steve Wozniak, Steve Jobs and Ronald Wayne to develop and sell Wozniak's Apple I personal computer. It was incorporated by Jobs and Wozniak as Apple Computer, Inc. in 1977. The company's second computer, the Apple II, became a best seller and one of the first mass-produced microcomputers. Apple went public in 1980 to instant financial success. The company developed computers featuring innovative graphical user interfaces, including the 1984 original Macintosh, announced that year in a critically acclaimed advertisement called \"1984\". By 1985, the high cost of its products, and power struggles between executives, caused problems. Wozniak stepped back from Apple and pursued other ventures, while Jobs resigned and founded NeXT, taking some Apple employees with him."])
print(prediction)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['cl

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/750 [00:00<?, ?it/s]



Running Epoch 1 of 3:   0%|          | 0/750 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/750 [00:00<?, ?it/s]

  return [


  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/731 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[[{'Apple': 'B-ORG'}, {'Inc.': 'I-ORG'}, {'is': 'O'}, {'an': 'O'}, {'American': 'B-GPE'}, {'multinational': 'O'}, {'technology': 'O'}, {'company': 'O'}, {'headquartered': 'O'}, {'in': 'O'}, {'Cupertino,': 'B-GEO'}, {'California.': 'B-GEO'}, {'Apple': 'B-ORG'}, {'is': 'O'}, {'the': 'O'}, {"world's": 'O'}, {'largest': 'O'}, {'technology': 'O'}, {'company': 'O'}, {'by': 'O'}, {'revenue,': 'O'}, {'with': 'O'}, {'US$394.3': 'B-ORG'}, {'billion': 'O'}, {'in': 'O'}, {'2022': 'B-TIM'}, {'revenue.': 'O'}, {'As': 'O'}, {'of': 'O'}, {'March': 'B-TIM'}, {'2023,': 'B-TIM'}, {'Apple': 'B-ORG'}, {'is': 'O'}, {'the': 'O'}, {"world's": 'O'}, {'biggest': 'O'}, {'company': 'O'}, {'by': 'O'}, {'market': 'O'}, {'capitalization.': 'O'}, {'As': 'O'}, {'of': 'O'}, {'June': 'B-TIM'}, {'2022,': 'B-TIM'}, {'Apple': 'B-ORG'}, {'is': 'O'}, {'the': 'O'}, {'fourth-largest': 'O'}, {'personal': 'O'}, {'computer': 'O'}, {'vendor': 'O'}, {'by': 'O'}, {'unit': 'O'}, {'sales': 'O'}, {'and': 'O'}, {'the': 'O'}, {'second-la

In [6]:
result

{'eval_loss': 0.18194544580847882,
 'precision': 0.8193080473331714,
 'recall': 0.7578967642526965,
 'f1_score': 0.7874068268605446}