Un Supervised Model

In [None]:
!pip install simpletransformers

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from simpletransformers.ner import NERModel, NERArgs

# Read the dataset
data = pd.read_csv("ner_dataset.csv", encoding="latin1")
data = data.fillna(method="ffill")

# Encode the sentence IDs and rename the columns
data["Sentence #"] = LabelEncoder().fit_transform(data["Sentence #"])
data.rename(columns={"Sentence #": "sentence_id", "Word": "words", "Tag": "labels"}, inplace=True)

# Convert labels to uppercase
data["labels"] = data["labels"].str.upper()

# Split the data into training and testing sets
X = data[["sentence_id", "words"]]
Y = data["labels"]
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

# Create the train and test dataframes
train_data = pd.DataFrame({"sentence_id": x_train["sentence_id"], "words": x_train["words"], "labels": y_train})
test_data = pd.DataFrame({"sentence_id": x_test["sentence_id"], "words": x_test["words"], "labels": y_test})

# Get the unique labels
label = data["labels"].unique().tolist()

# Set the NER model training arguments
args = NERArgs()
args.num_train_epochs = 3
args.learning_rate = 1e-5
args.overwrite_output_dir = True
args.train_batch_size = 64
args.eval_batch_size = 64

# Create and train the NER model
model = NERModel('bert', 'bert-base-cased',labels=label,args =args)
model.train_model(train_data, eval_data=test_data, acc=accuracy_score)

# Evaluate the model on the test data
result, model_outputs, preds_list = model.eval_model(test_data)

# Predict NER labels for a sample sentence
prediction, model_output = model.predict(['''Apple Inc. is an American multinational technology company headquartered in Cupertino, California.
                                             Apple is the world's largest technology company by revenue, with US$394.3 billion in 2022 revenue. As of March 2023,
                                             Apple is the world's biggest company by market capitalization. As of June 2022, Apple is the fourth-largest personal
                                             computer vendor by unit sales and the second-largest mobile phone manufacturer in the world. It is often considered as
                                             one of the Big Five American information technology companies, alongside Alphabet (parent company of Google), Amazon,
                                             Meta Platforms, and Microsoft. Apple was founded as Apple Computer Company on April 1, 1976, by Steve Wozniak, Steve
                                             Jobs and Ronald Wayne to develop and sell Wozniak's Apple I personal computer. It was incorporated by Jobs and Wozniak
                                             as Apple Computer, Inc. in 1977. The company's second computer, the Apple II, became a best seller and one of the first
                                             mass-produced microcomputers. Apple went public in 1980 to instant financial success. The company developed computers
                                             featuring innovative graphical user interfaces, including the 1984 original Macintosh, announced that year in a critically
                                             acclaimed advertisement called \"1984\". By 1985, the high cost of its products, and power struggles between executives,
                                             caused problems. Wozniak stepped back from Apple and pursued other ventures, while Jobs resigned and founded NeXT, taking
                                             some Apple employees with him.'''])
print(prediction)


In [None]:
result

Suprevised Model

In [None]:
import pandas as pd
import spacy
from spacy.util import minibatch, compounding
from sklearn.preprocessing import LabelEncoder

data = pd.read_csv("ner_dataset.csv", encoding="latin1")
data = data.fillna(method="ffill")

# Encode the sentence IDs and rename the columns
data["Sentence #"] = LabelEncoder().fit_transform(data["Sentence #"])
data.rename(columns={"Sentence #": "sentence_id", "Word": "words", "Tag": "labels"}, inplace=True)

# Convert labels to uppercase
data["labels"] = data["labels"].str.upper()
data

Unnamed: 0,sentence_id,words,POS,labels
0,0,Thousands,NNS,O
1,0,of,IN,O
2,0,demonstrators,NNS,O
3,0,have,VBP,O
4,0,marched,VBN,O
...,...,...,...,...
72701,2555,to,TO,O
72702,2555,prevent,VB,O
72703,2555,fraud,NN,O
72704,2555,.,.,O


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


In [None]:
# Group the words and labels by sentence_id and join them with spaces
grouped_data = data.groupby("sentence_id").agg({"words": " ".join, "labels": " ".join}).reset_index()

# Prepare the training data in spaCy format
train_data = []
for row in grouped_data.itertuples():
    words = row.words.split()
    labels = row.labels.split()
    entities = []
    start = 0
    for word, label in zip(words, labels):
        end = start + len(word)
        entities.append((start, end, label))
        start = end + 1
    train_data.append((row.words, {"entities": entities}))

# print(train_data)
train_data


[('Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .',
  {'entities': [(0, 9, 'O'),
    (10, 12, 'O'),
    (13, 26, 'O'),
    (27, 31, 'O'),
    (32, 39, 'O'),
    (40, 47, 'O'),
    (48, 54, 'B-GEO'),
    (55, 57, 'O'),
    (58, 65, 'O'),
    (66, 69, 'O'),
    (70, 73, 'O'),
    (74, 76, 'O'),
    (77, 81, 'B-GEO'),
    (82, 85, 'O'),
    (86, 92, 'O'),
    (93, 96, 'O'),
    (97, 107, 'O'),
    (108, 110, 'O'),
    (111, 118, 'B-GPE'),
    (119, 125, 'O'),
    (126, 130, 'O'),
    (131, 135, 'O'),
    (136, 143, 'O'),
    (144, 145, 'O')]}),
 ('Iranian officials say they expect to get access to sealed sensitive parts of the plant Wednesday , after an IAEA surveillance system begins functioning .',
  {'entities': [(0, 7, 'B-GPE'),
    (8, 17, 'O'),
    (18, 21, 'O'),
    (22, 26, 'O'),
    (27, 33, 'O'),
    (34, 36, 'O'),
    (37, 40, 'O'),
    (41, 47, 'O'),
    (48, 50, 'O'),
    (51, 5

In [15]:
import spacy
import random
from spacy.training.example import Example

# Define the training data

# Define the function to train the model
def train_spacy(train_data, model=None, output_dir=None, n_iter=10):
    if model is not None:
        nlp = spacy.load(model)  # Load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # Create a blank spaCy model
        print("Created blank 'en' model")

    # Add the entity labels to the ner pipe
    for _, annotations in train_data:
        for ent in annotations.get('entities'):
            nlp.vocab.strings.add(ent[2])

    # Disable unnecessary pipeline components
    disable_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    # Initialize the ner pipeline and disable other pipelines
    with nlp.disable_pipes(*disable_pipes):
        # Create blank entity recognizer and add it to the pipeline
        ner = nlp.add_pipe('ner')

    # Add labels to the entity recognizer
    for _, annotations in train_data:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # Get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    # Only train the ner pipeline
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(train_data)
            losses = {}

            for text, annotations in train_data:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                nlp.update([example], losses=losses, sgd=optimizer)

            print('Iteration:', itn, 'Losses:', losses)

    # Save the trained model to the output directory
    if output_dir is not None:
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

# Train the model
train_spacy(train_data, output_dir='/content/model', n_iter=10)


Created blank 'en' model




Iteration: 0 Losses: {'ner': 8133.904320652947}
Iteration: 1 Losses: {'ner': 5664.938125947069}
Iteration: 2 Losses: {'ner': 4868.747167892601}
Iteration: 3 Losses: {'ner': 4358.016084342008}
Iteration: 4 Losses: {'ner': 4001.218259958674}
Iteration: 5 Losses: {'ner': 3524.1788937006763}
Iteration: 6 Losses: {'ner': 3210.8257414149834}
Iteration: 7 Losses: {'ner': 2989.705889511479}
Iteration: 8 Losses: {'ner': 2830.4658448755386}
Iteration: 9 Losses: {'ner': 2491.644532225456}
Saved model to /content/model


In [17]:
# Load the trained model
model_path = '/content/model'  # Specify the path where the model was saved
nlp = spacy.load(model_path)

# Sample sentence for testing
sentence = '''Apple Inc. is an American multinational technology company headquartered in Cupertino, California.
              Apple is the world's largest technology company by revenue, with US$394.3 billion in 2022 revenue. As of March 2023,
              Apple is the world's biggest company by market capitalization. As of June 2022, Apple is the fourth-largest personal
              computer vendor by unit sales and the second-largest mobile phone manufacturer in the world. It is often considered as
              one of the Big Five American information technology companies, alongside Alphabet (parent company of Google), Amazon,
              Meta Platforms, and Microsoft. Apple was founded as Apple Computer Company on April 1, 1976, by Steve Wozniak, Steve
              Jobs and Ronald Wayne to develop and sell Wozniak's Apple I personal computer. It was incorporated by Jobs and Wozniak
              as Apple Computer, Inc. in 1977. The company's second computer, the Apple II, became a best seller and one of the first
              mass-produced microcomputers. Apple went public in 1980 to instant financial success. The company developed computers
              featuring innovative graphical user interfaces, including the 1984 original Macintosh, announced that year in a critically
              acclaimed advertisement called \"1984\". By 1985, the high cost of its products, and power struggles between executives,
              caused problems. Wozniak stepped back from Apple and pursued other ventures, while Jobs resigned and founded NeXT, taking
              some Apple employees with him.'''

# Process the sentence with the trained model
doc = nlp(sentence)

# Print the entities recognized in the sentence
for ent in doc.ents:
    print(ent.text, ent.label_)

Apple O
Inc. B-PER
is O
an O
American B-GPE
multinational O
technology O
company O
headquartered O
in O
Cupertino B-GEO
, O
California B-GEO
. O
Apple I-GEO
is O
the O
world O
's O
largest O
technology O
company O
by O
revenue O
, O
with O
US$ O
394.3 O
billion O
in O
2022 B-TIM
revenue O
. O
As O
of O
March B-TIM
2023 I-TIM
, O
Apple I-GEO
is O
the O
world O
's O
biggest O
company O
by O
market O
capitalization O
. O
As O
of O
June B-TIM
2022 I-TIM
, O
Apple B-GPE
is O
the O
fourth-largest O
personal O
computer O
vendor O
by O
unit O
sales O
and O
the O
second-largest B-GEO
mobile O
phone O
manufacturer O
in O
the O
world O
. O
It O
is O
often O
considered O
as O
one O
of O
the O
Big B-PER
Five O
American B-GPE
information O
technology O
companies O
, O
alongside O
Alphabet B-PER
( O
parent O
company O
of O
Google B-GEO
) O
, O
Amazon B-GEO
, O
Meta I-PER
Platforms I-PER
, O
and O
Microsoft B-ORG
. O
Apple B-PER
was O
founded O
as O
Apple B-PER
Computer I-PER
Company I-PER
on O
April 