# NAMED ENTITY RECOGNITION:

1. The named entities are pre-defined categories chosen according to the use case such as names of people, organizations, places, codes, time notations, monetary values, etc.

1. NER aims to assign a class to each token (usually a single word) in a sequence. Because of this, NER is also referred to as token classification.

In [2]:
# !pip install simpletransformers

In [3]:
import pandas as pd

# 1. Read the assignment data
# We set skip_blank_lines=False to detect the empty rows that separate sentences
df = pd.read_csv("assignement_data.csv", delimiter=";", encoding="latin1", skip_blank_lines=False)

# 2. Generate 'Sentence #' column
# The file uses empty rows to separate sentences. We iterate through to assign IDs.
sentence_ids = []
current_sentence = 1

for i, row in df.iterrows():
    # Check if the row is a sentence break (empty data)
    if pd.isna(row['data']):
        current_sentence += 1
        sentence_ids.append(None)  # Mark separator row for removal
    else:
        sentence_ids.append(f"Sentence: {current_sentence}")

df['Sentence #'] = sentence_ids

# 3. Clean up the DataFrame
# Drop the separator rows (where data was NaN)
df = df.dropna(subset=['data'])

# Rename columns to match standard NER dataset conventions
# 'data' -> 'Word', 'label' -> 'Tag'
df = df.rename(columns={"data": "Word", "label": "Tag"})

# 4. Add a dummy POS column if your legacy code requires it
# (BERT models typically don't strictly need POS tags for training)
df['POS'] = 'UNKN'

# 5. Format for Simple Transformers (or similar libraries)
# Most libraries expect: sentence_id, words, labels
data = df[['Sentence #', 'Word', 'Tag']].copy()
data.columns = ['sentence_id', 'words', 'labels']


In [4]:
data.head(30)

Unnamed: 0,sentence_id,words,labels
0,Sentence: 1,EU,B-ORG
1,Sentence: 1,rejects,O
2,Sentence: 1,German,B-MISC
3,Sentence: 1,call,O
4,Sentence: 1,to,O
5,Sentence: 1,boycott,O
6,Sentence: 1,British,B-MISC
7,Sentence: 1,lamb,O
8,Sentence: 1,.,O
10,Sentence: 2,Peter,B-PER


In [5]:
data =data.fillna(method ="ffill")

  data =data.fillna(method ="ffill")


In [6]:
data.head(30)

Unnamed: 0,sentence_id,words,labels
0,Sentence: 1,EU,B-ORG
1,Sentence: 1,rejects,O
2,Sentence: 1,German,B-MISC
3,Sentence: 1,call,O
4,Sentence: 1,to,O
5,Sentence: 1,boycott,O
6,Sentence: 1,British,B-MISC
7,Sentence: 1,lamb,O
8,Sentence: 1,.,O
10,Sentence: 2,Peter,B-PER


In [7]:
data.keys()

Index(['sentence_id', 'words', 'labels'], dtype='object')

In [8]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [9]:
data.head(30)

Unnamed: 0,sentence_id,words,labels
0,Sentence: 1,EU,B-ORG
1,Sentence: 1,rejects,O
2,Sentence: 1,German,B-MISC
3,Sentence: 1,call,O
4,Sentence: 1,to,O
5,Sentence: 1,boycott,O
6,Sentence: 1,British,B-MISC
7,Sentence: 1,lamb,O
8,Sentence: 1,.,O
10,Sentence: 2,Peter,B-PER


In [10]:
data["labels"] = data["labels"].str.upper()

In [11]:
X= data[["sentence_id","words"]]
Y =data["labels"]

In [12]:
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size =0.2)

In [13]:
#building up train data and test data
train_data = pd.DataFrame({"sentence_id":x_train["sentence_id"],"words":x_train["words"],"labels":y_train})
test_data = pd.DataFrame({"sentence_id":x_test["sentence_id"],"words":x_test["words"],"labels":y_test})

In [14]:
train_data

Unnamed: 0,sentence_id,words,labels
18678,Sentence: 1940,Wednesday,O
170608,Sentence: 17669,28/08/1996,O
52660,Sentence: 5289,(,O
63783,Sentence: 6623,were,O
28122,Sentence: 2786,2,O
...,...,...,...
132742,Sentence: 13931,remembering,O
126456,Sentence: 13313,2,O
76966,Sentence: 8192,home,O
147182,Sentence: 15308,citing,O


# Model Training


In [15]:
from simpletransformers.ner import NERModel,NERArgs

I0000 00:00:1765009608.253204    5261 cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [16]:
label = data["labels"].unique().tolist()
label

['B-ORG', 'O', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC']

In [17]:
args = NERArgs()
args.num_train_epochs = 1
args.learning_rate = 1e-4
args.overwrite_output_dir =True
args.train_batch_size = 32
args.eval_batch_size = 32


In [18]:
model = NERModel('bert', 'bert-base-cased',labels=label,args =args, use_cuda=True)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
model.train_model(train_data)

  0%|          | 0/21 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 1:   0%|          | 0/679 [00:00<?, ?it/s]

  with amp.autocast():


(679, 0.20530332513811925)

In [20]:
result, model_outputs, preds_list = model.eval_model(test_data)

  0%|          | 0/21 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/493 [00:00<?, ?it/s]

  with amp.autocast():


In [21]:
result

{'eval_loss': 0.17952571031883072,
 'precision': 0.8384917517674784,
 'recall': 0.8138151875571821,
 'f1_score': 0.825969202197632}

In [22]:
prediction, model_output = model.predict(["What is the new name of Bangalore"])

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


In [23]:
prediction

[[{'What': 'O'},
  {'is': 'O'},
  {'the': 'O'},
  {'new': 'O'},
  {'name': 'O'},
  {'of': 'O'},
  {'Bangalore': 'B-LOC'}]]

In [24]:
from sklearn.metrics import classification_report

# Get predictions on test data
result, model_outputs, predictions = model.eval_model(test_data)

# Flatten the predictions and true labels
y_true = test_data['labels'].tolist()
y_pred = [pred for sublist in predictions for pred in sublist]

# Generate classification report
print(classification_report(y_true, y_pred))    

  0%|          | 0/21 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/493 [00:00<?, ?it/s]

  with amp.autocast():


              precision    recall  f1-score   support

       B-LOC       0.04      0.04      0.04      1450
      B-MISC       0.02      0.02      0.02       724
       B-ORG       0.04      0.04      0.04      1250
       B-PER       0.03      0.04      0.03      1297
       I-LOC       0.01      0.00      0.00       247
      I-MISC       0.00      0.00      0.00       220
       I-ORG       0.02      0.01      0.02       728
       I-PER       0.02      0.01      0.01       900
           O       0.83      0.83      0.83     32628

    accuracy                           0.69     39444
   macro avg       0.11      0.11      0.11     39444
weighted avg       0.69      0.69      0.69     39444

