# NAMED ENTITY RECOGNITION:

1. The named entities are pre-defined categories chosen according to the use case such as names of people, organizations, places, codes, time notations, monetary values, etc.

1. NER aims to assign a class to each token (usually a single word) in a sequence. Because of this, NER is also referred to as token classification.

In [None]:
!pip install simpletransformers



In [None]:
from google.colab import drive
drive.mount('/content/drive')
file_path = '/content/drive/My Drive/ML-Project/PII-DATA'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
data = pd.read_csv("/content/drive/My Drive/ML-Project/PII-DATA/cleandata.csv")


In [None]:
data

Unnamed: 0,document,token,label
0,7,Design,0
1,7,Thinking,0
2,7,for,0
3,7,innovation,0
4,7,reflexion,0
...,...,...,...
5807713,29433,overall,0
5807714,29433,success,0
5807715,29433,of,0
5807716,29433,the,0


In [None]:
# label_map = {
#     'O': 0,
#     'B-NAME_STUDENT': 1,
#     'I-NAME_STUDENT': 2,
#     'B-URL_PERSONAL': 3,
#     'B-EMAIL': 4,
#     'B-ID_NUM': 5,
#     'I-URL_PERSONAL': 6,
#     'B-USERNAME': 7,
#     'B-PHONE_NUM': 8,
#     'I-PHONE_NUM': 9,
#     'B-STREET_ADDRESS': 10,
#     'I-STREET_ADDRESS': 11,
#     'I-ID_NUM': 12
# }

label_map = {
    0: 'O',
    1: 'B-NAME_STUDENT',
    2: 'I-NAME_STUDENT',
    3: 'B-URL_PERSONAL',
    4: 'B-EMAIL',
    5: 'B-ID_NUM',
    6: 'I-URL_PERSONAL',
    7: 'B-USERNAME',
    8: 'B-PHONE_NUM',
    9: 'I-PHONE_NUM',
    10: 'B-STREET_ADDRESS',
    11: 'I-STREET_ADDRESS',
    12: 'I-ID_NUM'
}


In [None]:
data['label'] = data['label'].map(label_map)

In [None]:
data

Unnamed: 0,document,token,label
0,7,Design,O
1,7,Thinking,O
2,7,for,O
3,7,innovation,O
4,7,reflexion,O
...,...,...,...
5807713,29433,overall,O
5807714,29433,success,O
5807715,29433,of,O
5807716,29433,the,O


In [None]:
data['label'].value_counts()

O                   5765152
B-NAME_STUDENT        12469
I-STREET_ADDRESS       8593
I-NAME_STUDENT         6763
B-EMAIL                3833
B-STREET_ADDRESS       3545
I-PHONE_NUM            3404
B-PHONE_NUM            2425
B-URL_PERSONAL          730
B-USERNAME              724
B-ID_NUM                 78
I-URL_PERSONAL            1
I-ID_NUM                  1
Name: label, dtype: int64

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:


# Rename columns
data = data.rename(columns={'document': 'sentence_id','token':'words','label':'labels'})

data


Unnamed: 0,sentence_id,words,labels
0,7,Design,O
1,7,Thinking,O
2,7,for,O
3,7,innovation,O
4,7,reflexion,O
...,...,...,...
5807713,29433,overall,O
5807714,29433,success,O
5807715,29433,of,O
5807716,29433,the,O


In [None]:


# Assuming 'data' is your DataFrame and 'text_column' is the name of the column you're tokenizing.
# Convert the column to strings, replace NaN with a placeholder word (e.g., 'NAN') if necessary
data['words'] = data['words'].fillna('NAN').astype(str)

# Now, you can pass this column to the tokenizer
# tokenizer.encode_plus or any relevant tokenizer function calls go here


In [None]:
X= data[["sentence_id","words"]]
Y =data["labels"]

In [None]:
# x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size =0.2)



# Assuming X and Y are your features and labels respectively
num_data = len(X)
split_index = int(num_data * 0.8)

x_train = X[:split_index]
x_test = X[split_index:]
y_train = Y[:split_index]
y_test = Y[split_index:]


In [None]:
#building up train data and test data
train_data = pd.DataFrame({"sentence_id":x_train["sentence_id"],"words":x_train["words"],"labels":y_train})
test_data = pd.DataFrame({"sentence_id":x_test["sentence_id"],"words":x_test["words"],"labels":y_test})

In [None]:
train_data

Unnamed: 0,sentence_id,words,labels
0,7,Design,O
1,7,Thinking,O
2,7,for,O
3,7,innovation,O
4,7,reflexion,O
...,...,...,...
4646169,25683,software,O
4646170,25683,that,O
4646171,25683,could,O
4646172,25683,seamlessly,O


In [None]:
test_data

Unnamed: 0,sentence_id,words,labels
4646174,25683,with,O
4646175,25683,our,O
4646176,25683,existing,O
4646177,25683,systems,O
4646178,25683,After,O
...,...,...,...
5807713,29433,overall,O
5807714,29433,success,O
5807715,29433,of,O
5807716,29433,the,O


In [None]:
test_data['labels'].value_counts()

O                   1127913
B-NAME_STUDENT         9345
I-STREET_ADDRESS       7280
I-NAME_STUDENT         4777
B-EMAIL                3209
B-STREET_ADDRESS       3004
I-PHONE_NUM            2848
B-PHONE_NUM            2041
B-USERNAME              606
B-URL_PERSONAL          521
Name: labels, dtype: int64

# Model Training


In [None]:
from simpletransformers.ner import NERModel,NERArgs

In [None]:
label = data["labels"].unique().tolist()
label

['O',
 'B-NAME_STUDENT',
 'I-NAME_STUDENT',
 'B-URL_PERSONAL',
 'B-EMAIL',
 'B-ID_NUM',
 'I-URL_PERSONAL',
 'B-USERNAME',
 'B-PHONE_NUM',
 'I-PHONE_NUM',
 'B-STREET_ADDRESS',
 'I-STREET_ADDRESS',
 'I-ID_NUM']

In [None]:
args = NERArgs()
args.num_train_epochs = 1
args.learning_rate = 1e-4
args.overwrite_output_dir =True
args.train_batch_size = 32
args.eval_batch_size = 32


In [None]:
model = NERModel('bert', 'bert-base-cased',labels=label,args =args)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.train_model(train_data,eval_data = test_data,acc=accuracy_score)

  return [


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 1 of 1:   0%|          | 0/235 [00:00<?, ?it/s]



(235, 0.071021257729599)

In [None]:
result, model_outputs, preds_list = model.eval_model(test_data)

  return [


  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/118 [00:00<?, ?it/s]

In [None]:
result

{'eval_loss': 0.007656405246155951,
 'precision': 0.8655233589591957,
 'recall': 0.8667535236290418,
 'f1_score': 0.8661380044975737}

In [None]:
prediction, model_output = model.predict(["Steve"])

In [None]:
prediction

[[{'Mary': 'O'}]]