# Named Entity Recognition Using a Pre-trained BERT Model

In [1]:
import pandas as pd
# Locations of the tab-separated train/test text files parsed further down.
_DATA_ROOT = "/content"
data_directory_train = _DATA_ROOT + "/restauranttrain.txt"
data_directory_test = _DATA_ROOT + "/restauranttest.txt"

#### Reading data from the txt file.

In [2]:
def read_lines_from_data(file):
  """Return every line of the text file at ``file`` as a list of strings.

  Args:
    file: Path of the file to open in text read mode.

  Returns:
    The file's lines in order, each still carrying its line terminator
    (the last line only if the file ends with one).
  """
  with open(file, "r") as handle:
    # Iterating a text file yields the same lines ``readlines()`` would.
    return list(handle)

In [3]:
# Raw line lists for the train and test files, in that order.
train_data_raw, test_data_raw = [
    read_lines_from_data(path)
    for path in (data_directory_train, data_directory_test)
]

#### Pre-processing the data to convert into Pandas DataFrame

In [4]:
def preprocessing_data(filename):
  """Parse a tagged-token file into one record per token.

  Every non-blank line is expected to hold a tag, a TAB character and a word,
  in that order. Blank (or whitespace-only) lines separate sentences; each
  such line advances the sentence counter by one.

  Args:
    filename: Path handed to ``read_lines_from_data``.

  Returns:
    A list of dicts in file order, each with the keys ``"Sentence #"``
    (``"Sentence <n>"``, counting from 1), ``"Word"`` and ``"Tag"``.

  Raises:
    ValueError: If a non-blank line has no TAB-separated second field.
  """
  records = []
  sentence_no = 1
  for line_no, line in enumerate(read_lines_from_data(filename), start=1):
    # A separator may carry stray spaces or a carriage return; comparing
    # against exactly "\n" would send it to the token branch and crash.
    if not line.strip():
      sentence_no += 1
      continue
    # Only strip the terminator here: a leading TAB is significant.
    fields = line.rstrip("\r\n").split("\t")
    if len(fields) < 2:
      raise ValueError(
          f"{filename}: line {line_no} is not '<tag>\\t<word>': {line!r}"
      )
    # Key order matters: it becomes the DataFrame column order downstream.
    records.append({
        "Sentence #": f"Sentence {sentence_no}",
        "Word": fields[1].strip(),
        "Tag": fields[0].strip(),
    })
  return records

In [5]:
# One list of token records per split (train first, then test).
list_dic_train, list_dic_test = map(
    preprocessing_data, (data_directory_train, data_directory_test)
)

In [6]:
# Turn each list of token records into a DataFrame (one row per token).
df_train, df_test = (
    pd.DataFrame(records) for records in (list_dic_train, list_dic_test)
)

In [8]:
# Preview the first 15 token rows of the training frame.
df_train.head(15)

Unnamed: 0,Sentence #,Word,Tag
0,Sentence 1,2,B-Rating
1,Sentence 1,star,I-Rating
2,Sentence 1,restaurants,O
3,Sentence 1,with,O
4,Sentence 1,inside,B-Amenity
5,Sentence 1,dining,I-Amenity
6,Sentence 2,34,O
7,Sentence 3,5,B-Rating
8,Sentence 3,star,I-Rating
9,Sentence 3,resturants,O


#### Installing Simpletransformers Library and Importing required Sk-learn Packages

In [None]:
!pip install simpletransformers

In [44]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

#### Encoding the sentence label as an integer id, used to group words into sentences for the model

In [45]:
# Replace the "Sentence <n>" strings with integer codes, fitting a separate
# encoder on each frame.
for frame in (df_train, df_test):
  frame["Sentence #"] = LabelEncoder().fit_transform(frame["Sentence #"])

#### Renaming the column labels to the names the model expects (`sentence_id`, `words`, `labels`)

In [46]:
# Old column name -> new column name; dict order fixes the output column order.
_ner_columns = {"Sentence #": "sentence_id", "Word": "words", "Tag": "labels"}
training_data = df_train[list(_ner_columns)].rename(columns=_ner_columns)
testing_data = df_test[list(_ner_columns)].rename(columns=_ner_columns)


#### Extracting Unique Labels list to pass to the model

In [47]:
# Distinct tags in order of first appearance in the training frame.
label = training_data["labels"].drop_duplicates().tolist()

#### Importing Model

In [48]:
from simpletransformers.ner import NERModel, NERArgs

#### Defining Hyper-parameters

In [49]:
# Training configuration passed to the NER model constructor.
hyperparameters = NERArgs()
hyperparameters.num_train_epochs = 4
hyperparameters.train_batch_size = 64
hyperparameters.eval_batch_size = 64
hyperparameters.learning_rate = 0.0001
# Allow re-running training without manually clearing the output directory.
hyperparameters.overwrite_output_dir = True
# Fix the RNG seed so the fine-tuning results can be reproduced across runs.
hyperparameters.manual_seed = 42

#### Loading the pretrained BERT-powered NER model

In [50]:
# Token-classification model over the tag set collected from the training data.
model = NERModel(
    model_type="bert",
    model_name="bert-base-cased",
    labels=label,
    args=hyperparameters,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

#### Model Training

In [53]:
# NOTE(review): eval_data and the extra `acc` metric appear to be consulted
# only when evaluate_during_training is enabled, which the hyperparameters
# cell does not set — confirm against the simpletransformers docs.
model.train_model(
    training_data,
    eval_data=testing_data,
    acc=accuracy_score,
)

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/120 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/120 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/120 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/120 [00:00<?, ?it/s]

(480, 0.1251331423680919)

#### Model Testing

In [54]:
# eval_model returns three values; only the first (the metrics dict) is used.
# Named throwaways avoid rebinding `_`, which IPython reserves for the most
# recent cell output.
accuracy_sc, _eval_outputs, _eval_predictions = model.eval_model(testing_data)

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/24 [00:00<?, ?it/s]

#### Evaluation Scores (loss, precision, recall, F1)

In [55]:
# Show the metrics dict from evaluation; despite the variable name, the keys
# in the recorded output are eval_loss, precision, recall and f1_score.
accuracy_sc


{'eval_loss': 0.3722156696021557,
 'precision': 0.758516731986735,
 'recall': 0.7984766740717233,
 'f1_score': 0.7779839208410637}

#### Predicting the sentence

In [56]:
# predict() takes a list of sentences; this list holds a single sample.
sample_sentences = [
    "I love hong kong resturant, tere is beautiful dining space and they serve excelent soup and fish curry"
]
prediction, model_output = model.predict(sample_sentences)

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

## Sample Prediction

In [57]:
# Display the predicted tags: one list per input sentence, holding one
# single-entry {word: tag} dict per whitespace-separated token.
prediction

[[{'I': 'O'},
  {'love': 'O'},
  {'hong': 'B-Restaurant_Name'},
  {'kong': 'I-Restaurant_Name'},
  {'resturant,': 'O'},
  {'tere': 'O'},
  {'is': 'O'},
  {'beautiful': 'B-Amenity'},
  {'dining': 'I-Amenity'},
  {'space': 'I-Amenity'},
  {'and': 'O'},
  {'they': 'O'},
  {'serve': 'O'},
  {'excelent': 'B-Dish'},
  {'soup': 'I-Dish'},
  {'and': 'O'},
  {'fish': 'B-Dish'},
  {'curry': 'I-Dish'}]]

#### Prediction Function

In [59]:
def prediction_fun(model):
  """Prompt for one sentence on stdin and return the model's tags for it.

  Args:
    model: Object whose ``predict`` accepts a list of sentences and returns a
      two-element ``(predictions, raw_outputs)`` pair.

  Returns:
    The first element of that pair, unchanged.
  """
  user_text = input("Please Input your sentence Here::: ")
  # predict() works on batches, so wrap the single sentence in a list.
  tagged, _raw_outputs = model.predict([user_text])
  return tagged


In [None]:
# Interactive demo: asks for a sentence on stdin, then displays its tags.
pred = prediction_fun(model)
pred

In [None]:
# Recursively archive three directories under /content into zip files
# (cache_dir.zip, outputs.zip, runs.zip) so they can be downloaded.
!zip -r /content/cache_dir.zip /content/cache_dir
!zip -r /content/outputs.zip /content/outputs
!zip -r /content/runs.zip /content/runs

In [None]:
from google.colab import files

# Fetch each archive produced by the zip commands; these are the only zip
# files the notebook creates under /content.
for archive_path in (
    "/content/cache_dir.zip",
    "/content/outputs.zip",
    "/content/runs.zip",
):
  files.download(archive_path)