# Installing dependencies and importing libray

In [None]:
! pip install simpletransformers

In [2]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score

from simpletransformers.ner import NERModel, NERArgs



In [3]:
# Reading the csv file
df = pd.read_csv("/content/drive/MyDrive/NER_data/total_ner_data.csv")

In [4]:
x_train, x_test, y_train, y_test = train_test_split(df[['SentenceID','Token']], df['Label'], test_size = 0.2, random_state=93)

In [5]:
train_data = pd.DataFrame({'SentenceID':x_train["SentenceID"],"Token":x_train['Token'], "Label":y_train})
test_data = pd.DataFrame({'SentenceID':x_test["SentenceID"],"Token":x_test['Token'], "Label":y_test})

In [6]:
train_data.columns = ['sentence_id', 'words', 'labels']
test_data.columns = ['sentence_id', 'words', 'labels']

In [7]:
train_data

Unnamed: 0,sentence_id,words,labels
44667,1715,को,O
97360,5209,त्यस,O
3180,634,।,O
119638,6639,को,O
139222,7382,राजनीति,O
...,...,...,...
121941,6727,पनि,O
13464,6586,ना,O
175492,8771,लिमिटेड,I-ORG
79195,4062,छैन,O


In [8]:
test_data

Unnamed: 0,sentence_id,words,labels
185118,9134,सद्भाव,O
151144,7842,।,O
35062,1083,कहिलै,O
54877,2359,ले,O
155739,8027,स्थापना,O
...,...,...,...
149054,7766,गर्दै,O
65874,3127,लाग्छ,O
117751,6574,का,O
88882,4605,कुरा,O


In [9]:
print(train_data['labels'].value_counts())
print("------------------")
print(test_data['labels'].value_counts())

O           147107
B-PER         3342
B-ORG         2546
B-LOC         2522
I-PER         2053
I-ORG         1953
B-DATE         644
I-DATE         410
I-LOC          243
PHONE          109
USERNAME       100
I-EVENT         78
B-EVENT         31
Name: labels, dtype: int64
------------------
O           36768
B-PER         833
B-LOC         636
B-ORG         606
I-ORG         529
I-PER         497
B-DATE        155
I-DATE        108
I-LOC          66
PHONE          33
USERNAME       26
I-EVENT        22
B-EVENT         6
Name: labels, dtype: int64


In [10]:
label = df['Label'].unique().tolist()
label

['O',
 'B-PER',
 'B-ORG',
 'B-DATE',
 'I-DATE',
 'I-ORG',
 'I-PER',
 'B-LOC',
 'I-LOC',
 'B-EVENT',
 'I-EVENT',
 'PHONE',
 'USERNAME']

# Setting parameters for training

In [11]:
# !rm -r cache_dir/ outputs/
epochs = 5
lr = 1e-4
train_bs = 32
eval_bs = 32
# seq_length = 512

In [12]:
args = NERArgs(
        num_train_epochs=epochs,
        learning_rate=lr,
        overwrite_output_dir=True,
        # max_seq_length =seq_length,
        train_batch_size=train_bs,
        eval_batch_size=eval_bs,

)

In [13]:

# NepBERTa/NepBERTa
model = NERModel('bert', 'NepBERTa/NepBERTa', labels=label, args=args, from_tf = True)

config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/534M [00:00<?, ?B/s]

All TF 2.0 model weights were used when initializing BertForTokenClassification.

All the weights of BertForTokenClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForTokenClassification for predictions without further training.


vocab.txt:   0%|          | 0.00/547k [00:00<?, ?B/s]

# Training the model

In [14]:
history = model.train_model(train_data, eval_data=test_data, acc=accuracy_score)

  return [


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/305 [00:00<?, ?it/s]



Running Epoch 1 of 5:   0%|          | 0/305 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/305 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/305 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/305 [00:00<?, ?it/s]

In [15]:
result, model_outputs, preds_list = model.eval_model(test_data)
result

  return [


  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/285 [00:00<?, ?it/s]



{'eval_loss': 0.18488737820043114,
 'precision': 0.7891394093362972,
 'recall': 0.7367328787429588,
 'f1_score': 0.7620361852192578}

# Save the model


In [16]:
# Save the model
import torch
torch.save(model, 'classifier')

# Demo using gradio

In [17]:
%%capture
!pip install indic-nlp-library
!pip install gradio

In [18]:
import gradio as gr
from indicnlp.tokenize import sentence_tokenize

def break_into_sentences(paragraph):
    sentences = sentence_tokenize.sentence_split(paragraph, lang='ne')
    return sentences

def predict(text, model=model):
    ner = []
    text = break_into_sentences(text)

    prediction, model_output = model.predict(text)
    filtered_data = [[d for d in sublist if d[list(d.keys())[0]] != 'O'] for sublist in prediction]
    for i in filtered_data:
      if len(i)!=0:
        ner.append(i)
    return ner

iface = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(),
    outputs='text',
    title='NER-Nepali-using-NepBERTa'
)
iface.launch()
# result = predict("मने काँग्रेसलाई एकपट्क फेरि एमालेलाई दोश्रो पटक पालो गरि गरि धम्क्याउँने र ठेगान लगाउँन सक्ने एकजना मानिस मात्र फेला")

ImportError: ignored