In [None]:
# Mount Google Drive at /content/drive; the dataset and vector files used in
# the cells below are read from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from CoNLL2Spacy import *

In [None]:
# Read the CoNLL-style annotation file into a list of raw lines
# ("token tag" per line; an empty string marks a sentence boundary).
# NOTE(review): this is the same path the training cell reads
# (nertrnweather.txt), so anything evaluated on it is not a held-out score.
# Point this at a separate test file if one exists.
with open("/content/drive/MyDrive/mini-dataset/Data/nertrnweather.txt", "r", encoding="utf-8") as file:
    # Strip only the line terminator. Slicing with [:-1] would also remove the
    # last real character of a final line that has no trailing newline.
    valList = [line.rstrip("\n") for line in file]
valList[:20]

['अरे O',
 'सारथी O',
 'जम्मू U-location',
 'कैसा O',
 'मौसम O',
 'है O',
 '',
 'अभी U-date',
 'दिल्ली U-location',
 'में O',
 'कैसा O',
 'मौसम O',
 'है O',
 '',
 'आज U-date',
 'के O',
 'लिए O',
 'मौसम O',
 'रिपोर्ट O',
 'बताओ O']

In [None]:
# Convert the raw "token tag" lines into the evaluation set.
# conll2spacy comes from the star import of CoNLL2Spacy; downstream code
# consumes the result as (text, annotations) pairs where
# annotations.get('entities') lists the gold entities.
# presumably this is spaCy's (text, {"entities": [(start, end, label)]}) format - TODO confirm
TEST_DATA = conll2spacy(valList)

In [None]:
# Read the CoNLL-style training file into a list of raw lines
# ("token tag" per line; an empty string marks a sentence boundary).
with open("/content/drive/MyDrive/mini-dataset/Data/nertrnweather.txt", "r", encoding="utf-8") as file:
    # Strip only the line terminator. Slicing with [:-1] would also remove the
    # last real character of a final line that has no trailing newline.
    trainList = [line.rstrip("\n") for line in file]
trainList[:20]

['अरे O',
 'सारथी O',
 'जम्मू U-location',
 'कैसा O',
 'मौसम O',
 'है O',
 '',
 'अभी U-date',
 'दिल्ली U-location',
 'में O',
 'कैसा O',
 'मौसम O',
 'है O',
 '',
 'आज U-date',
 'के O',
 'लिए O',
 'मौसम O',
 'रिपोर्ट O',
 'बताओ O']

In [None]:
# Convert the raw "token tag" lines into the training set.
# train_spacy() iterates this as (text, annotations) pairs and reads the
# label from index 2 of every item in annotations.get('entities').
TRAIN_DATA = conll2spacy(trainList)

In [None]:
import spacy
import numpy
import pickle

In [None]:
# Build a blank Hindi pipeline and fill its vocab with the fastText vectors
# stored in the .vec text file on Drive.
lang = "hi"
vectors_loc = "/content/drive/MyDrive/mini-dataset/Data/fasttextwiki/wiki.hi.vec"
nlp = spacy.blank(lang)
with open(vectors_loc, "rb") as file_:
    # The first line of the file holds two fields: row count and vector width.
    nr_row, nr_dim = file_.readline().split()
    width = int(nr_dim)
    nlp.vocab.reset_vectors(width=width)
    for raw_line in file_:
        # Split from the right so exactly `width` numeric fields are peeled off
        # and whatever remains on the left is the word itself.
        fields = raw_line.rstrip().decode("utf8").rsplit(" ", width)
        values = numpy.asarray([float(v) for v in fields[1:]], dtype="f")
        nlp.vocab.set_vector(fields[0], values)  # add the vector to the vocab

In [None]:
# Sanity check of the loaded vectors: compare the 1st and 4th tokens of a sample sentence.
text = "भारी बारिश के कारण आज कार्यालय बंद रहेगा"
doc = nlp(text)
left_token, right_token = doc[0], doc[3]
print("similarity btw", left_token, "and", right_token, " :-", left_token.similarity(right_token))

similarity btw भारी and कारण  :- 0.41513643


In [None]:
import spacy
import random # random function for to remove bais in Traning Data

# for batch parsing 
from spacy.util import minibatch, compounding

# For evaluateing the model from testing set
from spacy.gold import GoldParse
from spacy.scorer import Scorer

In [None]:
def _load_fasttext_vectors(model, vectors_loc):
    """Copy the word vectors from a fastText ``.vec`` text file into ``model.vocab``."""
    with open(vectors_loc, "rb") as file_:
        # The first line of the file holds two fields: row count and vector width.
        _nr_row, nr_dim = file_.readline().split()
        width = int(nr_dim)
        model.vocab.reset_vectors(width=width)
        for line in file_:
            line = line.rstrip().decode("utf8")
            # Split from the right so exactly `width` numeric fields are peeled
            # off and whatever remains on the left is the word itself.
            pieces = line.rsplit(" ", width)
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype="f")
            model.vocab.set_vector(pieces[0], vector)  # add the vector to the vocab


def train_spacy(TRAIN_DATA, TEST_DATA, iterations, droprate=0.5, modelName="modelTrained",
                lang="hi",
                vectors_loc="/content/drive/MyDrive/mini-dataset/Data/fasttextwiki/wiki.hi.vec"):
    """Train an NER pipe on a blank spaCy model and keep the best-scoring one on disk.

    After every iteration the model is scored on ``TEST_DATA`` with
    ``evaluate``; whenever its entity F-score beats the best score seen so far
    the pipeline is written to ``modelName``. If a model already exists at
    ``modelName`` its score on ``TEST_DATA`` is used as the initial best
    score, so it is only overwritten by a better model.

    Args:
        TRAIN_DATA: sequence of ``(text, annotations)`` pairs;
            ``annotations.get('entities')`` lists the entities and index 2 of
            each entity is its label. The caller's sequence is not reordered.
        TEST_DATA: pairs in the same format, passed to ``evaluate``.
        iterations: number of passes over the training data.
        droprate: dropout passed to ``Language.update``.
        modelName: directory the best model is loaded from / saved to.
        lang: language code for ``spacy.blank``.
        vectors_loc: path of the fastText ``.vec`` file loaded into the vocab.

    Returns:
        None. Progress is printed and the best model is written to disk.
    """
    # Blank pipeline for the language, with fastText vectors in its vocab.
    modiner = spacy.blank(lang)
    _load_fasttext_vectors(modiner, vectors_loc)

    # Create the NER pipe if missing, otherwise reuse the existing one so that
    # `ner` is always bound before labels are added.
    if 'ner' not in modiner.pipe_names:
        ner = modiner.create_pipe('ner')
        modiner.add_pipe(ner, last=True)
    else:
        ner = modiner.get_pipe('ner')

    # Work on a private copy: shuffling must not reorder the caller's list.
    train_examples = list(TRAIN_DATA)

    # Register every label that occurs in the training annotations.
    for _, annotations in train_examples:
        for ent in annotations.get('entities') or []:
            ner.add_label(ent[2])

    # Baseline: score of a previously saved model, looked up once instead of
    # on every iteration. Stays 0.0 when no usable model is found.
    f1score = 0.0
    try:
        pnlp = spacy.load(modelName)
        f1score = evaluate(pnlp, TEST_DATA)["ents_f"]
    except OSError:
        # spacy.load raises OSError/IOError when the model path does not exist.
        print("Previous Model not found")
    except Exception as err:
        # Best effort: an unreadable or incompatible previous model must not
        # abort training, but the reason should be visible.
        print("Previous Model could not be used:", err)

    # Disable every other pipe so only NER is trained.
    other_pipes = [pipe for pipe in modiner.pipe_names if pipe != 'ner']
    with modiner.disable_pipes(*other_pipes):
        optimizer = modiner.begin_training()

        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            # Reshuffle every pass to avoid ordering bias in the batches.
            random.shuffle(train_examples)
            losses = {}

            # Mini-batches whose size grows from 2 towards 16.
            batches = minibatch(train_examples, size=compounding(2.0, 16.0, 1.01))
            for batch in batches:
                texts, annotations = zip(*batch)
                modiner.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    sgd=optimizer,  # the optimizer begin_training() returned
                    drop=droprate,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print(losses)

            # Score the current model on the evaluation data.
            results = evaluate(modiner, TEST_DATA)
            print("Current Score :-",results["ents_f"], "Precision  :-",results["ents_p"], "Recall  :-",results["ents_r"])

            print("Best Sccore :- ",f1score)
            print("------------------------------------")
            # Save only when this iteration beats the best score so far.
            if f1score < results["ents_f"]:
                f1score = results["ents_f"]
                modiner.to_disk(modelName)

    print("-----Best Model is Saved-----")


In [None]:
def evaluate(ner_model, examples):
    """Score ``ner_model`` against gold entity annotations.

    Args:
        ner_model: pipeline object; must offer ``make_doc(text)`` and be
            callable on a text.
        examples: iterable of ``(text, annotations)`` pairs where
            ``annotations.get('entities')`` yields the gold entities.

    Returns:
        The ``scores`` attribute of the ``Scorer`` after every example has
        been scored (callers read ``ents_f``, ``ents_p`` and ``ents_r``).
    """
    scorer = Scorer()
    for text, annots in examples:
        # Doc that the gold entities are attached to.
        gold_doc = ner_model.make_doc(text)
        gold_entities = list(annots.get('entities'))
        gold = GoldParse(gold_doc, entities=gold_entities)
        # Run the model on the same text and compare its output with the gold parse.
        scorer.score(ner_model(text), gold)
    return scorer.scores


In [None]:
def loadNERModel(modelName = "modelTrained"):
    """Return the pipeline that ``spacy.load`` reads from ``modelName``."""
    return spacy.load(modelName)

In [None]:
def score(model,TEST_DATA):
    """Print the entity F1, precision and recall of ``model`` on ``TEST_DATA``."""
    metrics = evaluate(model, TEST_DATA)
    captions = (
        "F1 score of Model is :-",
        "Precision of Model is :-",
        "Recall of Model is :-",
    )
    # Look up all three values first so a missing key fails before anything is printed.
    values = [metrics[key] for key in ("ents_f", "ents_p", "ents_r")]
    for caption, value in zip(captions, values):
        print(caption, value)

In [None]:
# Train for 100 iterations with dropout 0.55; the best-scoring model is
# written to the "hindiNER" directory by train_spacy.
train_spacy(TRAIN_DATA,TEST_DATA, 100,droprate = 0.55, modelName = "hindiNER")

  "__main__", mod_spec)


Starting iteration 0
{'ner': 808.4360901527107}
Current Score :- 0.0 Precision  :- 0.0 Recall  :- 0.0
Previous Model not found
Best Sccore :-  0.0
------------------------------------
Starting iteration 1
{'ner': 500.7508350031899}
Current Score :- 70.32474804031355 Precision  :- 72.01834862385321 Recall  :- 68.7089715536105
Previous Model not found
Best Sccore :-  0.0
------------------------------------
Starting iteration 2
{'ner': 333.01709009824174}
Current Score :- 83.71584699453551 Precision  :- 83.62445414847161 Recall  :- 83.80743982494529
Best Sccore :-  70.32474804031355
------------------------------------
Starting iteration 3
{'ner': 215.93908924260512}
Current Score :- 89.247311827957 Precision  :- 87.73784355179704 Recall  :- 90.80962800875274
Best Sccore :-  83.71584699453551
------------------------------------
Starting iteration 4
{'ner': 178.41262751439854}
Current Score :- 93.73650107991361 Precision  :- 92.53731343283582 Recall  :- 94.9671772428884
Best Sccore :-  8

In [None]:
# Load the model that training saved under "hindiNER".
pnlp = loadNERModel("hindiNER")

In [None]:
# Print F1 / precision / recall of the loaded model.
# NOTE(review): TEST_DATA was built from nertrnweather.txt, the same file the
# training data was read from, so this is a score on data the model was
# trained on, not a held-out estimate.
score(pnlp,TEST_DATA)

F1 score of Model is :- 100.0
Precision of Model is :- 100.0
Recall of Model is :- 100.0


In [None]:
from spacy import displacy

In [None]:
# Run the loaded model on a sample Hindi sentence and render the entities
# it finds inline in the notebook.
testcase = pnlp("इस मौसम में बाइक को चलाना सुरक्षित नहीं है अन्यथा यह स्किड हो जाएगी।")
displacy.render(testcase, style='ent', jupyter=True)