In [None]:
# Mount Google Drive at the relative path "TwitterSupport" (requires Google Colab).
from google.colab import drive
drive.mount("TwitterSupport")

Mounted at TwitterSupport


In [None]:
# Project root directory under the "TwitterSupport" mount point.
# The trailing slash matters: later cells build paths by string concatenation.
ROOT_DIR="TwitterSupport/MyDrive/TwitterSupport/"

In [None]:
# Named Entity Recognition: load the en_core_web_sm pipeline into `nlp`.
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [None]:
# Training data for fine-tuning the NER component.
# Each item is (text, {"entities": [(start_char, end_char, label)]}).
# Offsets are character positions with an exclusive end, so text[start:end]
# must be exactly the entity string; spans that do not line up with the entity
# give the model wrong (or unusable) supervision.
# The "Projectpro_test" sentence is repeated four times, as in the original data.
# NOTE(review): "Amazon" in the "Fridge can be ordered in Amazon " sentence is
# not annotated - confirm whether that omission is intentional.
TRAIN_DATA = [
    ("Walmart is a leading e-commerce company", {"entities": [(0, 7, "ORG")]}),
    ("I reached Chennai yesterday.", {"entities": [(10, 17, "GPE")]}),
    ("I recently ordered a book from Amazon", {"entities": [(31, 37, "ORG")]}),
    ("I was driving a BMW", {"entities": [(16, 19, "PRODUCT")]}),
    ("I ordered this from ShopClues", {"entities": [(20, 29, "ORG")]}),
    ("Fridge can be ordered in Amazon ", {"entities": [(0, 6, "PRODUCT")]}),
    ("I bought a new Washer", {"entities": [(15, 21, "PRODUCT")]}),
    ("I bought a old table", {"entities": [(15, 20, "PRODUCT")]}),
    ("I bought a fancy dress", {"entities": [(17, 22, "PRODUCT")]}),
    ("I rented a camera", {"entities": [(11, 17, "PRODUCT")]}),
    ("I rented a tent for our trip", {"entities": [(11, 15, "PRODUCT")]}),
    ("I rented a screwdriver from our neighbour", {"entities": [(11, 22, "PRODUCT")]}),
    ("I repaired my computer", {"entities": [(14, 22, "PRODUCT")]}),
    ("I got my clock fixed", {"entities": [(9, 14, "PRODUCT")]}),
    ("I got my truck fixed", {"entities": [(9, 14, "PRODUCT")]}),
    ("Flipkart started it's journey from zero", {"entities": [(0, 8, "ORG")]}),
    ("I recently ordered from Max", {"entities": [(24, 27, "ORG")]}),
    ("Flipkart is recognized as leader in market", {"entities": [(0, 8, "ORG")]}),
    ("Virgin America is recognized as leader in market", {"entities": [(0, 14, "ORG")]}),
    ("Virgin America is the best airline ever", {"entities": [(0, 14, "ORG")]}),
    ("I recently ordered from Swiggy", {"entities": [(24, 30, "ORG")]}),
    ("Projectpro_test is a great airline.", {"entities": [(0, 15, "ORG")]}),
    ("Projectpro_test is a great airline.", {"entities": [(0, 15, "ORG")]}),
    ("Projectpro_test is a great airline.", {"entities": [(0, 15, "ORG")]}),
    ("Projectpro_test is a great airline.", {"entities": [(0, 15, "ORG")]}),
]

In [None]:
# Pipeline components that should stay enabled; everything else in the loaded
# pipeline is collected into `unaffected_pipes`.
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [
    name
    for name in nlp.pipe_names
    if name not in pipe_exceptions
]

In [None]:
 import random
from spacy.util import minibatch, compounding
from pathlib import Path

# TRAINING THE MODEL
# Number of passes over TRAIN_DATA.
N_ITERATIONS = 30

# Only the pipes not listed in `unaffected_pipes` are updated.
with nlp.disable_pipes(*unaffected_pipes):
    for iteration in range(N_ITERATIONS):
        # Shuffle the examples before every iteration.
        random.shuffle(TRAIN_DATA)
        # `losses` accumulates across all batches of this iteration.
        losses = {}
        # Batch up the examples using spaCy's minibatch helper.
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,  # batch of texts
                annotations,  # batch of annotations
                drop=0.5,  # dropout - make it harder to memorise data
                losses=losses,
            )
        # Report once per iteration: printing inside the batch loop shows
        # partial running totals, which look like the loss climbing within
        # every epoch.
        print("Iteration", iteration + 1, "Losses", losses)

Losses {'ner': 5.098098905153165}
Losses {'ner': 12.230556102796982}
Losses {'ner': 13.75167132565366}
Losses {'ner': 15.848025918652638}
Losses {'ner': 19.49425376934778}
Losses {'ner': 25.13032833158468}
Losses {'ner': 25.209815079420878}
Losses {'ner': 0.0697075542411767}
Losses {'ner': 2.1810758713108953}
Losses {'ner': 5.205351497599622}
Losses {'ner': 9.859327055135509}
Losses {'ner': 11.048176070529735}
Losses {'ner': 17.126382625603583}
Losses {'ner': 18.741598498841945}
Losses {'ner': 0.6426653625094332}
Losses {'ner': 0.779853617597837}
Losses {'ner': 2.2639738549002004}
Losses {'ner': 4.159977092317263}
Losses {'ner': 8.88356231795899}
Losses {'ner': 13.680505505672613}
Losses {'ner': 13.883825901407818}
Losses {'ner': 4.897011601451283}
Losses {'ner': 5.024953328834869}
Losses {'ner': 8.528766405934704}
Losses {'ner': 10.817291322330249}
Losses {'ner': 13.816531997107205}
Losses {'ner': 14.721547621084511}
Losses {'ner': 14.751109501637274}
Losses {'ner': 0.1595769617124460

In [None]:
# Run the pipeline currently bound to `nlp` on a sample sentence and list the
# (text, label) pair of every entity it finds.
doc = nlp("I was driving a Projectpro_test")
print(
    "Entities",
    [(span.text, span.label_) for span in doc.ents],
)

Entities [('Projectpro_test', 'ORG')]


In [None]:
# Write the pipeline bound to `nlp` to <ROOT_DIR>model_NER/ and report the path.
output_dir = Path(f"{ROOT_DIR}model_NER/")
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

Saved model to TwitterSupport/MyDrive/TwitterSupport/model_NER


In [None]:
# NOTE(review): camel_case_split is defined in the cell after this one, so this
# call fails on a fresh top-to-bottom run until that cell has been executed.
# The recorded output ('@Virgin America') also does not match this input -
# presumably it was produced with "@VirginAmerica"; confirm and re-run.
camel_case_split("VirginAmerica")

'@Virgin America'

In [None]:
def camel_case_split(str):
    """Insert a space at every lowercase-to-uppercase boundary in a token.

    A boundary is a character for which ``islower()`` is true followed
    directly by a character for which ``isupper()`` is true, e.g.
    ``"VirginAmerica"`` -> ``"Virgin America"``. All other characters are
    copied through unchanged.

    Args:
        str: Token to split. The parameter name shadows the builtin and is
            kept only for backward compatibility with existing callers.

    Returns:
        The token with spaces inserted at the boundaries, or ``""`` for an
        empty token (the previous version raised ``IndexError`` there).
    """
    if not str:
        return ""

    pieces = [str[0]]
    # Walk over adjacent character pairs and emit a space between a
    # lowercase character and the uppercase character that follows it.
    for previous, current in zip(str, str[1:]):
        if previous.islower() and current.isupper():
            pieces.append(" ")
        pieces.append(current)

    return "".join(pieces)

In [None]:
import re
import string

def clean_text(txt):
    """Normalise raw tweet text before it is passed to the NER models.

    Steps, in order:
      1. remove links (anything starting at ``http`` up to the next whitespace),
      2. split camelCase tokens (``VirginAmerica`` -> ``Virgin America``),
      3. expand negative contractions (``don't`` -> ``do not``,
         ``can't`` -> ``cannot``),
      4. replace every non-word character with a space and drop underscores,
      5. remove single letters that stand between whitespace,
      6. collapse runs of whitespace into one space.

    Leading/trailing whitespace is collapsed to a single space but not stripped.

    Args:
        txt: Raw text.

    Returns:
        The cleaned text.
    """
    # Links must go first: once punctuation is stripped a URL falls apart into
    # fragments ("http t co ...") that can no longer be recognised as a link.
    txt = re.sub(r"http\S*", " ", txt)

    # Repeated/leading/trailing spaces yield empty tokens; pass them through
    # untouched instead of handing them to camel_case_split.
    txt = " ".join(camel_case_split(t) if t else t for t in txt.split(" "))

    # Expand only real contractions. Matching a bare "nt" would corrupt
    # ordinary words such as "want" or "went".
    txt = re.sub(r"\b(can)['’]t\b", r"\1not", txt, flags=re.I)
    txt = re.sub(r"(?<=\w)n['’]t\b", " not", txt, flags=re.I)

    # Replace all special characters (including apostrophes) with spaces.
    txt = re.sub(r"\W", " ", txt)
    # "_" counts as a word character for \W, so it is removed here.
    txt = txt.translate(str.maketrans("", "", string.punctuation))
    # Remove single letters left behind (it's -> "it s" -> "it").
    txt = re.sub(r"\s+[a-zA-Z]\s+", " ", txt)
    # Collapse multiple whitespace characters into a single space.
    txt = re.sub(r"\s+", " ", txt)
    return txt

In [None]:
# Example: clean a raw tweet containing an @handle written in camelCase and
# repeated punctuation.
clean_text("@VirginAmerica why are your first fares in May over three times more than other carriers when all seats are available to select???")

' Virgin America why are your first fares in May over three times more than other carriers when all seats are available to select '

In [None]:
# Entities found by the pipeline currently bound to `nlp` in a cleaned tweet.
doc = nlp(' Virgin America why are your first indian airline fares in May over three times more than other carriers when all seats are available to select ')
print(
    [(span.text, span.label_) for span in doc.ents]
)

[('Virgin America', 'ORG')]


In [None]:
# Rebind `nlp` to a freshly loaded en_core_web_sm pipeline; the cell that
# follows runs the same sentence through it for comparison.
nlp = en_core_web_sm.load()

In [None]:
# Same cleaned tweet as before, run through the pipeline now bound to `nlp`.
doc = nlp(' Virgin America why are your first indian airline fares in May over three times more than other carriers when all seats are available to select ')
print(
    [(span.text, span.label_) for span in doc.ents]
)

[('Virgin America', 'LOC'), ('first', 'ORDINAL'), ('indian', 'NORP'), ('May', 'DATE'), ('three', 'CARDINAL')]


In [None]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
import re
import string
from pathlib import Path

class NameEntities:
    """Extract named entities with two spaCy pipelines and merge the results.

    ``nlp`` is a freshly loaded ``en_core_web_sm`` pipeline; ``nlp_updated`` is
    the pipeline loaded from the saved model directory. ``get_Entities`` cleans
    the text, runs both pipelines and concatenates their entity lists.
    """

    def __init__(self, model_dir=None):
        """Load both pipelines.

        Args:
            model_dir: Directory of the saved pipeline. When omitted, the
                module-level ``ROOT_DIR`` joined with ``'model_NER/'`` is used,
                which is the original behaviour.
        """
        # Stock pipeline.
        self.nlp = en_core_web_sm.load()

        # Saved pipeline, loaded from disk.
        if model_dir is None:
            model_dir = ROOT_DIR + 'model_NER/'
        output_dir = Path(model_dir)
        print("Loading from", output_dir)
        self.nlp_updated = spacy.load(output_dir)

    def clean_text(self, txt):
        """Normalise raw tweet text before it is passed to the NER models.

        Steps, in order:
          1. remove links (anything starting at ``http`` up to the next
             whitespace),
          2. split camelCase tokens (``VirginAmerica`` -> ``Virgin America``),
          3. expand negative contractions (``don't`` -> ``do not``,
             ``can't`` -> ``cannot``),
          4. replace every non-word character with a space and drop
             underscores,
          5. remove single letters that stand between whitespace,
          6. collapse runs of whitespace into one space.

        Leading/trailing whitespace is collapsed to a single space but not
        stripped.

        Args:
            txt: Raw text.

        Returns:
            The cleaned text.
        """
        # Links must go first: once punctuation is stripped a URL falls apart
        # into fragments ("http t co ...") that no longer look like a link.
        txt = re.sub(r"http\S*", " ", txt)

        # Repeated/leading/trailing spaces yield empty tokens; keep them as-is.
        txt = " ".join(
            self.camel_case_split(t) if t else t for t in txt.split(" ")
        )

        # Expand only real contractions. Matching a bare "nt" would corrupt
        # ordinary words such as "want" or "went".
        txt = re.sub(r"\b(can)['’]t\b", r"\1not", txt, flags=re.I)
        txt = re.sub(r"(?<=\w)n['’]t\b", " not", txt, flags=re.I)

        # Replace all special characters (including apostrophes) with spaces.
        txt = re.sub(r"\W", " ", txt)
        # "_" counts as a word character for \W, so it is removed here.
        txt = txt.translate(str.maketrans("", "", string.punctuation))
        # Remove single letters left behind (it's -> "it s" -> "it").
        txt = re.sub(r"\s+[a-zA-Z]\s+", " ", txt)
        # Collapse multiple whitespace characters into a single space.
        txt = re.sub(r"\s+", " ", txt)
        return txt

    def camel_case_split(self, str):
        """Insert a space at every lowercase-to-uppercase boundary in a token.

        Args:
            str: Token to split. The parameter name shadows the builtin and is
                kept only for backward compatibility with existing callers.

        Returns:
            The token with spaces inserted at the boundaries, or ``""`` for an
            empty token (the previous version raised ``IndexError`` there).
        """
        if not str:
            return ""

        pieces = [str[0]]
        # Emit a space between a lowercase character and the uppercase
        # character that directly follows it.
        for previous, current in zip(str, str[1:]):
            if previous.islower() and current.isupper():
                pieces.append(" ")
            pieces.append(current)

        return "".join(pieces)

    def get_Entities(self, text):
        """Return the entities both pipelines find in ``text``.

        The text is cleaned with ``clean_text`` first. Results are not
        de-duplicated, so an entity found by both pipelines appears twice
        (possibly with different labels).

        Args:
            text: Raw input text.

        Returns:
            A list of ``(entity_text, label)`` tuples: those from
            ``nlp_updated`` first, followed by those from ``nlp``.
        """
        text = self.clean_text(text)
        labels = [(ent.text, ent.label_) for ent in self.nlp_updated(text).ents]
        labels.extend((ent.text, ent.label_) for ent in self.nlp(text).ents)
        return labels



In [None]:
# Build the extractor; the constructor loads both spaCy pipelines and prints
# the directory the saved one is read from.
ner = NameEntities()

Loading from TwitterSupport/MyDrive/TwitterSupport/model_NER


In [None]:
# Example: the input is cleaned first, then entities from both pipelines are
# returned in a single list.
ner.get_Entities("Projectpro_test is a good airline")

[('Projectprotest', 'ORG'), ('Projectprotest', 'ORG')]