In [2]:
import pandas as pd
import numpy as np
from flair.nn import Classifier
from flair.data import Sentence

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load flair's 'ner' model once; later cells reuse this module-level
# `classifier` for every prediction. The load log printed below reports a
# SequenceTagger with a 20-tag dictionary (PER / LOC / ORG / MISC variants).
classifier = Classifier.load('ner')

2023-07-14 12:02:11,393 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [5]:
# Smoke test: wrap a single-word input in a flair Sentence
sentence = Sentence('Chapel')

# run the NER classifier over the sentence (prediction is stored on the sentence object)
classifier.predict(sentence)

# print the sentence together with any annotations the tagger attached;
# in the recorded output below no entity is shown for "Chapel"
print(sentence)

Sentence[1]: "Chapel"


In [6]:
def classify_named_entities(text):
    """Tag one cell of text with the module-level NER `classifier`.

    Parameters
    ----------
    text : str or missing value
        The text to tag. Missing values (None / NaN / pd.NA) and blank
        strings yield an empty result. Non-string scalars are converted
        with ``str()`` before tagging.

    Returns
    -------
    list of (str, str)
        One ``(entity_text, label)`` pair per span found under the 'ner'
        label type, using the first label of each span. Empty when nothing
        is tagged.
    """
    # Missing cells carry no text to tag.
    if pd.isnull(text):
        return []

    # A non-string cell (e.g. a query pandas parsed as a number) has no
    # .capitalize(); coerce instead of raising AttributeError mid-apply.
    if not isinstance(text, str):
        text = str(text)

    # Nothing to tag in an empty / whitespace-only string; skip the model call.
    if not text.strip():
        return []

    # str.capitalize() upper-cases the first character and lower-cases the
    # rest, so multi-word names come back as e.g. 'Andrew tate'. Kept as-is
    # so existing results do not change.
    sentence = Sentence(text.capitalize())
    classifier.predict(sentence)
    return [
        (span.text, span.labels[0].value)
        for span in sentence.get_spans('ner')
    ]

In [7]:
# Tiny two-row demo: tag each 'Text' entry and keep the result in 'Label'.
df = pd.DataFrame({'Text': ['Apple Phones.', 'd']}).assign(
    Label=lambda frame: frame['Text'].apply(classify_named_entities)
)
df

Unnamed: 0,Text,Label
0,Apple Phones.,"[(Apple, ORG)]"
1,d,[]


In [8]:
# Load both exports under their own names. Previously the second read_csv
# reused the first name, so the rising-queries frame was loaded and then
# immediately discarded.
rising_queries_raw = pd.read_csv("C:/Users/varun/exported_files/rising_quer_all.csv")
top_queries_raw = pd.read_csv("C:/Users/varun/exported_files/top_quer_all.csv")

# NOTE(review): `rising_queries_all` has always ended up holding the
# *top*-queries export (the later cells show TopQueries* columns). The
# binding is kept so downstream cells behave exactly as before — confirm
# which dataset the analysis is actually meant to use.
rising_queries_all = top_queries_raw

In [10]:
# Despite its name this is the ROW count (len of a DataFrame counts rows).
# It is no longer used below; kept only in case a later cell reads it.
col_num = len(rising_queries_all)

# Tag every column from position 2 onward. The original slice was bounded by
# `col_num - 1`, i.e. by the row count, which only selected all of these
# columns because the frame has more rows than columns. Slicing to the end
# makes that explicit and independent of the number of rows.
# NOTE(review): the first two columns are skipped — presumably index /
# identifier columns; confirm against the CSV.
query_columns = rising_queries_all.columns[2:]

# Build the result in one constructor call rather than growing an empty
# frame one column at a time.
rising_quer_cats = pd.DataFrame(
    {
        name: rising_queries_all[name].apply(classify_named_entities)
        for name in query_columns
    }
)

rising_quer_cats

Unnamed: 0,TopQueries1,TopQueries2,TopQueries3,TopQueries4,TopQueries5,TopQueries6,TopQueries7,TopQueries8,TopQueries9,TopQueries10,...,TopQueries16,TopQueries17,TopQueries18,TopQueries19,TopQueries20,TopQueries21,TopQueries22,TopQueries23,TopQueries24,TopQueries25
0,[],[],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
1,[],[],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
2,"[(Vatican, MISC)]",[],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
3,"[(Liberal, MISC)]",[],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
4,[],[],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
826,[],[],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
827,[],[],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
828,[],[],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
829,[],"[(Google, ORG)]",[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]


In [13]:
# Flatten every cell of the label frame into one 1-D array, then collect the
# distinct label lists together with the position of each first occurrence.
flat_labels = rising_quer_cats.values.flatten()
unique_values, unique_inds = np.unique(flat_labels, return_index=True)

# Show the distinct label lists and how many there are.
(unique_values, len(unique_inds))

(array([list([]), list([('Adam', 'PER')]), list([('African', 'MISC')]),
        list([('Aibu', 'ORG')]), list([('Anarcha', 'MISC')]),
        list([('Andrew tate', 'PER')]), list([('Angela davis', 'PER')]),
        list([('Arvo', 'PER')]), list([('Bell', 'PER')]),
        list([('Betty friedan', 'PER')]), list([('Beyonce', 'ORG')]),
        list([('Beyonce', 'PER')]), list([('Buzzfeed', 'ORG')]),
        list([('Caitlin', 'PER')]), list([('Christian', 'MISC')]),
        list([('Clinton', 'PER')]), list([('Delhi', 'LOC')]),
        list([('Dominic raab', 'PER')]), list([('Egalitarian', 'ORG')]),
        list([('Emma watson', 'PER')]), list([('Fema', 'ORG')]),
        list([('Femail', 'ORG')]), list([('Feminazi', 'ORG')]),
        list([('Feministe', 'ORG')]), list([('French', 'MISC')]),
        list([('Germaine', 'PER')]), list([('Gloria steinem', 'PER')]),
        list([('Google', 'ORG')]), list([('Greens', 'ORG')]),
        list([('Hillary clinton', 'PER')]), list([('Hindi', 'MISC')])