### Imports

In [6]:
from pathlib import Path
import pandas as pd

### File Reading

In [7]:
path = Path('.') # cwd
path = path / 'ner_dataset.csv' # adding file path

In [8]:
data = pd.read_csv(path, encoding = 'unicode_escape')

### Analysis

In [9]:
data.head(10)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


In [6]:
# Imputing missing values with last known sentence value
data = data.fillna(method = 'ffill')

In [7]:
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [8]:
#data['POS'].value_counts()

In [9]:
# Unique words 
words = list(set(data['Word'].values))
n_words = len(words)

In [10]:
n_words

35178

In [11]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None


In [69]:
getter = SentenceGetter(data)

In [70]:
sent = getter.get_next()

In [71]:
print(sent)

[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]


In [72]:
sentences = getter.sentences

In [74]:
sentences = sentences[0:3]

In [18]:
data['Tag'].value_counts()

O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: Tag, dtype: int64

In [47]:
# Key words in text
start = '<START:food>'
end = '<END>'
apos = "'"

replace_vals = {start:'', end:'', apos:''}
# Function to replace keywords with ''
def replace_all(text, dic):
    '''
    Replace all the words in the text with the items from the dictionary
    '''
    for i,j in dic.items():
        text = text.replace(i,j)
    text = " ".join(text.split())
    return text

In [48]:
train_list = []
# Reading in first 0000 lines of the text file
count = 0
start_line = 0
end_line = 300000

# num_lines = 3

# If it's below the numbers threshold, randomly sample 5% of the data
# otherwise add the numbers between the alloted threshold

with open('review_train', 'r') as f:
    for line in f:
        if count >= start_line and count <= end_line:


        # if count == num_lines:
        #     break
            print(line, sep = '\n')
            train_list.append(line)
        elif count > end_line:
            break
        count +=1
print(count)

<START:food>  fish <END>  tacos which were great 

my friend got the  <START:food>  quesadillas <END>  they were the bomb 

the super  <START:food>  nachos <END>  are off the hook 

i haven't had better  <START:food>  nachos <END>  here in las vegas from one of these fast food type joints 

they are huge, with chicken, shredded  <START:food>  beef <END>  and carne asade 

 <START:food>  chicken <END>  crispy taco was 

el  <START:food>  taco <END>  feliz is just that place 

the  <START:food>  carnitas <END>  tacos are super fatty that is what i liked about them 

the massive amount of  <START:food> meat <END>  is also a plus 

the rolled  <START:food>  tacos <END>  were okay 

my  <START:food>  enchilada <END>  and bean burrito combo was nothing special, just tasted bland 

delicious  <START:food>  mexican food <END>  for cheap 

it was a good size portion of chirzo and eggs, beans, rice, 3  <START:food>  tortillas <END>  

Will stop by again and maybe i'll try the  <START:food>  carn

In [49]:
import spacy

nlp = spacy.load("en_core_web_md")
# Word, POS, Tag
TRAIN_DATA = []
temp_data = []
# Iterate through each review
for sentence in train_list:

    # Get rid of leading/trailing spaces plus '\n'
    sentence = sentence.strip()
    
    # Code to get the all values inside '<START:food>' and '<END>'
    split = sentence.split(start)
    food_items = []
    for i in range(0,len(split)-1):
        food_items.append(split[i+1].split(end)[0].strip())
    #print('food items:', food_items)

    # final_food_items = []
    final_food_items = [val.split() for val in food_items]
    #print('food items:', final_food_items)

    # Removing all marker Keywords, removing extra spaces, and sentence lowercase
    sentence = replace_all(sentence, replace_vals)
    #print(sentence)
    #sentence = ' '.join(sentence.split())
    # sentence = sentence.lower()
    doc = nlp(sentence)
    the_real_final_food_items = []  
    for val in final_food_items:
        for val1 in val:
            the_real_final_food_items.append(val1)
    
    #print(the_real_final_food_items)
    for token in doc:
        check_token = str(token)
        #print(token)
        if check_token in the_real_final_food_items:
            #print(check_token, '(found)')
            temp_data.append((str(token), token.tag_, 'B-foo'))
        elif check_token == 'of':
            temp_data.append((str(token), token.tag_, 'O'))
        else:
            #print(check_token,'(not found)')
            temp_data.append((str(token), token.tag_, 'O'))
    TRAIN_DATA.append(temp_data)
    temp_data = []
    
    
    

In [50]:
sentences = TRAIN_DATA

In [51]:
print(len(sentences))

300001


In [52]:
# import spacy

# nlp = spacy.load("en_core_web_md")

# # for val in train_list:
# #     doc = nlp(val)

# #     for token in doc:
# #         print(token.text, token.tag_)

# doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# for token in doc:
#     print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
#             token.shape_, token.is_alpha, token.is_stop)

In [53]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]


In [54]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]


In [55]:
from sklearn_crfsuite import CRF

crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

In [56]:
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report


In [57]:
# pred = cross_val_predict(estimator=crf, X=X, y=y, cv=4)

In [58]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [59]:
crf.fit(X_train ,y_train)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=False,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [22]:
print(crf.classes_)

['O', 'B-foo']


In [23]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-foo']

In [60]:
from sklearn_crfsuite import metrics
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.9230402890583345

In [97]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

       B-foo      0.921     0.925     0.923     96363

   micro avg      0.921     0.925     0.923     96363
   macro avg      0.921     0.925     0.923     96363
weighted avg      0.921     0.925     0.923     96363



### Test on Yelp Reviews Data

In [61]:
test_list = [
    'Taco Dale is my go to place for having Mexican food, we love their burrito, Tacos, basically everything They have a great staff who hears you out when placing the order and cooks it perfect as per your needs',
    "My new favorite place I get tacos from around the area and today was my 3rd day in a row, here. All I've had are the tacos. If they're this great, I can't even imagine the rest of the menu! Great restaurant!",
    'My favorite "fast food " Mexican restaurant! Always hot, fresh and tasty!!! The eclairs and shrimp tacos are the best here never dry or overcooked like some other restaurants do.',
    'They have always had the very top tier, best fortune cookies I’ve ever found. They also have the chicken fried rice, which is also really good. ',
    'Good flavor but way overpriced and portions are very small. The crab rangoon is almost all bread, with a miniscule amount of filling. The white rice portion is not even enough to feed one person',
    "Disgusting food!I'm not sure how they have 4.3 rating.The presentation is bad, the taste is bad, General Tso's chicken doesn't even taste like chicken. I'm never going there again",
    'I had ordered 3 lunch specials  that included pastries. Nice portion size btw. Ordered the Orange Chicken, Mongolian Beef, Beef and Broccoli. The Orange Chicken was delicious. Will definitely order again. The Mongolian Beef was kinda weird tasting and had way more onions then beef. not worth getting, unless you want just the onions. :( The Beef and Broccoli was also delicious. A little salty, if your watching your sodium I suggest you skip this dish. However it was good. Oh for got to mention the meat on both beefs where very chewy for some reason. My kids liked the rice, however I didnt enjoy it. To many chunks of onions. I like the fried rice with bean sprouts. Also no egg roll or pop with lunch specials in case you wanted to know. '
]

In [62]:
import spacy

nlp = spacy.load("en_core_web_md")
# Word, POS, Tag
TEST_DATA = []
temp_data = []
# Iterate through each review
for sentence in test_list:

    # Get rid of leading/trailing spaces plus '\n'
    sentence = sentence.strip()
    
    # Code to get the all values inside '<START:food>' and '<END>'
    split = sentence.split(start)
    food_items = []
    for i in range(0,len(split)-1):
        food_items.append(split[i+1].split(end)[0].strip())
    #print('food items:', food_items)

    # final_food_items = []
    final_food_items = [val.split() for val in food_items]
    #print('food items:', final_food_items)

    # Removing all marker Keywords, removing extra spaces, and sentence lowercase
    sentence = replace_all(sentence, replace_vals)
    #print(sentence)
    #sentence = ' '.join(sentence.split())
    # sentence = sentence.lower()
    doc = nlp(sentence)
    the_real_final_food_items = []  
    for val in final_food_items:
        for val1 in val:
            the_real_final_food_items.append(val1)
    
    #print(the_real_final_food_items)
    for token in doc:
        check_token = str(token)
        #print(token)
        if check_token in the_real_final_food_items:
            #print(check_token, '(found)')
            temp_data.append((str(token), token.tag_, 'B-foo'))
        elif check_token == 'of':
            temp_data.append((str(token), token.tag_, 'O'))
        else:
            #print(check_token,'(not found)')
            temp_data.append((str(token), token.tag_, 'O'))
    TEST_DATA.append(temp_data)
    temp_data = []
    
    
    

In [65]:
X_test = [sent2features(s) for s in TEST_DATA]

In [66]:
y_pred = crf.predict(X_test)

In [70]:
# y_pred

In [67]:
# for ind, val in enumerate(TEST_DATA):
#     print(TEST_DATA[ind][0])

In [71]:
final_1 = []
for lst in TEST_DATA:
    for ele in lst:
        final_1.append(ele[0])
final_2 = []
for lst in y_pred:
    for ele in lst:
        final_2.append(ele[0:])
for val in zip(final_1, final_2):
    print(val)

('Taco', 'O')
('Dale', 'O')
('is', 'O')
('my', 'O')
('go', 'O')
('to', 'O')
('place', 'O')
('for', 'O')
('having', 'O')
('Mexican', 'O')
('food', 'O')
(',', 'O')
('we', 'O')
('love', 'O')
('their', 'O')
('burrito', 'O')
(',', 'O')
('Tacos', 'O')
(',', 'O')
('basically', 'O')
('everything', 'O')
('They', 'O')
('have', 'O')
('a', 'O')
('great', 'O')
('staff', 'O')
('who', 'O')
('hears', 'O')
('you', 'O')
('out', 'O')
('when', 'O')
('placing', 'O')
('the', 'O')
('order', 'O')
('and', 'O')
('cooks', 'O')
('it', 'O')
('perfect', 'O')
('as', 'O')
('per', 'O')
('your', 'O')
('needs', 'O')
('My', 'O')
('new', 'O')
('favorite', 'O')
('place', 'O')
('I', 'O')
('get', 'O')
('tacos', 'B-foo')
('from', 'O')
('around', 'O')
('the', 'O')
('area', 'O')
('and', 'O')
('today', 'O')
('was', 'O')
('my', 'O')
('3rd', 'O')
('day', 'O')
('in', 'O')
('a', 'O')
('row', 'O')
(',', 'O')
('here', 'O')
('.', 'O')
('All', 'O')
('I', 'O')
('ve', 'O')
('had', 'O')
('are', 'O')
('the', 'O')
('tacos', 'O')
('.', 'O')
(

In [72]:
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=False,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [73]:
import eli5

In [74]:
eli5.show_weights(crf, top=30)


From \ To,O,B-foo
O,1.253,-0.453
B-foo,-0.378,-0.363

Weight?,Feature
Weight?,Feature
+7.708,-1:word.lower():cracker
+6.250,word.lower():fast
+6.248,word.lower():good
+5.710,-1:word.lower():porterhouse
+5.543,word.lower():makes
+5.514,word.lower():chicago
+5.475,word.lower():great
+5.457,word.lower():street
+5.395,word.lower():chiles
+5.375,word.lower():dishes

Weight?,Feature
+7.708,-1:word.lower():cracker
+6.250,word.lower():fast
+6.248,word.lower():good
+5.710,-1:word.lower():porterhouse
+5.543,word.lower():makes
+5.514,word.lower():chicago
+5.475,word.lower():great
+5.457,word.lower():street
+5.395,word.lower():chiles
+5.375,word.lower():dishes

Weight?,Feature
+10.110,word.lower():bitterness
+8.755,word.lower():microwaves
+7.790,word.lower():blond
+7.754,word.lower():anglaise
+7.718,word.lower():billy
+7.532,word.lower():betty
+7.393,word.lower():copper
+7.290,word.lower():calf
+7.202,word.lower():picnics
+7.173,word.lower():dehydration


In [74]:
# Testing on yelp data
import pandas as pd

In [75]:
df = pd.read_csv('./reviews.csv')

In [76]:
df.head()

Unnamed: 0.1,Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,0,xQY8N_XvtGbearJ5X4QryQ,OwjRMXRC0KyPrIlcjaXeFQ,-MhfebM0QIsKt87iDN-FNw,2,5,0,0,"As someone who has worked with many museums, I...",2015-04-15 05:21:16
1,1,UmFMZ8PyXZTY2QcwzsfQYA,nIJD_7ZXHq-FX8byPMOkMQ,lbrU8StCq3yDfr-QMnGrmQ,1,1,1,0,I am actually horrified this place is still in...,2013-12-07 03:16:52
2,2,LG2ZaYiOgpr2DK_90pYjNw,V34qejxNsCbcgD8C0HVk-Q,HQl28KMwrEKHqhFrrDqVNQ,5,1,0,0,I love Deagan's. I do. I really do. The atmosp...,2015-12-05 03:18:11
3,3,i6g_oA9Yf9Y31qt0wibXpw,ofKDkJKXSKZXu5xJNGiiBQ,5JxlZaqCnk1MnbgRirs40Q,1,0,0,0,"Dismal, lukewarm, defrosted-tasting ""TexMex"" g...",2011-05-27 05:30:52
4,4,6TdNDKywdbjoTkizeMce8A,UgMW8bLE0QMJDCkQ1Ax5Mg,IS4cv902ykd8wj1TR0N3-A,4,0,0,0,"Oh happy day, finally have a Canes near my cas...",2017-01-14 21:56:57


In [77]:
df1 = df['text'][0:3]

In [78]:
df1

0    As someone who has worked with many museums, I...
1    I am actually horrified this place is still in...
2    I love Deagan's. I do. I really do. The atmosp...
Name: text, dtype: object