In [9]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import itertools
import math

# sklearn
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score

In [11]:
df = pd.read_csv('/content/ner-train.csv', encoding='latin1')
df.loc[-1] = df.columns.values
df.sort_index(inplace=True)
df.reset_index(drop=True, inplace=True)
df.columns = ['Sentence #','Word','POS','Tag','cx']
df.drop(['cx'],inplace=True, axis=1)
df

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
...,...,...,...,...
875417,Sentence: 39999,by,IN,O
875418,Sentence: 39999,Imperial,NNP,B-org
875419,Sentence: 39999,Sugar,NNP,I-org
875420,Sentence: 39999,Company,NN,I-org


In [12]:
X = df.drop(['Tag', 'Sentence #'], axis=1)
X.head()

Unnamed: 0,Word,POS
0,Thousands,NNS
1,of,IN
2,demonstrators,NNS
3,have,VBP
4,marched,VBN


In [13]:
v = DictVectorizer(sparse=True)
X = v.fit_transform(X.to_dict('records'))
X.shape

(875422, 32410)

In [14]:
y = df.Tag.values
classes = np.unique(y)
classes = classes.tolist()
new_classes = classes.copy()
new_classes.pop()
new_classes

['B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim']

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.1, random_state=0)
X_train.shape, y_train.shape

((787879, 32410), (787879,))

In [16]:
X_val.shape, y_val.shape

((87543, 32410), (87543,))

## Model

In [17]:
sgd = SGDClassifier()
sgd.partial_fit(X_train, y_train, classes)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [18]:
y_val_pred = sgd.predict(X_val)

In [19]:
print(classification_report(y_val, y_val_pred, labels=new_classes))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        40
       B-eve       0.30      0.12      0.18        24
       B-geo       0.60      0.91      0.73      3115
       B-gpe       0.93      0.72      0.81      1316
       B-nat       0.00      0.00      0.00        16
       B-org       0.78      0.42      0.55      1673
       B-per       0.81      0.56      0.66      1407
       B-tim       0.89      0.65      0.75      1655
       I-art       0.00      0.00      0.00        28
       I-eve       0.71      0.19      0.29        27
       I-geo       0.78      0.60      0.68       612
       I-gpe       1.00      0.10      0.18        10
       I-nat       0.00      0.00      0.00         3
       I-org       0.68      0.54      0.60      1399
       I-per       0.76      0.62      0.68      1455
       I-tim       0.45      0.03      0.05       533

   micro avg       0.72      0.63      0.67     13313
   macro avg       0.54   

## Inference

In [39]:
infer_sentences = ['The lands that today comprise Croatia were part of the Austro-Hungarian Empire until the close of World War I .',
                   "Much of New Orleans sits below sea level , and the levees ' failure during Hurricane Katrina put 80 percent of the city underwater .",
                   "A separate report says the number of people who lost jobs because of Hurricanes <B-nat> Katrina <I-nat> , Rita <B-nat> , and Wilma <B-org> now exceeds $ 6,00,000 .",
                   "Google is in New York, London, Paris and Tokyo .",
                   "Our homework is due on Wednesday, February 1st .",
                   "Donald Trump is the president of United States .",
                   "I love Indian food ."]

for sent in infer_sentences:

  t = sent.split()
  
  infer_dict = {
      'Word': t
  }

  infer_df = pd.DataFrame(infer_dict)

  infer_v = v.transform(infer_df.to_dict('records'))

  infer_pred = sgd.predict(infer_v)

  print('-'*50)
  print(t)
  print(infer_pred)
  print('-'*50)

--------------------------------------------------
['The', 'lands', 'that', 'today', 'comprise', 'Croatia', 'were', 'part', 'of', 'the', 'Austro-Hungarian', 'Empire', 'until', 'the', 'close', 'of', 'World', 'War', 'I', '.']
['O' 'O' 'O' 'B-tim' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O'
 'O' 'O' 'O']
--------------------------------------------------
--------------------------------------------------
['Much', 'of', 'New', 'Orleans', 'sits', 'below', 'sea', 'level', ',', 'and', 'the', 'levees', "'", 'failure', 'during', 'Hurricane', 'Katrina', 'put', '80', 'percent', 'of', 'the', 'city', 'underwater', '.']
['O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O'
 'O' 'O' 'O' 'O' 'O' 'O' 'O']
--------------------------------------------------
--------------------------------------------------
['A', 'separate', 'report', 'says', 'the', 'number', 'of', 'people', 'who', 'lost', 'jobs', 'because', 'of', 'Hurricanes', '<B-nat>', 'Katrina', '<I-nat>', ',', 'Rita', '<B-

## Evaluate on Test Set

In [23]:
df_test = pd.read_csv('/content/ner-test.csv', encoding='latin1')
df_test.loc[-1] = df_test.columns.values
df_test.sort_index(inplace=True)
df_test.reset_index(drop=True, inplace=True)
df_test.columns = ['Sentence #','Word','POS','Tag','cx']
df_test.drop(['cx'],inplace=True, axis=1)
df_test

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 40000,A,DT,O
1,Sentence: 40000,company,NN,O
2,Sentence: 40000,spokesman,NN,O
3,Sentence: 40000,says,VBZ,O
4,Sentence: 40000,the,DT,O
...,...,...,...,...
173148,Sentence: 47959,they,PRP,O
173149,Sentence: 47959,responded,VBD,O
173150,Sentence: 47959,to,TO,O
173151,Sentence: 47959,the,DT,O


In [24]:
X_test = df_test.drop('Tag', axis=1)
X_test_v = v.transform(X_test.to_dict('records'))

In [25]:
y_test = df_test.Tag.values

In [26]:
X_test.shape, y_test.shape

((173153, 3), (173153,))

In [27]:
y_pred = sgd.predict(X_test_v)

In [28]:
print(classification_report(y_test, y_pred, labels=new_classes))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-art       0.25      0.03      0.05        75
       B-eve       0.54      0.34      0.42        41
       B-geo       0.59      0.92      0.72      6194
       B-gpe       0.95      0.73      0.83      2757
       B-nat       0.82      0.39      0.53        36
       B-org       0.79      0.43      0.56      3400
       B-per       0.83      0.56      0.67      2784
       B-tim       0.90      0.65      0.75      3431
       I-art       0.00      0.00      0.00        41
       I-eve       1.00      0.17      0.29        35
       I-geo       0.77      0.61      0.68      1142
       I-gpe       0.00      0.00      0.00        40
       I-nat       0.00      0.00      0.00        10
       I-org       0.71      0.53      0.61      2880
       I-per       0.77      0.60      0.68      2820
       I-tim       0.53      0.03      0.05      1102

   micro avg       0.72      0.63      0.67     26788
   macro avg       0.59   

In [29]:
# compact

# function
def new_ents(tag):
  if tag == 'O':
    return 'O'
  else:
    return tag[-3:]

df_test['Pred_Tag'] = y_pred

# apply on column with new name
df_test['True_Tag2'] = df_test['Tag'].apply(lambda x: new_ents(x))
df_test['Pred_Tag2'] = df_test['Pred_Tag'].apply(lambda x: new_ents(x))

# get tags that are not O
not_o = df_test[df_test.True_Tag2 != 'O'].index

# true and pred of those tags
true_tag_not_o = df_test.loc[not_o, 'True_Tag2']
pred_tag_not_o = df_test.loc[not_o, 'Pred_Tag2']

# measure
print(classification_report(true_tag_not_o, pred_tag_not_o))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           O       0.00      0.00      0.00         0
         art       0.02      0.02      0.02       116
         eve       0.75      0.28      0.40        76
         geo       0.66      0.91      0.76      7336
         gpe       0.95      0.73      0.82      2797
         nat       0.94      0.35      0.51        46
         org       0.88      0.53      0.66      6280
         per       0.93      0.67      0.78      5604
         tim       0.99      0.55      0.71      4533

    accuracy                           0.68     26788
   macro avg       0.68      0.45      0.52     26788
weighted avg       0.85      0.68      0.74     26788



In [36]:
1+1

2