# Train Naïve Bayes Classifier for Entity Extraction

**1. Load libraries**

In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

**2. Load data**

In [23]:
# Locations of the tokenized training and evaluation CSV files, relative to
# the notebook's working directory.
train_input_path = "../data/tokenized_data_v1.csv"
eval_input_path = "../data/tokenized_data_eval.csv"

# Read both files with pandas' default CSV settings.
df_tokenized = pd.read_csv(train_input_path)
df_tokenized_eval = pd.read_csv(eval_input_path)

# Report the row count of each frame, formatted with thousands separators.
print('Training Dataset Size: ', f"{len(df_tokenized):,}")
print('Evaluation Dataset Size: ', f"{len(df_tokenized_eval):,}")

Training Dataset Size:  18,648,228
Evaluation Dataset Size:  827


In [24]:
# Preview the first five rows of the training frame.
df_tokenized.head(n=5)

Unnamed: 0,Sentence,Word,Entity,POS
0,1,help,O,VERB
1,1,me,O,PRON
2,1,automatically,O,ADV
3,1,turn,ACTION,VERB
4,1,the,O,PRON


**3. Separate features and label**

In [8]:
# Feature frame: every column of the training data except the 'Entity' label.
# NOTE(review): this keeps the 'Sentence' column as a model input — confirm
# that a per-sentence number is really meant to be a feature.
X = df_tokenized.drop(columns=['Entity'])
X.head()

Unnamed: 0,Sentence,Word
0,1,help
1,1,me
2,1,automatically
3,1,turn
4,1,the


In [9]:
# Turn each row (as a dict) into a feature vector: string-valued columns are
# one-hot encoded, numeric columns are passed through as-is.
# A sparse matrix is requested so the encoded features stay compact; a dense
# (rows x distinct feature values) array does not scale to the multi-million
# row training file. train_test_split and MultinomialNB accept sparse input.
v = DictVectorizer(sparse=True)
# NOTE: X is rebound here from a DataFrame to the encoded matrix, so this cell
# cannot be re-run on its own — re-run the cell that builds X first.
X = v.fit_transform(X.to_dict('records'))
X.shape

(827, 261)

In [10]:
# Target vector: the 'Entity' label of every training row.
y = df_tokenized['Entity'].values

In [11]:
# Sorted array of the distinct entity labels present in y.
classes = np.unique(y)

In [12]:
# Plain Python list of the distinct entity labels, used further down as the
# `classes` / `labels` argument. It is derived from y directly so the cell can
# be re-run safely: calling `.tolist()` on `classes` itself raised
# AttributeError on a second execution, because `classes` was by then a list.
classes = np.unique(y).tolist()
classes

['ACTION',
 'DEVICE',
 'LOCATION',
 'O',
 'SERVICE',
 'STATE',
 'TIME',
 'UNIT',
 'VAL']

In [13]:
# Show the shapes of the feature matrix and the target vector side by side;
# their first dimensions (row counts) should agree.
print('Features Dimensions: ', X.shape)
print('Target Dimensions: ', y.shape)

Features Dimensions:  (827, 261)
Target Dimensions:  (827,)


**4. Train-Test Split**

In [15]:
# Hold out 33% of the rows as a test split; the fixed random_state makes the
# shuffle reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [16]:
# Shapes of the training portion of the split: (features, labels).
(X_train.shape, y_train.shape)

((554, 261), (554,))

**5. Train Naïve Bayes classifier**

In [17]:
# Multinomial Naive Bayes with additive smoothing parameter alpha=0.01.
nb = MultinomialNB(alpha=0.01)
# partial_fit is given the full label list up front, so the model's set of
# classes comes from `classes` rather than from y_train alone.
nb.partial_fit(X_train, y_train, classes=classes)

# Evaluation
### On test split

In [18]:
# Per-class precision / recall / F1 on the held-out test split.
# zero_division=0 reports 0.00 for labels with no predicted (or no true)
# samples — the same numbers as the default — without emitting
# UndefinedMetricWarning for every such label.
y_test_pred = nb.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred, labels=classes, zero_division=0))

              precision    recall  f1-score   support

      ACTION       1.00      0.80      0.89        25
      DEVICE       0.25      0.38      0.30         8
    LOCATION       1.00      0.41      0.58        17
           O       0.84      0.98      0.90       192
     SERVICE       0.00      0.00      0.00         1
       STATE       0.00      0.00      0.00         4
        TIME       0.00      0.00      0.00         2
        UNIT       0.00      0.00      0.00         2
         VAL       1.00      0.23      0.37        22

    accuracy                           0.82       273
   macro avg       0.45      0.31      0.34       273
weighted avg       0.83      0.82      0.79       273



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### On evaluation data

In [30]:
# Encode the evaluation rows with the vectorizer that was fitted on the
# training data (transform only, no re-fitting) so the feature columns line up
# with what the model was trained on.
# NOTE(review): the saved outputs show a training matrix of 827 rows, which is
# also the size of the evaluation set — confirm the model was not fitted on
# the evaluation file, otherwise the scores below are optimistic.
eval_records = df_tokenized_eval.drop(columns=['Entity']).to_dict('records')
X_eval = v.transform(eval_records)
y_eval = df_tokenized_eval['Entity'].values

# zero_division=0 keeps the reported numbers identical to the default while
# suppressing UndefinedMetricWarning for labels that are never predicted.
print(classification_report(y_true=y_eval, y_pred=nb.predict(X_eval), labels=classes, zero_division=0))

              precision    recall  f1-score   support

      ACTION       1.00      0.93      0.96        83
      DEVICE       0.79      0.87      0.83        39
    LOCATION       1.00      0.78      0.87        58
           O       0.92      0.99      0.96       546
     SERVICE       0.00      0.00      0.00         1
       STATE       0.40      0.20      0.27        10
        TIME       1.00      0.33      0.50         3
        UNIT       1.00      0.89      0.94        18
         VAL       1.00      0.75      0.86        69

    accuracy                           0.93       827
   macro avg       0.79      0.64      0.69       827
weighted avg       0.93      0.93      0.93       827



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
