In [26]:
import nltk
from tqdm.notebook import tqdm

from datasets import load_dataset
from nltk.corpus import stopwords
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from nltk.tokenize import word_tokenize
from string import punctuation
import numpy as np
from sklearn.preprocessing import StandardScaler
import pickle

In [27]:
# Fetch the NLTK resources this notebook relies on (tokenizer data and stopword lists).
nltk.download('punkt')
nltk.download("stopwords")
SEED = 0  # passed as random_state to the SVC
Features_count = 6  # number of values vectorize() returns per token
# Frozensets give constant-time membership tests; vectorize() checks both
# collections once per token, so a list scan here is paid for every token.
SW = frozenset(stopwords.words("english"))
PUNCT = frozenset(punctuation)



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/harshvive14/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/harshvive14/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
def createData(data):
    """Flatten a token-classification dataset into per-token arrays.

    Every record of ``data`` is read through its ``"tokens"`` and
    ``"ner_tags"`` keys. Each token is turned into a feature row by
    ``vectorize`` (its position is scaled by the sentence length), and its
    tag is collapsed to a binary target: 0 when the tag is <= 0, else 1.

    Returns
    -------
    tuple
        ``(words, features, labels)`` as NumPy arrays with dtypes
        ``object``, ``float32`` and ``float32`` respectively.
    """
    all_words = []
    feature_rows = []
    targets = []
    for record in tqdm(data):
        ner_tags = record["ner_tags"]
        sentence = record["tokens"]
        sentence_length = len(sentence)
        for position, token in enumerate(sentence):
            feature_rows.append(
                vectorize(w=token, scaled_position=(position / sentence_length))
            )
            targets.append(0 if ner_tags[position] <= 0 else 1)
        all_words.extend(sentence)
    return (
        np.asarray(all_words, dtype="object"),
        np.asarray(feature_rows, dtype=np.float32),
        np.asarray(targets, dtype=np.float32),
    )

def vectorize(w, scaled_position):
    """Build the hand-crafted feature list for a single token.

    Parameters
    ----------
    w : str
        Token text.
    scaled_position : float
        Position of the token within its sentence; callers pass
        ``index / sentence_length``.

    Returns
    -------
    list
        ``[title, allcaps, len(w), sw, punct, scaled_position]`` where
        ``title`` (first character uppercase), ``allcaps`` (``w.isupper()``),
        ``sw`` (lower-cased token found in ``SW``) and ``punct`` (token found
        in ``PUNCT``) are 0/1 integers.
    """
    # Slice instead of indexing so an empty token gives 0 rather than IndexError.
    title = int(w[:1].isupper())
    allcaps = int(w.isupper())
    sw = int(w.lower() in SW)
    punct = int(w in PUNCT)
    return [title, allcaps, len(w), sw, punct, scaled_position]


def infer(model, scaler, s):
    """Tokenize a string and predict a label for each token.

    Parameters
    ----------
    model
        Object exposing ``predict(X)``.
    scaler
        Object exposing ``transform(X)``; applied to the features before
        prediction.
    s : str
        Raw input text, split with ``word_tokenize``.

    Returns
    -------
    tuple
        ``(pred, tokens, features)``: the output of ``model.predict``, the
        token list, and the unscaled float32 feature matrix. When ``s``
        yields no tokens, ``pred`` is an empty float32 array and ``features``
        has shape ``(0, Features_count)``.
    """
    tokens = word_tokenize(s)
    n_tokens = len(tokens)
    if n_tokens == 0:
        # An empty token list would produce a 1-D empty array, which the
        # scaler's transform rejects; return empty results instead of raising.
        return (
            np.empty(0, dtype=np.float32),
            tokens,
            np.empty((0, Features_count), dtype=np.float32),
        )
    features = np.asarray(
        [
            vectorize(w=token, scaled_position=(index / n_tokens))
            for index, token in enumerate(tokens)
        ],
        dtype=np.float32,
    )
    pred = model.predict(scaler.transform(features))
    return pred, tokens, features


In [29]:
data = load_dataset("conll2003")

# Record fields: 'id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'

data_train = data["train"]
data_val   = data["validation"]
data_test  = data["test"]

words_train, X_train, y_train = createData(data_train)
words_val, X_val, y_val       = createData(data_val)
words_test, X_test, y_test    = createData(data_test)

# Fit the scaler on the training split only, then apply it to every split.
scaler = StandardScaler()
scaler = scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val   = scaler.transform(X_val)
X_test  = scaler.transform(X_test)

model = SVC(C = 1.0, kernel = "linear", class_weight = "balanced", random_state = SEED, verbose = True)

# C: regularization parameter.
# verbose: turns on libsvm's own training log (the "[LibSVM]" output).

model.fit(X_train, y_train) # 'MODEL-TRAINING'
y_pred_val = model.predict(X_val)

# Persist both artifacts; `with` closes (and flushes) each file even if dump fails.
nei_model_name = 'nei_model.sav'
with open(nei_model_name, 'wb') as model_file:
    pickle.dump(model, model_file)

scaler_model_name = 'scaler_model.sav'
with open(scaler_model_name, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

print(classification_report(y_true = y_val, y_pred = y_pred_val))


Found cached dataset conll2003 (/Users/harshvive14/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/14041 [00:00<?, ?it/s]

  0%|          | 0/3250 [00:00<?, ?it/s]

  0%|          | 0/3453 [00:00<?, ?it/s]

[LibSVM]..................
*.........
*
optimization finished, #iter = 27236
obj = -15443.285139, rho = -1.088682
nSV = 17088, nBSV = 9558
Total nSV = 17088
              precision    recall  f1-score   support

         0.0       0.99      0.96      0.98     42759
         1.0       0.82      0.97      0.89      8603

    accuracy                           0.96     51362
   macro avg       0.91      0.96      0.93     51362
weighted avg       0.96      0.96      0.96     51362



In [30]:
# NOTE(review): `st` is not imported anywhere in this notebook, so this cell
# raises NameError (see the recorded output). The calls look like Streamlit UI
# code (presumably `import streamlit as st`) meant for an app script rather
# than a notebook cell — confirm and move or remove.
st.title("Named-Entity Identification")
# Named `user_text` so the builtin input() is not shadowed in the kernel.
user_text = st.text_input("Enter input string here: ")
if st.button("Process Text"):
    pred, tokens, features = infer(model, scaler, user_text)
    # Tag each token with its predicted label as "<token>_<label>".
    annotated = [f"{w}_{int(p)}" for w, p in zip(tokens, pred)]
    output = " ".join(annotated)
    st.write(output)


NameError: name 'st' is not defined

In [32]:
# Reload the saved model and scaler from disk and run inference on one sample sentence.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
# Named `sample_text` so the builtin input() is not shadowed in the kernel.
sample_text = "Harsh Vivek is smart and awesome"
with open("nei_model.sav", 'rb') as model_file:
    nei_model = pickle.load(model_file)
with open("scaler_model.sav", 'rb') as scaler_file:
    scaler_model = pickle.load(scaler_file)

pred, tokens, features = infer(nei_model, scaler_model, sample_text)

In [33]:
# Display the predicted labels, the tokens, and the unscaled feature rows returned by infer().
pred, tokens, features

(array([1., 1., 0., 0., 0., 0.], dtype=float32),
 ['Harsh', 'Vivek', 'is', 'smart', 'and', 'awesome'],
 array([[1.        , 0.        , 5.        , 0.        , 0.        ,
         0.        ],
        [1.        , 0.        , 5.        , 0.        , 0.        ,
         0.16666667],
        [0.        , 0.        , 2.        , 1.        , 0.        ,
         0.33333334],
        [0.        , 0.        , 5.        , 0.        , 0.        ,
         0.5       ],
        [0.        , 0.        , 3.        , 1.        , 0.        ,
         0.6666667 ],
        [0.        , 0.        , 7.        , 0.        , 0.        ,
         0.8333333 ]], dtype=float32))

In [34]:
# Pair every token with its predicted label as "<token>_<label>", then join with spaces.
annotated = [f"{w}_{int(p)}" for w, p in zip(tokens, pred)]
output = " ".join(annotated)

In [35]:
# Display the annotated sentence built in the previous cell.
output

'Harsh_1 Vivek_1 is_0 smart_0 and_0 awesome_0'