# CRF Hindi POS Tagger

In [None]:
!pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/95/99/869dde6dbf3e0d07a013c8eebfb0a3d30776334e0097f8432b631a9a3a19/python_crfsuite-0.9.7-cp36-cp36m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 9.7MB/s 
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6


In [None]:
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score, RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

import pandas as pd
from io import StringIO

from collections import Counter

In [None]:
# Importing the dataset: open Colab's file-upload widget and keep what it returns.
# `uploaded` is indexed by file name below and each value is .decode()'d there,
# so the values are the raw file contents.
from google.colab import files
uploaded = files.upload()

Saving hi-ud-test .conllu to hi-ud-test .conllu
Saving hi-ud-train.conllu to hi-ud-train.conllu


In [None]:
# Decode the uploaded contents to text. The test key contains a space before
# ".conllu" because that is the name the file was uploaded under.
train_data = uploaded['hi-ud-train.conllu'].decode("utf-8")
test_data = uploaded['hi-ud-test .conllu'].decode("utf-8")

# NOTE(review): the training file is parsed as comma-separated while the test
# file is parsed as tab-separated — presumably the two files really use
# different delimiters; confirm against the data before changing either.
train_df = pd.read_csv(StringIO(train_data), sep=',')
test_df = pd.read_csv(StringIO(test_data), sep='\t')

### Pre-Processing Data

In [None]:
def preProcessDF(df):
  """Split a token-per-row DataFrame into sentences and their tag sequences.

  Columns are read by position: column 0 decides whether a row is a token
  (non-missing) or a sentence break (missing), column 1 holds the word and
  column 2 holds the tag.

  Args:
    df: DataFrame with at least three columns laid out as described above.

  Returns:
    A pair (sentences, tags): two parallel lists holding, per sentence, the
    list of word strings and the list of tags.
  """
  all_sentences, all_tags = [], []
  current_words, current_tags = [], []

  def flush():
    # Close the sentence being built, if any, and start a fresh one.
    if current_words:
      all_sentences.append(list(current_words))
      all_tags.append(list(current_tags))
      current_words.clear()
      current_tags.clear()

  for row in range(len(df)):
    # A missing value in the first column marks a sentence boundary.
    if pd.isna(df.iloc[row, 0]):
      flush()
      continue

    tag = df.iloc[row, 2]
    # Rows tagged COMMA carry no usable word, so the literal ',' is used instead.
    current_words.append(',' if tag == 'COMMA' else str(df.iloc[row, 1]))
    current_tags.append(tag)

  # The data may end without a trailing boundary row.
  flush()

  return all_sentences, all_tags

### Features

<b>word.lower:</b> word in lowercase. <br />
<b>word:</b> word without any modifications as it contains important information (eg: information about matras). <br />
<b>is_first:</b> True if the word is at the start of the sentence. Some parts of speech usually appear at the start (e.g., in Hindi sentences the first word is usually a noun). <br />
<b>is_last:</b> True if the word is at the end of the sentence. Some parts of speech usually appear at the end. <br />
<b>prev_word and next_word:</b> word preceding and following the current word. POS of a word depends on the context. <br />
<b>prev_prev_word and next_next_word: </b> the (i-2)th and (i+2)th words when the current word is at position i. The POS of a word depends on its context. This also helps with words that appear in pairs, share the same POS tag, and are separated by a special character (e.g., din-rath). <br />
<b>prefix and suffix: </b> prefixes and suffixes are usually 1-4 characters long, so we take every length from 1 to 4. Words with similar prefixes and suffixes usually have similar POS tags. Adding a prefix or suffix to a word can also change its POS tag. <br />
<b>is_number: </b> True if the current word is numeric. If the word is numeric then its pos tag will be NUM. <br />
<b>is_alphanumeric: </b> True if the current word is alphanumeric. If a word is not alphanumeric, then it may be a special symbol such as COMMA, FULL STOP, etc. <br />

In [None]:
def extractTokenFeatures(sentence, index):
  """Build the feature dictionary for the token at `index` in `sentence`.

  Args:
    sentence: list of word strings.
    index: position of the token to describe.

  Returns:
    A dict with the token itself, its lowercase form, position flags, the
    words one and two places to either side ('' when out of range), every
    prefix and suffix of length 1 to 4, and two character-class flags.
  """
  token = sentence[index]
  last = len(sentence) - 1

  features = {'word': token, 'word.lower': token.lower()}

  # Position of the token inside the sentence.
  features['is_first'] = index == 0
  features['is_last'] = index == last

  # Context window of two words on each side; '' stands in for a missing neighbour.
  features['prev_word'] = sentence[index - 1] if index != 0 else ''
  features['next_word'] = sentence[index + 1] if index != last else ''
  features['prev_prev_word'] = sentence[index - 2] if index > 1 else ''
  features['next_next_word'] = sentence[index + 2] if index < last - 1 else ''

  # Leading and trailing character n-grams, n = 1..4.
  for size in range(1, 5):
    features['prefix-%d' % size] = token[:size]
  for size in range(1, 5):
    features['suffix-%d' % size] = token[-size:]

  features['is_number'] = token.isdigit()
  features['is_alphanumeric'] = token.isalnum()

  return features

In [None]:
def convertToInputFormat(sentences, tags_list):
  """Turn tokenised sentences and their tags into parallel feature/label lists.

  Args:
    sentences: list of sentences, each a list of word strings.
    tags_list: list of tag lists, indexed the same way as `sentences`.

  Returns:
    A pair (X, Y): X holds one list of per-token feature dicts per sentence,
    and Y holds the tag of each of those tokens in the same order.
  """
  X, Y = [], []
  for sent_idx, words in enumerate(sentences):
    # Pair each token's features with the tag found at the same position.
    pairs = [
        (extractTokenFeatures(words, pos), tags_list[sent_idx][pos])
        for pos in range(len(words))
    ]
    X.append([feats for feats, _ in pairs])
    Y.append([tag for _, tag in pairs])
  return X, Y

In [None]:
# Group the rows of each frame into sentences with their tag sequences.
train_sentences, train_tags_list = preProcessDF(train_df)
test_sentences, test_tags_list = preProcessDF(test_df)

# Convert every sentence into per-token feature dicts (X) and labels (Y).
X_train, Y_train = convertToInputFormat(train_sentences, train_tags_list)
X_test, Y_test = convertToInputFormat(test_sentences, test_tags_list)

### CRF Model

In [None]:
# CRF trained with L-BFGS. The c1 and c2 values (L1 and L2 regularisation
# coefficients in sklearn-crfsuite) are the ones recorded from the randomized
# hyperparameter search that is kept, commented out, later in this notebook.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.018407615768017527,
    c2=0.07875523994164575,
    max_iterations=100,
    all_possible_transitions=True
)

In [None]:
# Train the model on the training sentences.
crf.fit(X_train, Y_train)

# Labels exposed by the fitted model; passed as `labels=` to every metric call below.
class_labels = crf.classes_

In [None]:
# The commented-out code below was used to tune the hyperparameters.
# The search selected c1 = 0.018407615768017527 and c2 = 0.07875523994164575.
# params_space = {
#     'c1': scipy.stats.expon(scale=0.5),
#     'c2': scipy.stats.expon(scale=0.05),
# }

# f1_scorer = make_scorer(metrics.flat_f1_score,
#                         average='weighted', labels=class_labels)

# rs = RandomizedSearchCV(crf, params_space,
#                         cv=3,
#                         verbose=1,
#                         n_jobs=-1,
#                         n_iter=50,
#                         scoring=f1_scorer)
# rs.fit(X_train, Y_train)

# print('best params:', rs.best_params_)
# print('best CV score:', rs.best_score_)
# print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

In [None]:
# crf = sklearn_crfsuite.CRF(
#     algorithm='lbfgs',
#     c1=rs.best_params_['c1'],
#     c2=rs.best_params_['c2'],
#     max_iterations=100,
#     all_possible_transitions=True
# )

# crf.fit(X_train, Y_train)

In [None]:
# Predict tags for the training sentences and report the score on them.
Y_train_pred = crf.predict(X_train)
# flat_f1_score with average='weighted' is a support-weighted F1, not an
# accuracy, so the printed label names the metric that is actually computed.
print('Training weighted F1: ' + str(metrics.flat_f1_score(Y_train, Y_train_pred, average='weighted', labels=class_labels)))

Training Accuracy: 0.9997371965633727


In [None]:
# Predict tags for the held-out test sentences and report the score on them.
Y_test_pred = crf.predict(X_test)
# flat_f1_score with average='weighted' is a support-weighted F1, not an
# accuracy, so the printed label names the metric that is actually computed.
print('Test weighted F1: ' + str(metrics.flat_f1_score(Y_test, Y_test_pred, average='weighted', labels=class_labels)))

Test Accuracy: 0.8622969916898243


  average, "true nor predicted", 'F-score is', len(true_sum)


In [None]:
# Per-tag precision / recall / F1 on the training data, one row per entry of class_labels.
print('Precision, Recall, F-Score per token for training data')
print(metrics.flat_classification_report(Y_train, Y_train_pred, labels=class_labels, digits=3))

Precision, Recall, F-Score per token for training data
              precision    recall  f1-score   support

         DET      1.000     1.000     1.000       231
       PROPN      1.000     1.000     1.000       708
         ADP      1.000     1.000     1.000      1387
         ADV      1.000     1.000     1.000       111
         ADJ      1.000     1.000     1.000       570
        NOUN      1.000     1.000     1.000      1597
         NUM      1.000     1.000     1.000       152
         AUX      0.997     1.000     0.999       730
       PUNCT      1.000     1.000     1.000       564
        PRON      1.000     1.000     1.000       431
        VERB      1.000     0.997     0.998       640
       CCONJ      1.000     1.000     1.000       150
        PART      1.000     1.000     1.000       163
       COMMA      1.000     1.000     1.000       114
       SCONJ      1.000     1.000     1.000        61
           X      1.000     1.000     1.000         2

    accuracy             

In [None]:
# Per-tag precision / recall / F1 on the test data. class_labels comes from the
# fitted model, so tags that never occur in the test set still get a row
# (support 0 in the report below).
print('Precision, Recall, F-Score per token for test data')
print(metrics.flat_classification_report(Y_test, Y_test_pred, labels=class_labels, digits=3))

Precision, Recall, F-Score per token for test data
              precision    recall  f1-score   support

         DET      0.842     0.889     0.865        36
       PROPN      0.670     0.531     0.592       145
         ADP      0.968     0.974     0.971       309
         ADV      0.733     0.524     0.611        21
         ADJ      0.698     0.713     0.705        94
        NOUN      0.784     0.894     0.835       329
         NUM      0.957     0.880     0.917        25
         AUX      0.971     0.957     0.964       139
       PUNCT      1.000     0.837     0.911       135
        PRON      0.948     0.846     0.894        65
        VERB      0.914     0.859     0.885        99
       CCONJ      1.000     1.000     1.000        25
        PART      1.000     1.000     1.000        33
       COMMA      0.000     0.000     0.000         0
       SCONJ      0.750     1.000     0.857         3
           X      0.000     0.000     0.000         0

   micro avg      0.858     0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
def printTransitions(transitions):
    """Print one line per transition in the form '<from> -> <to>, weight: <w>'.

    Args:
        transitions: iterable of ((label_from, label_to), weight) pairs.
    """
    for edge, weight in transitions:
        source_tag, target_tag = edge
        arrow = " -> ".join([source_tag, target_tag])
        print(arrow + ", weight: " + str(weight))

# Counter.most_common orders entries by their value, which here is the
# transition weight — so this lists the 10 highest-weighted transitions,
# not the most frequent ones.
print('10 most common transition features')
printTransitions(Counter(crf.transition_features_).most_common(10))
print()

# The tail of the same ordering: the 10 lowest-weighted transitions.
print('10 least common transition features')
printTransitions(Counter(crf.transition_features_).most_common()[-10:])

10 most common transition features
VERB -> AUX, weight: 4.013814
PROPN -> PROPN, weight: 2.722663
ADJ -> NOUN, weight: 2.461515
NUM -> NOUN, weight: 2.190993
DET -> NOUN, weight: 1.993408
PROPN -> ADP, weight: 1.885115
NOUN -> ADP, weight: 1.836157
AUX -> AUX, weight: 1.816286
PRON -> ADP, weight: 1.809544
PART -> NUM, weight: 1.533121

10 least common transition features
PROPN -> NOUN, weight: -1.139805
PRON -> CCONJ, weight: -1.194631
VERB -> VERB, weight: -1.225076
CCONJ -> AUX, weight: -1.229738
PROPN -> DET, weight: -1.336203
PROPN -> AUX, weight: -1.37209
CCONJ -> PART, weight: -1.443849
ADJ -> PRON, weight: -1.661681
ADJ -> ADP, weight: -2.021156
DET -> ADP, weight: -2.213404
