# Conditional Random Fields

Many other models rely on stemming, lemmatization, or a plain vocabulary, which makes the data lose its sequential order or pattern and therefore results in a loss of information or context. A CRF instead exploits the sequential pattern: by also using the labels of the nearby data (which could be tokens or images), a lot more information is taken into account when predicting the labels for the desired column of data.

CRF uses parts of speech of the tokens created from our text to learn the patterns of how each of these tokens occurs in different contexts. It can then predict the labels on new data.

<b>In this application our aim is to train a model that identifies the account number in the text given as input. For this we will use sklearn_crfsuite and nltk.</b>

# Load necessary modules

In [None]:
!pip install sklearn-crfsuite
import sklearn_crfsuite
import nltk



# Initialize the training data

In [None]:
# Download the 'punkt' resource before calling nltk.word_tokenize below.
nltk.download('punkt')

# Three sample customer statements, each containing one account number.
stmt1 = 'My account number is SB100-abc-200'
stmt2 = 'Can you please tell the balance for CA499-243-520'
stmt3 = 'Why is there a debit on my account CC467-923-624 on 10-09-2018 ?'
stmtlist = [stmt1, stmt2, stmt3]

# Build a flat list that alternates, per statement, the token list and a
# same-length list of 'NAA' placeholder labels.
mydata = []
for stmt in stmtlist:
  stmtword = nltk.word_tokenize(stmt)
  stmtl = ['NAA'] * len(stmtword)
  mydata.extend([stmtword, stmtl])

print(mydata)

[['My', 'account', 'number', 'is', 'SB100-abc-200'], ['NAA', 'NAA', 'NAA', 'NAA', 'NAA'], ['Can', 'you', 'please', 'tell', 'the', 'balance', 'for', 'CA499-243-520'], ['NAA', 'NAA', 'NAA', 'NAA', 'NAA', 'NAA', 'NAA', 'NAA'], ['Why', 'is', 'there', 'a', 'debit', 'on', 'my', 'account', 'CC467-923-624', 'on', '10-09-2018', '?'], ['NAA', 'NAA', 'NAA', 'NAA', 'NAA', 'NAA', 'NAA', 'NAA', 'NAA', 'NAA', 'NAA', 'NAA']]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Labelled training set: one (tokens, labels) pair per sentence. Labels line
# up with tokens by position; the account-number token is tagged 'ACCT' and
# every other token 'NAA'.
data = [
  (
    ['My', 'account', 'number', 'is', 'SB100-abc-200'],
    ['NAA'] * 4 + ['ACCT'],
  ),
  (
    ['Can', 'you', 'please', 'tell', 'the', 'balance', 'for', 'CA499-243-520'],
    ['NAA'] * 7 + ['ACCT'],
  ),
  (
    ['Why', 'is', 'there', 'a', 'debit', 'on', 'my', 'account', 'CC467-923-624', 'on', '10-09-2018', '?'],
    ['NAA'] * 8 + ['ACCT'] + ['NAA'] * 3,
  ),
]


# Create the corpus

In [None]:
def create_corpus(data):
  """Pair every token with its label, sentence by sentence.

  data is an iterable of (tokens, labels) pairs. The result is a list with
  one entry per pair, each entry being a list of (token, label) tuples in
  token order. Pairing uses zip(), so it stops at the shorter of the two
  sequences.
  """
  return [list(zip(tokens, labels)) for tokens, labels in data]

# Turn the labelled data into per-sentence lists of (token, label) tuples.
corpus = create_corpus(data)
print(corpus)

[[('My', 'NAA'), ('account', 'NAA'), ('number', 'NAA'), ('is', 'NAA'), ('SB100-abc-200', 'ACCT')], [('Can', 'NAA'), ('you', 'NAA'), ('please', 'NAA'), ('tell', 'NAA'), ('the', 'NAA'), ('balance', 'NAA'), ('for', 'NAA'), ('CA499-243-520', 'ACCT')], [('Why', 'NAA'), ('is', 'NAA'), ('there', 'NAA'), ('a', 'NAA'), ('debit', 'NAA'), ('on', 'NAA'), ('my', 'NAA'), ('account', 'NAA'), ('CC467-923-624', 'ACCT'), ('on', 'NAA'), ('10-09-2018', 'NAA'), ('?', 'NAA')]]


# Extract Features for training the model

In [None]:
def convert_document_to_feature_functions(document, i):
  """Build the feature dict for the token at position i of document.

  document is a sequence of (token, label) pairs; only the token element is
  read. The returned dict always contains 'Currword'. When i > 0 it contains
  'Prevword', otherwise 'BOS' set to True. When i is before the last index it
  contains 'Nextword', otherwise 'EOS' set to True.
  """
  features = {'Currword': document[i][0]}

  # Left context, or the beginning-of-sequence marker when there is none.
  if i <= 0:
    features['BOS'] = True
  else:
    features['Prevword'] = document[i - 1][0]

  # Right context, or the end-of-sequence marker when there is none.
  if i >= len(document) - 1:
    features['EOS'] = True
  else:
    features['Nextword'] = document[i + 1][0]

  return features


In [None]:
def extract_features(doc):
  """Return one feature dict per token of doc, in token order.

  doc is a sequence of (token, label) pairs. The document and its length are
  also printed as a trace line before the features are built.
  """
  print('From extract features: ', doc, ': length ' , len(doc))
  return [
      convert_document_to_feature_functions(doc, position)
      for position in range(len(doc))
  ]

In [None]:
# Featurize every training sentence: X holds one list of feature dicts per
# sentence, in the same order as corpus.
X = [extract_features(doc) for doc in corpus]
# Print each token's feature dict followed by a separator line, for inspection.
for x in X:
  for dictx in x:
    print(dictx)
    print(" ------------------------------------------ ")

From extract features:  [('My', 'NAA'), ('account', 'NAA'), ('number', 'NAA'), ('is', 'NAA'), ('SB100-abc-200', 'ACCT')] : length  5
From extract features:  [('Can', 'NAA'), ('you', 'NAA'), ('please', 'NAA'), ('tell', 'NAA'), ('the', 'NAA'), ('balance', 'NAA'), ('for', 'NAA'), ('CA499-243-520', 'ACCT')] : length  8
From extract features:  [('Why', 'NAA'), ('is', 'NAA'), ('there', 'NAA'), ('a', 'NAA'), ('debit', 'NAA'), ('on', 'NAA'), ('my', 'NAA'), ('account', 'NAA'), ('CC467-923-624', 'ACCT'), ('on', 'NAA'), ('10-09-2018', 'NAA'), ('?', 'NAA')] : length  12
{'Currword': 'My', 'BOS': True, 'Nextword': 'account'}
 ------------------------------------------ 
{'Currword': 'account', 'Prevword': 'My', 'Nextword': 'number'}
 ------------------------------------------ 
{'Currword': 'number', 'Prevword': 'account', 'Nextword': 'is'}
 ------------------------------------------ 
{'Currword': 'is', 'Prevword': 'number', 'Nextword': 'SB100-abc-200'}
 ------------------------------------------ 
{'

# Extract Labels to feed

In [None]:
def sentence_labels(doc):
  """Return the labels of doc, a sequence of (token, label) pairs, in order."""
  labels = []
  for _token, label in doc:
    labels.append(label)
  return labels
# y holds one label list per training sentence, in the same order as corpus.
y = [sentence_labels(doc) for doc in corpus]
print(y)


[['NAA', 'NAA', 'NAA', 'NAA', 'ACCT'], ['NAA', 'NAA', 'NAA', 'NAA', 'NAA', 'NAA', 'NAA', 'ACCT'], ['NAA', 'NAA', 'NAA', 'NAA', 'NAA', 'NAA', 'NAA', 'NAA', 'ACCT', 'NAA', 'NAA', 'NAA']]


#  Let us now train our CRF model with the obtained X and y

In [None]:
# https://sklearn-crfsuite.readthedocs.io/en/latest/api.html

# Configure the CRF estimator; the API reference linked above documents each
# option.
crf = sklearn_crfsuite.CRF(
    algorithm = 'lbfgs',
    c1 = 0.1, # L1-regularization
    c2 = 0.1, # L2-regularization
    max_iterations = 20,
    all_possible_transitions=False,
)
# Train on the per-sentence feature dicts (X) and their label lists (y).
# NOTE(review): the recorded output of this cell is AttributeError, yet the
# later predict_single call printed labels. Presumably the error comes from
# displaying the estimator returned by fit() rather than from training; confirm.
crf.fit(X, y)


AttributeError: ignored

AttributeError: ignored

AttributeError: ignored

In [None]:
# Unseen sentence used to try out the trained model.
TestText = 'Please tell the balance for my account LA233-273-120'
test_token = nltk.word_tokenize(TestText)
# Pair each token with a placeholder label so the list has the same
# (token, label) shape as the training sentences passed to extract_features.
test=[(x, 'No_LABEL') for x in test_token]


In [None]:
# create_corpus expects an iterable of (tokens, labels) pairs, so wrap the
# single test sentence as one such pair. Passing a flat list of
# (token, label) tuples would unpack each tuple as (tokens, labels) and zip
# the two strings character by character.
tcorpus = create_corpus([(test_token, ['No_LABEL'] * len(test_token))])
# One sentence went in, so the only entry of tcorpus is the one to featurize.
X_test = extract_features(tcorpus[0])
print(TestText)
# predict_single labels a single sequence of feature dicts.
print(crf.predict_single(X_test))

From extract features:  [('Please', 'No_LABEL'), ('tell', 'No_LABEL'), ('the', 'No_LABEL'), ('balance', 'No_LABEL'), ('for', 'No_LABEL'), ('my', 'No_LABEL'), ('account', 'No_LABEL'), ('LA233-273-120', 'No_LABEL')] : length  8
Please tell the balance for my account LA233-273-120
['NAA', 'NAA', 'NAA', 'NAA', 'NAA', 'NAA', 'NAA', 'ACCT']
