In [21]:
#Regex module for checking alphanumeric values.
import re
def extract_features(sentence, index):
  return {
      'word':sentence[index],
      'is_first':index==0,
      'is_last':index ==len(sentence)-1,
      'prefix-1':sentence[index][0],
      'prefix-2':sentence[index][:2],
      'prefix-3':sentence[index][:3],
      'prefix-3':sentence[index][:4],
      'suffix-1':sentence[index][-1],
      'suffix-2':sentence[index][-2:],
      'suffix-3':sentence[index][-3:],
      'suffix-3':sentence[index][-4:],
      'prev_word':'' if index == 0 else sentence[index-1],
      'next_word':'' if index < len(sentence) else sentence[index+1],
      'has_hyphen': '-' in sentence[index],
      'is_numeric': sentence[index].isdigit(),
      #'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
  }

In [22]:
import codecs

In [23]:
filepath = "tamil_tagged_input.txt"
f = codecs.open(filepath, 'r', encoding='utf-8')
file_contents = f.readlines()
data=[]
words=[]
tags=[]
for x in range(len(file_contents)):
	line = file_contents.pop(0).strip().split('\\')
	word,tag=line
	if word!='.':
		words.append(word)
		tags.append(tag)
	else:
		words.append(word)
		tags.append(tag)
		data.append(tuple([words.copy(),tags.copy()]))

		words.clear()
		tags.clear()



In [24]:
len(data)

7154

In [25]:

def transform_to_dataset(tagged_sentences):
  X, y = [], []
  for sentence, tags in tagged_sentences:
    sent_word_features, sent_tags = [],[]
    for index in range(len(sentence)):

        sent_word_features.append(extract_features(sentence, index)),
        sent_tags.append(tags[index])
    X.append(sent_word_features)
    y.append(sent_tags)
  return X, y


train_size = int(0.8*len(data))
training = data[:train_size]
testing = data[train_size:]
X_train, y_train = transform_to_dataset(training)
X_test, y_test = transform_to_dataset(testing)


In [26]:
print(type(X_train))
print(X_train[0])
print(type(X_train[0]))
print(y_test[0])
print(type(X_test[0]))

<class 'list'>
[{'word': 'இல்\xadலையா', 'is_first': True, 'is_last': False, 'prefix-1': 'இ', 'prefix-2': 'இல', 'prefix-3': 'இல்\xad', 'suffix-1': 'ா', 'suffix-2': 'யா', 'suffix-3': 'லையா', 'prev_word': '', 'next_word': '', 'has_hyphen': False, 'is_numeric': False}, {'word': 'என்\xadபதை', 'is_first': False, 'is_last': False, 'prefix-1': 'எ', 'prefix-2': 'என', 'prefix-3': 'என்\xad', 'suffix-1': 'ை', 'suffix-2': 'தை', 'suffix-3': '\xadபதை', 'prev_word': 'இல்\xadலையா', 'next_word': '', 'has_hyphen': False, 'is_numeric': False}, {'word': 'மக்கள்', 'is_first': False, 'is_last': False, 'prefix-1': 'ம', 'prefix-2': 'மக', 'prefix-3': 'மக்க', 'suffix-1': '்', 'suffix-2': 'ள்', 'suffix-3': '்கள்', 'prev_word': 'என்\xadபதை', 'next_word': '', 'has_hyphen': False, 'is_numeric': False}, {'word': 'முன்\xadனி\xadலையில்', 'is_first': False, 'is_last': False, 'prefix-1': 'ம', 'prefix-2': 'மு', 'prefix-3': 'முன்', 'suffix-1': '்', 'suffix-2': 'ல்', 'suffix-3': 'யில்', 'prev_word': 'மக்கள்', 'next_word': '

In [None]:
#Ignoring some warnings for the sake of readability.
import warnings
warnings.filterwarnings('ignore')

#First, install sklearn_crfsuite, as it is not preloaded into Colab.
!pip install sklearn_crfsuite
from sklearn_crfsuite import CRF

#This loads the model. Specifics are:
#algorithm: methodology used to check if results are improving. Default is lbfgs (gradient descent).
#c1 and c2:  coefficients used for regularization.
#max_iterations: max number of iterations (DUH!)
#all_possible_transitions: since crf creates a "network", of probability transition states,
#this option allows it to map even "connections" not present in the data.


In [27]:

model = CRF(
    algorithm='lbfgs',
    c1=0.01,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
#The fit method is the default name used by Machine Learning algorithms to start training.

try:
    model.fit(X_train, y_train)
except AttributeError:
    print("ok")
    pass




In [37]:
import pickle
with open('pos_crf_model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [28]:
#We'll use the sklearn_crfsuit own metrics to compute f1 score.
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
print("## Penn ##")

#First calculate a prediction from test data, then we print the metrics for f-1 using the .flat_f1_score method.
y_pred=model.predict(X_test)
print("F1 score on Test Data")
print(metrics.flat_f1_score(y_test, y_pred,average='weighted',labels=model.classes_))
#For the sake of clarification, we do the same for train data.
y_pred_train=model.predict(X_train)
print("F1 score on Training Data ")
print(metrics.flat_f1_score(y_train, y_pred_train,average='weighted',labels=model.classes_))



## Penn ##
F1 score on Test Data
0.8926912936590575
F1 score on Training Data 
0.991623748381328
