# Model for POS Tagging

## This section is for the development of the CRF Model

First, we import and download all the necessary dependancies if needed

In [None]:
# Install the sklearn-crfsuite package
%pip install --upgrade pip
%pip install sklearn-crfsuite


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
import ast
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.model_selection import train_test_split

We will then prepare the data by retrieving it from the cleaned csv files, gathering the required data.

In [3]:

def convert_to_list_of_dicts(string):
    feats_list = ast.literal_eval(string)

    converted_feats = []
    for feat in feats_list:
        if feat == 'None':
            converted_feats.append(None)
        else:
            converted_feats.append(ast.literal_eval(feat))
    return converted_feats 

def convert_to_list(string):
    return ast.literal_eval(string)

# Add the project root directory to sys.path
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, '../'))
sys.path.append(project_root)

# Load training dataset
file_path = os.path.join(project_root, 'data/cleaned/train_cleaned.csv')
training_df = pd.read_csv(file_path, converters=
                 {'tokens': convert_to_list, 
                  'lemmas': convert_to_list, 
                  'upos': convert_to_list, 
                  'xpos': convert_to_list, 
                  'feats': convert_to_list_of_dicts,
                  'head': convert_to_list,
                  'deprel': convert_to_list,
                  'deps': convert_to_list,
                  'misc': convert_to_list})

required_headers = ['deps', 'misc', 'idx', 'text', 'head', 'deprel', 'upos']
if all(header in training_df.columns for header in required_headers):
    training_df = training_df.drop(columns=required_headers)
else:
    print("Already dropped the training header columns")

# Load test dataset
file_path = os.path.join(project_root, 'data/cleaned/test_cleaned.csv')
test_df = pd.read_csv(file_path, converters=
                 {'tokens': convert_to_list, 
                  'lemmas': convert_to_list, 
                  'upos': convert_to_list, 
                  'xpos': convert_to_list, 
                  'feats': convert_to_list_of_dicts,
                  'head': convert_to_list,
                  'deprel': convert_to_list,
                  'deps': convert_to_list,
                  'misc': convert_to_list})

if all(header in test_df.columns for header in required_headers):
    test_df = test_df.drop(columns=required_headers)
else:
    print("Already dropped the test header columns")

# Load validation dataset
file_path = os.path.join(project_root, 'data/cleaned/validation_cleaned.csv')
validation_df = pd.read_csv(file_path, converters=
                 {'tokens': convert_to_list, 
                  'lemmas': convert_to_list, 
                  'upos': convert_to_list, 
                  'xpos': convert_to_list, 
                  'feats': convert_to_list_of_dicts,
                  'head': convert_to_list,
                  'deprel': convert_to_list,
                  'deps': convert_to_list,
                  'misc': convert_to_list})

if all(header in validation_df.columns for header in required_headers):
    validation_df = validation_df.drop(columns=required_headers)
else:
    print("Already dropped the validation header columns")


Prep the data get a list of tokens,tags and features where at any index, they correspond to each other and we get rid of any 'None' POS Tags

In [26]:
# Remove None or empty string values from pos tags and corresponding tokens
# This is necessary because the Universal POS tags are not available for all tokens

def get_flattened_filtered_columns(df):
    filtered_tokens = []
    filtered_xpos_tags = []
    filtered_lemmas = []
    filtered_feats = []

    for i in range(df['tokens'].size):
        tags = df['xpos'][i]
        tokens = df['tokens'][i]
        lemmas = df['lemmas'][i]
        features = df['feats'][i]
        for token, tag, lemma, feature in zip(tokens, tags, lemmas, features):
            if tag is not None and tag != "":
                filtered_tokens.append(token)
                filtered_xpos_tags.append(tag)
                filtered_lemmas.append(lemma)
                filtered_feats.append(feature)
    
    print(f"Number of tokens: {len(filtered_tokens)}")
    print(f"Number of XPOS tags: {len(filtered_xpos_tags)}")
    print(f"Number of LEMMAS: {len(filtered_lemmas)}")
    print(f"Number of FEATS: {len(filtered_feats)}")
    for i in range(10): 
        print(filtered_tokens[i], filtered_xpos_tags[i], filtered_lemmas[i], filtered_feats[i])

    return filtered_tokens, filtered_xpos_tags, filtered_lemmas, filtered_feats

trainingFilteredTokens, trainingFilteredTags, trainingFilteredLemmas, trainingFilteredFeats = get_flattened_filtered_columns(training_df)

testFilteredTokens, testFilteredTags, testFilteredLemmas, testFilteredFeats = get_flattened_filtered_columns(test_df)

validationFilteredTokens, validationFilteredTags, validationFilteredLemmas,  validationFilteredFeats = get_flattened_filtered_columns(validation_df)
# Verify the lengths after filtering


Number of tokens: 204609
Number of XPOS tags: 204609
Number of LEMMAS: 204609
Number of FEATS: 204609
Al NNP Al {'Number': 'Sing'}
- HYPH - None
Zaman NNP Zaman {'Number': 'Sing'}
: : : None
American JJ american {'Degree': 'Pos'}
forces NNS force {'Number': 'Plur'}
killed VBD kill {'Mood': 'Ind', 'Tense': 'Past', 'VerbForm': 'Fin'}
Shaikh NNP Shaikh {'Number': 'Sing'}
Abdullah NNP Abdullah {'Number': 'Sing'}
al NNP al {'Number': 'Sing'}
Number of tokens: 25097
Number of XPOS tags: 25097
Number of LEMMAS: 25097
Number of FEATS: 25097
What WP what {'PronType': 'Int'}
if IN if None
Google NNP Google {'Number': 'Sing'}
Morphed VBD morph {'Mood': 'Ind', 'Tense': 'Past', 'VerbForm': 'Fin'}
Into IN into None
GoogleOS NNP GoogleOS {'Number': 'Sing'}
? . ? None
What WP what {'PronType': 'Int'}
if IN if None
Google NNP Google {'Number': 'Sing'}
Number of tokens: 25150
Number of XPOS tags: 25150
Number of LEMMAS: 25150
Number of FEATS: 25150
From IN from None
the DT the {'Definite': 'Def', 'PronT

Now in order to create the crf model, we need to extract features. We've decided to extract a reasonable amount of features as to prevent overfitting issues.

# Which Features should I extract?

In [27]:
# Feature extraction function
def extract_features(tokens, lemmas, feats, i):
    token = tokens[i]
    lemma = lemmas[i]
    feature_dict = feats[i] if feats[i] is not None or "" else {}
    features = {
        'bias': 1.0,
        'token.lower()': token.lower(),
        'token.isupper()': token.isupper(),
        'token.istitle()': token.istitle(),
        'token.isdigit()': token.isdigit(),
        'prefix-1': token[:1],
        'prefix-2': token[:2],
        'prefix-3': token[:3],
        'suffix-1': token[-1:],
        'suffix-2': token[-2:],
        'suffix-3': token[-3:],
        'token.length': len(token),
        'lemma': lemma.lower(),
    }
    if i > 0:
        token1 = tokens[i-1]
        lemma1 = lemmas[i-1]
        features.update({
            '-1:token.lower()': token1.lower(),
            '-1:token.istitle()': token1.istitle(),
            '-1:token.isupper()': token1.isupper(),
            '-1:lemma': lemma1.lower(),
        })
    else:
        features['BOS'] = True

    if i < len(tokens)-1:
        token1 = tokens[i+1]
        lemma1 = lemmas[i+1]
        features.update({
            '+1:token.lower()': token1.lower(),
            '+1:token.istitle()': token1.istitle(),
            '+1:token.isupper()': token1.isupper(),
            '+1:lemma': lemma1.lower(),
        })
    else:
        features['EOS'] = True

    features.update(feature_dict)  # Include morphological features

    return [features]


def extract_features_for_sentence(tokens, lemmas, feats):
    return [extract_features(tokens, lemmas, feats, i) for i in range(len(tokens))]

def get_labels(tags):
    return [tags]

In [28]:

# Prepare the training data for CRF
training_x = extract_features_for_sentence(trainingFilteredTokens, trainingFilteredLemmas, trainingFilteredFeats)
training_y = [get_labels(tags) for tags in trainingFilteredTags]

for i in range(10):
    print(training_x[i])
    print(training_y[i])

[{'bias': 1.0, 'token.lower()': 'al', 'token.isupper()': False, 'token.istitle()': True, 'token.isdigit()': False, 'prefix-1': 'A', 'prefix-2': 'Al', 'prefix-3': 'Al', 'suffix-1': 'l', 'suffix-2': 'Al', 'suffix-3': 'Al', 'token.length': 2, 'lemma': 'al', 'BOS': True, '+1:token.lower()': '-', '+1:token.istitle()': False, '+1:token.isupper()': False, '+1:lemma': '-', 'Number': 'Sing'}]
['NNP']
[{'bias': 1.0, 'token.lower()': '-', 'token.isupper()': False, 'token.istitle()': False, 'token.isdigit()': False, 'prefix-1': '-', 'prefix-2': '-', 'prefix-3': '-', 'suffix-1': '-', 'suffix-2': '-', 'suffix-3': '-', 'token.length': 1, 'lemma': '-', '-1:token.lower()': 'al', '-1:token.istitle()': True, '-1:token.isupper()': False, '-1:lemma': 'al', '+1:token.lower()': 'zaman', '+1:token.istitle()': True, '+1:token.isupper()': False, '+1:lemma': 'zaman'}]
['HYPH']
[{'bias': 1.0, 'token.lower()': 'zaman', 'token.isupper()': False, 'token.istitle()': True, 'token.isdigit()': False, 'prefix-1': 'Z', 'p

In [29]:
# Prepare the test data for CRF
test_x = extract_features_for_sentence(testFilteredTokens, testFilteredLemmas, testFilteredFeats)
test_y = [get_labels(tags) for tags in testFilteredTags]

# Prepare the validation data for CRF
validation_x = extract_features_for_sentence(validationFilteredTokens, validationFilteredLemmas, validationFilteredFeats)
validation_y = [get_labels(tags) for tags in validationFilteredTags]

Now Let's train the model

In [30]:
# Train the CRF model
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(training_x, training_y)

CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    max_iterations=100)

In [31]:

# Predict on the test set
y_pred = crf.predict(test_x)

# Evaluate the model
labels = list(crf.classes_)
metrics.flat_f1_score(test_y, y_pred, average='weighted', labels=labels)

# Print classification report
print(metrics.flat_classification_report(test_y, y_pred, labels=labels, digits=3))

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         NNP      0.896     0.885     0.891      1997
        HYPH      0.736     0.796     0.765        98
           :      0.914     0.850     0.881       100
          JJ      0.997     0.997     0.997      1560
         NNS      0.978     0.981     0.980       905
         VBD      1.000     1.000     1.000       529
           ,      0.947     0.974     0.961       980
          DT      0.993     0.997     0.995      1952
          NN      0.932     0.941     0.936      3320
          IN      0.978     0.978     0.978      2314
           .      0.995     0.991     0.993      1451
       -LRB-      1.000     1.000     1.000       114
          MD      1.000     1.000     1.000       400
          VB      1.000     1.000     1.000      1127
         VBG      1.000     1.000     1.000       342
         PRP      1.000     1.000     1.000      1426
          TO      0.932     0.984     0.957       375
       -RRB-      1.000    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
from sklearn.metrics import confusion_matrix

# Plot the confusion matrix for the test set
def plot_confusion_matrix(test_y, y_pred, labels):
    y_true_flat = [item for sublist in y_true for item in sublist]
    y_pred_flat = [item for sublist in y_pred for item in sublist]
    
    # Compute the confusion matrix
    cm = confusion_matrix(y_true_flat, y_pred_flat, labels=labels)

    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels, yticklabels=labels, cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

plot_confusion_matrix(test_y, y_pred, labels)

NameError: name 'y_true' is not defined

In [None]:

# Predict on the validation set
y_pred = crf.predict(validation_x)

# Evaluate the model
labels = list(crf.classes_)
metrics.flat_f1_score(validation_y, y_pred, average='weighted', labels=labels)

# Print classification report
print(metrics.flat_classification_report(validation_y, y_pred, labels=labels, digits=3))