In [168]:
import nltk
import pandas as pd
import sklearn_crfsuite

from sklearn_crfsuite import metrics
from sklearn.model_selection import train_test_split


In [169]:
def extract_features(tokens):
    features = []
    for i, token in enumerate(tokens):
        token_features = {
            'token': token,  # Current token
            'token_length': len(token['token']),  # Length of token
            'is_upper': token['token'].isupper(),  # Is token uppercase?
            'is_lower': token['token'].islower(),  # Is token lowercase?
            'is_digit': token['token'].isdigit(),  # Is token numeric?
            'prefix_1': token['token'][:1],  # First character
            'suffix_1': token['token'][-1:],  # Last character
        }

        # Add features from surrounding tokens (context)
        if i > 0:
            token_features.update({
                '-1:token': tokens[i - 1]['token'],
                '-1:is_upper': tokens[i - 1]['token'].isupper(),
            })
        else:
            token_features['BOS'] = True  # Beginning of Sequence

        if i < len(tokens) - 1:
            token_features.update({
                '+1:token': tokens[i + 1]['token'],
                '+1:is_upper': tokens[i + 1]['token'].isupper(),
            })
        else:
            token_features['EOS'] = True  # End of Sequence

        features.append(token_features)
    return features

In [170]:
data_train_path = "../data/cleaned_data/cleaned_data_train.csv"
data_test_path = "../data/cleaned_data/cleaned_data_test.csv"
data_dev_path = "../data/cleaned_data/cleaned_data_dev.csv"

In [171]:
def prepare_data(file_path):
    # Load your dataset
    data = pd.read_csv(file_path)
    
    # Initialize variables for storing sequences
    sentences = []
    labels = []
    sentence = []
    label_seq = []
    
    for _, row in data.iterrows():
        # Check for sentence end
        if row['token'] != '.':  # Replace '.' with your sentence delimiter, if any
            sentence.append({'token': row['token'], 'token_length': row['token_length']})
            label_seq.append(row['label'])
        else:
            # Add the completed sentence and labels to the lists
            sentences.append(sentence)
            labels.append(label_seq)
            sentence = []
            label_seq = []

    # Handle any leftover sequence (if the dataset doesn't end with a sentence delimiter)
    if sentence and label_seq:
        sentences.append(sentence)
        labels.append(label_seq)
    
    # Extract features for each sequence
    X = [extract_features(sentence) for sentence in sentences]
    y = labels
    
    return X, y

In [172]:
# Prepare training, testing, and dev datasets
X_train, y_train = prepare_data(data_train_path)
X_test, y_test = prepare_data(data_test_path)
X_dev, y_dev = prepare_data(data_dev_path)

In [173]:
# Print a sample to verify
print(X_train[0])
print(y_train[0])

[{'token': {'token': 'In', 'token_length': 2}, 'token_length': 2, 'is_upper': False, 'is_lower': False, 'is_digit': False, 'prefix_1': 'I', 'suffix_1': 'n', 'BOS': True, '+1:token': 'this', '+1:is_upper': False}, {'token': {'token': 'this', 'token_length': 4}, 'token_length': 4, 'is_upper': False, 'is_lower': True, 'is_digit': False, 'prefix_1': 't', 'suffix_1': 's', '-1:token': 'In', '-1:is_upper': False, '+1:token': 'article', '+1:is_upper': False}, {'token': {'token': 'article', 'token_length': 7}, 'token_length': 7, 'is_upper': False, 'is_lower': True, 'is_digit': False, 'prefix_1': 'a', 'suffix_1': 'e', '-1:token': 'this', '-1:is_upper': False, '+1:token': 'we', '+1:is_upper': False}, {'token': {'token': 'we', 'token_length': 2}, 'token_length': 2, 'is_upper': False, 'is_lower': True, 'is_digit': False, 'prefix_1': 'w', 'suffix_1': 'e', '-1:token': 'article', '-1:is_upper': False, '+1:token': 'discuss', '+1:is_upper': False}, {'token': {'token': 'discuss', 'token_length': 7}, 'tok

In [174]:
# Initialize CRF model
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',  # You can also try 'saga' for larger datasets
    c1=0.1,  # L1 regularization
    c2=0.1,  # L2 regularization
    max_iterations=100,  # Maximum number of iterations
    all_possible_transitions=True
)

In [175]:
# Train the model
crf.fit(X_train, y_train)

In [176]:
# Predict on test data
y_pred = crf.predict(X_test)

# Calculate and display metrics
labels = list(crf.classes_)
labels.remove('O')  # Remove 'O' from evaluation
print(metrics.flat_classification_report(y_test, y_pred, labels=labels))

              precision    recall  f1-score   support

           B       0.67      0.56      0.61       445
           I       0.73      0.71      0.72       482

   micro avg       0.70      0.64      0.67       927
   macro avg       0.70      0.64      0.66       927
weighted avg       0.70      0.64      0.67       927



In [177]:
# Evaluate performance
print("Classification report:")
print(metrics.flat_classification_report(y_test, y_pred))

Classification report:
              precision    recall  f1-score   support

           B       0.67      0.56      0.61       445
           I       0.73      0.71      0.72       482
           O       0.91      0.93      0.92      2837

    accuracy                           0.86      3764
   macro avg       0.77      0.74      0.75      3764
weighted avg       0.86      0.86      0.86      3764



### Model Fine-tuning

In [178]:
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer

# Define parameter space
params_space = {
    'c1': [0.1, 0.2, 0.5, 1.0],
    'c2': [0.1, 0.2, 0.5, 1.0],
}

# Use RandomizedSearchCV for hyperparameter optimization
rs = RandomizedSearchCV(
    estimator=crf,
    param_distributions=params_space,
    cv=3,
    verbose=1,
    n_iter=10,
    scoring=make_scorer(metrics.flat_f1_score, average='weighted', labels=labels)
)

# Perform search
rs.fit(X_train, y_train)
print("Best hyperparameters:", rs.best_params_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best hyperparameters: {'c2': 0.2, 'c1': 0.2}


In [179]:
import joblib
# save best model
joblib.dump(rs.best_estimator_, '../models/trained_models/crf_model2.pkl')

['../models/trained_models/crf_model2.pkl']

In [180]:
import nltk

def predict_labels_from_sentence(crf_model, sentence, feature_extractor):
    # Tokenize the sentence
    tokens = nltk.word_tokenize(sentence)
    
    # Extract features for the tokens
    token_features = feature_extractor(tokens)
    
    # Predict labels for the token sequence
    predicted_labels = crf_model.predict([token_features])[0]
    
    # Combine tokens and their predicted labels
    result = list(zip(tokens, predicted_labels))
    
    return result

In [181]:
def extract_predict_features(tokens):
    features = []
    for i, token in enumerate(tokens):
        token_features = {
            'token': token,  # Current token
            'token_length': len(token),  # Length of token
            'is_upper': token.isupper(),  # Is token uppercase?
            'is_lower': token.islower(),  # Is token lowercase?
            'is_digit': token.isdigit(),  # Is token numeric?
            'prev_token': '' if i == 0 else tokens[i - 1],  # Previous token
            'next_token': '' if i == len(tokens) - 1 else tokens[i + 1],  # Next token
        }
        features.append(token_features)
    return features

In [182]:
# Example Usage
sentence = "Part of Speech tagging is one of the tasks on which early Language models were tested for the GLUE score. In this article, we will learn about one such method which can be used for POS tagging. But before that let us understand what is POS tagging."
predicted_labels = predict_labels_from_sentence(rs, sentence, extract_predict_features)

# Print predictions
for token, label in predicted_labels:
    print(f"{token}: {label}")

Part: O
of: O
Speech: O
tagging: O
is: O
one: O
of: O
the: O
tasks: O
on: O
which: O
early: O
Language: O
models: O
were: O
tested: O
for: O
the: O
GLUE: O
score: O
.: O
In: O
this: O
article: O
,: O
we: O
will: O
learn: O
about: O
one: O
such: O
method: O
which: O
can: O
be: O
used: O
for: O
POS: O
tagging: O
.: O
But: O
before: O
that: O
let: O
us: O
understand: O
what: O
is: O
POS: B
tagging: I
.: I
