In [1]:
import joblib
import pandas as pd
import sklearn_crfsuite
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import metrics

In [2]:
# Function to convert data to features suitable for CRF
def create_features(data, stop_word_set=None):
    """Build one CRF feature dict per token row.

    Parameters
    ----------
    data : sequence of rows
        Each row is indexable: index 0 is the token text (a ``str``) and
        index 2 is the token length (anything ``int()`` accepts). Index 1 is
        not read by this function.
    stop_word_set : container of str, optional
        Lower-case words used to compute the ``'is_stop'`` feature. When
        omitted, the notebook-level ``stop_words`` set is used, which is the
        original behaviour.

    Returns
    -------
    list of dict
        One dict per row, in input order, with the keys ``'word'``,
        ``'length'``, ``'is_alpha'``, ``'is_stop'`` and ``'has_digit'``.
    """
    # Resolved at call time, so the notebook-level ``stop_words`` may be
    # defined after this function; an explicit argument bypasses the global.
    vocabulary = stop_words if stop_word_set is None else stop_word_set

    features = []
    for row in data:
        word = row[0]
        token_length = int(row[2])  # Token length as an integer
        features.append({
            'word': word,
            'length': token_length,
            'is_alpha': word.isalpha(),
            # Case-insensitive lookup: the vocabulary holds lower-case words.
            'is_stop': word.lower() in vocabulary,
            'has_digit': any(char.isdigit() for char in word),
        })
    return features

# Function to extract labels
def extract_labels(data):
    """Return the label stored at index 1 of every row, in input order."""
    labels = []
    for row in data:
        labels.append(row[1])
    return labels

# Common function words (lower-case). Membership in this set drives the
# 'is_stop' feature computed in create_features; customize for your dataset.
stop_words = {
    'the', 'and', 'in', 'to', 'of',
    'we', 'that', 'is', 'on', 'for',
    'with', 'as', 'at', 'from', 'a',
}


In [3]:
# Load the train and test tables from the cleaned-data CSV files.
# NOTE(review): later cells read column 0 as the token, column 1 as the label
# and column 2 as the token length — confirm the CSV column order matches.
train_df = pd.read_csv('../data/cleaned_data/cleaned_data_train.csv')
test_df = pd.read_csv('../data/cleaned_data/cleaned_data_test.csv')

In [4]:
# Convert each dataframe to a plain list of rows, then derive the CRF inputs.
# Each split is wrapped in a one-element list, i.e. the whole split is handed
# to the model as one single sequence.
# NOTE(review): if the data has sentence boundaries, one sequence per sentence
# is probably what is wanted — confirm.

# Training split: rows -> features (X) and labels (y)
train_data = train_df.values.tolist()
X_train = [create_features(train_data)]
y_train = [extract_labels(train_data)]

# Test split: same treatment
test_data = test_df.values.tolist()
X_test = [create_features(test_data)]
y_test = [extract_labels(test_data)]

In [5]:
# Preview only the first 10 feature dicts of (at most) the first 3 sequences;
# displaying X_train in full prints every token of the training data.
[sequence[:10] for sequence in X_train[:3]]

[[{'word': 'In',
   'length': 2,
   'is_alpha': True,
   'is_stop': True,
   'has_digit': False},
  {'word': 'this',
   'length': 4,
   'is_alpha': True,
   'is_stop': False,
   'has_digit': False},
  {'word': 'article',
   'length': 7,
   'is_alpha': True,
   'is_stop': False,
   'has_digit': False},
  {'word': 'we',
   'length': 2,
   'is_alpha': True,
   'is_stop': True,
   'has_digit': False},
  {'word': 'discuss',
   'length': 7,
   'is_alpha': True,
   'is_stop': False,
   'has_digit': False},
  {'word': 'several',
   'length': 7,
   'is_alpha': True,
   'is_stop': False,
   'has_digit': False},
  {'word': 'metrics',
   'length': 7,
   'is_alpha': True,
   'is_stop': False,
   'has_digit': False},
  {'word': 'of',
   'length': 2,
   'is_alpha': True,
   'is_stop': True,
   'has_digit': False},
  {'word': 'coherence',
   'length': 9,
   'is_alpha': True,
   'is_stop': False,
   'has_digit': False},
  {'word': 'defined',
   'length': 7,
   'is_alpha': True,
   'is_stop': False,
   

In [6]:
# Initialize CRF model
# Hyperparameters are passed through to the sklearn-crfsuite trainer.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',  # Other trainers sklearn-crfsuite accepts: 'l2sgd', 'ap', 'pa', 'arow'
    c1=0.1,  # Coefficient for L1 regularization
    c2=0.1,  # Coefficient for L2 regularization
    max_iterations=100,  # Maximum number of iterations
    all_possible_transitions=True  # Also generate transition features for label pairs unseen in training
)

In [7]:
# Train the model
# X_train and y_train are the one-element lists built from the training rows,
# so the model is fitted on a single sequence covering the whole training split.
crf.fit(X_train, y_train)

In [9]:
# save crf model
# Persist the fitted CRF to disk so it can be reloaded later without retraining.
# The call's return value (the list of written file names) is the cell output.
joblib.dump(crf, '../models/trained_models/crf_model.pkl')

['../models/trained_models/crf_model.pkl']

In [10]:
# Test the model
# Predict labels for the test features; y_pred is compared against y_test
# in the evaluation cell.
y_pred = crf.predict(X_test)

In [11]:
# Evaluate performance
# Print a heading followed by the per-label report that compares the true
# test labels with the predicted ones.
print(
    "Classification report:",
    metrics.flat_classification_report(y_test, y_pred),
    sep="\n",
)

Classification report:
              precision    recall  f1-score   support

           B       0.65      0.47      0.54       445
           I       0.65      0.66      0.65       482
           O       0.89      0.93      0.91      2972

    accuracy                           0.84      3899
   macro avg       0.73      0.69      0.70      3899
weighted avg       0.83      0.84      0.84      3899



In [None]:
def inference(text):
    """Tag a raw text string with the trained CRF and print one label per token.

    The text is split on whitespace, each token is converted with
    ``create_features`` (the same function used to build the training
    features), and the notebook-level ``crf`` model predicts a label for
    every token.

    Parameters
    ----------
    text : str
        Raw input text. Punctuation is not separated from the words it is
        attached to.

    Returns
    -------
    None
        The predictions are printed, not returned.
    """
    # split() with no argument splits on any whitespace run (spaces, tabs,
    # newlines) and never yields empty strings, unlike split(" ").
    test_sentence = text.split()
    if not test_sentence:
        # Nothing to tag: keep the heading for consistent output and stop.
        print("Predictions for new sentence:")
        return

    # Prepare features for each word in the sentence.
    # create_features expects (word, label, length) rows; it does not read the
    # label slot, so an empty placeholder is passed.
    test_features = create_features([(word, '', len(word)) for word in test_sentence])
    # Get the predicted labels from the CRF model (one input sequence -> take item 0)
    test_pred = crf.predict([test_features])[0]

    # Display the word and its predicted label
    print("Predictions for new sentence:")
    for word, label in zip(test_sentence, test_pred):
        print(f"Word: {word} -> {label}")

In [15]:
# Example 1: one sentence containing a parenthesised acronym and comma-attached words.
text = "Natural language processing (NLP) combines computational linguistics, machine learning, and deep learning models"

inference(text)

array(['B', 'I', 'I', 'O', 'O', 'B', 'I', 'I', 'I', 'O', 'B', 'I', 'I'],
      dtype=object)

In [None]:
# Example 2: two sentences in one string; inference() treats them as a single token sequence.
text = "This is the selection of a word meaning for a word with multiple possible meanings. This uses a process of semantic analysis to examine the word in context."

inference(text)

Predictions for new sentence:
Word: This -> O
Word: is -> O
Word: the -> O
Word: selection -> O
Word: of -> O
Word: a -> O
Word: word -> O
Word: meaning -> O
Word: for -> O
Word: a -> O
Word: word -> O
Word: with -> O
Word: multiple -> O
Word: possible -> O
Word: meanings. -> O
Word: This -> O
Word: uses -> O
Word: a -> O
Word: process -> O
Word: of -> O
Word: semantic -> B
Word: analysis -> I
Word: to -> O
Word: examine -> O
Word: the -> O
Word: word -> O
Word: in -> O
Word: context. -> O


In [None]:
# Example 3: a longer two-sentence passage, again tagged as a single token sequence.
text = "NLP text preprocessing prepares raw text for analysis by transforming it into a format that machines can more easily understand. It begins with tokenization, which involves splitting the text into smaller units like words, sentences or phrases."

inference(text)

Predictions for new sentence:
Word: NLP -> B
Word: text -> I
Word: preprocessing -> I
Word: prepares -> I
Word: raw -> I
Word: text -> I
Word: for -> O
Word: analysis -> O
Word: by -> O
Word: transforming -> O
Word: it -> O
Word: into -> O
Word: a -> O
Word: format -> O
Word: that -> O
Word: machines -> O
Word: can -> O
Word: more -> O
Word: easily -> O
Word: understand. -> O
Word: It -> O
Word: begins -> O
Word: with -> O
Word: tokenization, -> O
Word: which -> O
Word: involves -> O
Word: splitting -> O
Word: the -> O
Word: text -> O
Word: into -> O
Word: smaller -> O
Word: units -> O
Word: like -> O
Word: words, -> O
Word: sentences -> O
Word: or -> O
Word: phrases. -> O
