In [1]:
import nltk
import pandas as pd

In [85]:
# Read the three pre-cleaned splits (train / dev / test), in that order.
train_df, dev_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/cleaned_data/cleaned_data_train.csv",
        "../data/cleaned_data/cleaned_data_dev.csv",
        "../data/cleaned_data/cleaned_data_test.csv",
    )
)

In [3]:
# Combine train_df and dev_df into the frame used for training.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# frame keeps its own row labels, so the same label appears twice in the
# combined frame and label-based lookups become ambiguous.
df = pd.concat([train_df, dev_df], ignore_index=True)

In [53]:
# Load a FastText model from disk through gensim.
from gensim.models import FastText

# NOTE: despite the ".h5" extension, the file is read with gensim's own
# FastText.load, not with an HDF5 reader.
model = FastText.load("../models/trained_models/fasttext_model_embedding.h5")

# Sanity check: length of the vector the model returns for the token 'NLP'.
len(model.wv['NLP'])

300

In [76]:
# write a function to get a POS tag
def get_pos_tag(tokens):
    """Return the POS tag of every token, in input order.

    Args:
        tokens (list of str): Tokens of one sentence or sequence.

    Returns:
        list of str: The tag component of each (token, tag) pair produced
        by ``nltk.pos_tag``.
    """
    return [tag for _word, tag in nltk.pos_tag(tokens)]

def construct_sentence(tokens, delimiter='.'):
    """Group a one-token-per-row frame into sentences with aligned labels.

    Rows are consumed in order. A row whose ``token`` differs from
    ``delimiter`` is appended to the sentence being built; a row whose
    ``token`` equals ``delimiter`` closes that sentence. The delimiter row
    itself (token and label) is not stored in the output.

    Tokens that follow the last delimiter are emitted as a final sentence
    instead of being discarded, so data that does not end with a delimiter
    keeps its last sentence.

    Args:
        tokens (pandas.DataFrame): Frame with a ``token`` column and a
            ``label`` column, one row per token.
        delimiter (str): Token value that marks the end of a sentence.
            Defaults to ``'.'``.

    Returns:
        list of dict: One ``{'sentence': [...], 'label': [...]}`` entry per
        sentence; the two lists in an entry always have the same length.
    """
    sentences_labels = []
    sentence = []
    label_seq = []

    # Walk the two columns directly: this avoids building a Series for every
    # row, which is what iterrows() does.
    for token, label in zip(tokens['token'].tolist(), tokens['label'].tolist()):
        if token != delimiter:
            sentence.append(token)
            label_seq.append(label)
        else:
            # Delimiter reached: store the finished sentence and start anew.
            sentences_labels.append({'sentence': sentence, 'label': label_seq})
            sentence = []
            label_seq = []

    # Flush whatever was collected after the last delimiter.
    if sentence:
        sentences_labels.append({'sentence': sentence, 'label': label_seq})

    return sentences_labels

def extract_features(tokens):
    """
    Build one feature dictionary per token for CRF-based NER.

    Args:
        tokens (list of str): Tokens of a single sentence or sequence.

    Returns:
        list of dict: Feature dictionaries, in token order. Each holds
        surface features of the token, its POS tag, the neighbouring
        tokens and their POS tags (or a ``BOS`` / ``EOS`` flag at the
        sequence boundaries) and a few character-level flags.

    Raises:
        ValueError: If POS tagging of ``tokens`` fails.
    """
    try:
        pos_tags = get_pos_tag(tokens)
    except Exception as e:
        raise ValueError(f"Error in POS tagging: {e}")

    def tag_at(position, fallback):
        # POS tag at `position`, or `fallback` when the index is out of range.
        if 0 <= position < len(pos_tags):
            return pos_tags[position]
        return fallback

    def shape_char(ch):
        # Map one character to its shape class; other characters pass through.
        if ch.isupper():
            return 'X'
        if ch.islower():
            return 'x'
        if ch.isdigit():
            return 'd'
        return ch

    last_index = len(tokens) - 1
    features = []

    for i, token in enumerate(tokens):
        # Surface-level description of the token itself.
        token_features = {
            'word': token,
            'lowercase': token.lower(),
            'is_capitalized': token[:1].isupper(),  # False for an empty token
            'word_shape': ''.join(map(shape_char, token)),
            'prefix': token[:2],   # whole token when shorter than 2 chars
            'suffix': token[-2:],  # whole token when shorter than 2 chars
            'is_digit': token.isdigit(),
            'length': len(token),
            'pos_tag': tag_at(i, 'UNK'),
        }

        # Left context, or the beginning-of-sequence flag.
        if i == 0:
            token_features['BOS'] = True
        else:
            token_features['previous_token'] = tokens[i - 1]
            token_features['previous_pos_tag'] = tag_at(i - 1, 'BOS')

        # Right context, or the end-of-sequence flag.
        if i == last_index:
            token_features['EOS'] = True
        else:
            token_features['next_token'] = tokens[i + 1]
            token_features['next_pos_tag'] = tag_at(i + 1, 'EOS')

        # Character-level flags.
        token_features['is_alphanumeric'] = token.isalnum()
        token_features['contains_hyphen'] = '-' in token
        token_features['contains_digit'] = any(ch.isdigit() for ch in token)

        # An embedding feature (model.wv[token]) is currently disabled.

        features.append(token_features)

    return features

In [91]:
# Split the combined frame into sentences, attach features to each sentence
# record, and collect the parallel feature / label sequences in the same pass.
train_sentences = construct_sentence(df)
X_train, y_train = [], []
for sentence in train_sentences:
    sentence['features'] = extract_features(sentence['sentence'])
    X_train.append(sentence['features'])
    y_train.append(sentence['label'])

In [92]:
# Sanity check: number of feature sequences vs. number of label sequences.
len(X_train), len(y_train)

(1037, 1037)

In [93]:
# Split the test frame into sentences, attach features to each sentence
# record, and collect the parallel feature / label sequences in the same pass.
test_sentences = construct_sentence(test_df)
X_test, y_test = [], []
for sentence in test_sentences:
    sentence['features'] = extract_features(sentence['sentence'])
    X_test.append(sentence['features'])
    y_test.append(sentence['label'])

In [94]:
# Sanity check: number of feature sequences vs. number of label sequences.
len(X_test), len(y_test)

(135, 135)

In [84]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics

In [95]:
# Initialize the CRF model (not trained yet; fitting happens in a later cell).
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',  # training algorithm
    c1=0.1,  # L1 regularization
    c2=0.1,  # L2 regularization
    max_iterations=100,  # NOTE(review): presumably the cap on optimizer iterations — confirm in the docs
    # NOTE(review): per the sklearn-crfsuite docs this should also generate
    # transition features for label pairs never seen in training — confirm.
    all_possible_transitions=True
)

In [96]:
# Ensure the data is in the format the CRF expects before training:
# a list of sequences on both sides ...
assert all(isinstance(seq, list) for seq in X_train), "X_train should be a list of lists of feature dictionaries"
assert all(isinstance(seq, list) for seq in y_train), "y_train should be a list of lists of labels"
# ... with one label sequence per feature sequence, and one label per token.
assert len(X_train) == len(y_train), "X_train and y_train should contain the same number of sequences"
assert all(len(x_seq) == len(y_seq) for x_seq, y_seq in zip(X_train, y_train)), "each feature sequence should have exactly one label per token"

# Train the model (kept as the last expression so the fitted estimator is displayed)
crf.fit(X_train, y_train)


In [97]:
# Predict on test data
y_pred = crf.predict(X_test)

# Calculate and display metrics for every learned label except 'O'.
# Filtering (instead of list.remove) keeps this cell from raising ValueError
# when the model never learned an 'O' label.
labels = [label for label in crf.classes_ if label != 'O']
print(metrics.flat_classification_report(y_test, y_pred, labels=labels))

              precision    recall  f1-score   support

           B       0.70      0.63      0.66       445
           I       0.74      0.69      0.71       482

   micro avg       0.72      0.66      0.69       927
   macro avg       0.72      0.66      0.69       927
weighted avg       0.72      0.66      0.69       927



In [100]:
# Example text for the prediction demo. Note that it contains two sentences,
# although the variable is named `sentence`.
sentence = "NLP powers advanced language models to create human-like text for various purposes. Pre-trained models, such as GPT-4, can generate articles, reports, marketing copy, product descriptions and even creative writing based on prompts provided by users."

In [101]:
# Test with unseen, NLP-related text

# Tokenize the raw example text with NLTK.
tokens = nltk.word_tokenize(sentence)

# Build the per-token feature dictionaries for the whole token list.
features = extract_features(tokens)

# Make predictions
# crf.predict takes a list of sequences, hence the single-element list.
# NOTE(review): the whole text is tagged as one sequence with its '.' tokens
# kept, whereas construct_sentence splits the training data on '.' and drops
# that token — confirm this train/inference mismatch is intended.
# NOTE: this rebinds y_pred, replacing the test-set predictions.
y_pred = crf.predict([features])

# Display the results, one "token<TAB>label" line per token
for token, label in zip(tokens, y_pred[0]):
    print(f"{token}\t{label}")


NLP	B
powers	O
advanced	O
language	B
models	I
to	O
create	O
human-like	O
text	O
for	O
various	O
purposes	O
.	O
Pre-trained	B
models	I
,	O
such	O
as	O
GPT-4	O
,	O
can	O
generate	O
articles	O
,	O
reports	O
,	O
marketing	O
copy	O
,	O
product	O
descriptions	O
and	O
even	O
creative	O
writing	O
based	O
on	O
prompts	O
provided	O
by	O
users	O
.	O


In [102]:
# Save the trained CRF model using pickle
import pickle

with open("../models/trained_models/crf_ner_model.pkl", "wb") as file:
    pickle.dump(crf, file)

# Report success only after the with-block has closed the file, so the
# message is never printed for a write that did not complete.
print("Saved model to disk")

Saved model to disk
