In [8]:
import pandas as pd

In [9]:
train_df = pd.read_csv("../data/cleaned_data/cleaned_data_train.csv")
test_df = pd.read_csv("../data/cleaned_data/cleaned_data_test.csv")
dev_df = pd.read_csv("../data/cleaned_data/cleaned_data_dev.csv")

In [10]:
# train_df = train_df.drop(columns=["token_length"])
# test_df = test_df.drop(columns=["token_length"])
# dev_df = dev_df.drop(columns=["token_length"])

In [11]:
train_df

Unnamed: 0,token,label
0,In,O
1,this,O
2,article,O
3,we,O
4,discuss,O
...,...,...
26737,of,O
26738,intricate,B
26739,phonological,I
26740,phenomena,I


In [12]:
train_df

Unnamed: 0,token,label
0,In,O
1,this,O
2,article,O
3,we,O
4,discuss,O
...,...,...
26737,of,O
26738,intricate,B
26739,phonological,I
26740,phenomena,I


In [13]:
# Generate sentence numbers based on periods (.) to define sentence boundaries
def sentence_numbering(df):
    sentence_numbers = []
    sentence_id = 1
    for word in df['token']:
        sentence_numbers.append(f"Sentence: {sentence_id}")
        if word == '.':  # Consider '.' as the sentence delimiter
            sentence_id += 1
    return sentence_numbers

# Generate sentence numbers for train, test, and dev datasets
train_df['sentence_id'] = sentence_numbering(train_df)
test_df['sentence_id'] = sentence_numbering(test_df)
dev_df['sentence_id'] = sentence_numbering(dev_df)

In [14]:
train_df

Unnamed: 0,token,label,sentence_id
0,In,O,Sentence: 1
1,this,O,Sentence: 1
2,article,O,Sentence: 1
3,we,O,Sentence: 1
4,discuss,O,Sentence: 1
...,...,...,...
26737,of,O,Sentence: 923
26738,intricate,B,Sentence: 923
26739,phonological,I,Sentence: 923
26740,phenomena,I,Sentence: 923


### Part-of-Speech

In [15]:
import nltk
from nltk import pos_tag, word_tokenize

In [16]:
def get_pos_tag(sentence):
    tokens = word_tokenize(sentence)
    # pos_tags = nltk.pos_tag(tokens)
    list_words = []
    for word, tag in pos_tags:
        list_words.append({'words': word })
    return list_words

In [17]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [18]:
train_df["sentence_id"] = LabelEncoder().fit_transform(train_df["sentence_id"])
test_df["sentence_id"] = LabelEncoder().fit_transform(test_df["sentence_id"])
dev_df["sentence_id"] = LabelEncoder().fit_transform(dev_df["sentence_id"])

In [19]:
train_df.head(30)

Unnamed: 0,token,label,sentence_id
0,In,O,0
1,this,O,0
2,article,O,0
3,we,O,0
4,discuss,O,0
5,several,O,0
6,metrics,B,0
7,of,I,0
8,coherence,I,0
9,defined,O,0


In [20]:
train_df.columns

Index(['token', 'label', 'sentence_id'], dtype='object')

In [21]:
X= train_df[["sentence_id","token"]]
Y =train_df["label"]

In [22]:
#building up train data and test data
# train_data = pd.DataFrame({"sentence_id":train_df["sentence_id"],"words":train_df["token"],"pos_tag": train_df["pos_tag"] ,"label":train_df["label"]})
# test_data = pd.DataFrame({"sentence_id":test_df["sentence_id"],"words":test_df["token"],"pos_tag": test_df["pos_tag"] ,"label":test_df["label"]})
# dev_data = pd.DataFrame({"sentence_id":dev_df["sentence_id"],"words":dev_df["token"],"pos_tag": dev_df["pos_tag"] ,"label":dev_df["label"]})

In [23]:
#building up train data and test data
train_data = pd.DataFrame({"sentence_id":train_df["sentence_id"],"words":train_df["token"],"label":train_df["label"]})
test_data = pd.DataFrame({"sentence_id":test_df["sentence_id"],"words":test_df["token"] ,"label":test_df["label"]})
dev_data = pd.DataFrame({"sentence_id":dev_df["sentence_id"],"words":dev_df["token"] ,"label":dev_df["label"]})

In [24]:
train_data.head()

Unnamed: 0,sentence_id,words,label
0,0,In,O
1,0,this,O
2,0,article,O
3,0,we,O
4,0,discuss,O


### Prepare data for training

In [25]:
# Group data by sentence ID
grouped_train_data = train_data.groupby('sentence_id')
grouped_test_data = test_data.groupby('sentence_id')
grouped_dev_data = dev_data.groupby('sentence_id')  

In [26]:
# Convert the data into a list of sentences
train_setence = []
train_label = []
test_sentence = []
test_label = []
dev_sentence = []
dev_label = []

for _, group in grouped_train_data:
    sentence = group[['words']].to_dict('records')
    label = group['label'].tolist()
    train_setence.append(sentence)
    train_label.append(label)

for _, group in grouped_test_data:
    sentence = group[['words']].to_dict('records')
    label = group['label'].tolist()
    test_sentence.append(sentence)
    test_label.append(label)

for _, group in grouped_dev_data:
    sentence = group[['words']].to_dict('records')
    label = group['label'].tolist()
    dev_sentence.append(sentence)
    dev_label.append(label)

In [27]:
# Feature extraction function
def word2features(sentence, i):
    word = sentence[i]['words']
    # postag = sentence[i]['pos_tag']

    features = {
        'word.lower()': word.lower(),
        # 'word[-3:]': word[-3:],  # Last 3 characters
        # 'word[-2:]': word[-2:],  # Last 2 characters
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        # 'postag': postag,
    }

    if i > 0:
        word1 = sentence[i - 1]['words']
        # postag1 = sentence[i - 1]['pos_tag']
        features.update({
            '-1:word.lower()': word1.lower(),
            # '-1:postag': postag1,
        })
    else:
        features['BOS'] = True  # Beginning of Sentence

    if i < len(sentence) - 1:
        word1 = sentence[i + 1]['words']
        # postag1 = sentence[i + 1]['pos_tag']
        features.update({
            '+1:word.lower()': word1.lower(),
            # '+1:postag': postag1,
        })
    else:
        features['EOS'] = True  # End of Sentence

    return features

In [28]:
def sentence2features(sentence):
    return [word2features(sentence, i) for i in range(len(sentence))]

In [29]:
# Extract features and labels
X_train = [sentence2features(s) for s in train_setence]
y_train = train_label

X_test = [sentence2features(s) for s in test_sentence]
y_test = test_label

X_dev = [sentence2features(s) for s in dev_sentence]
y_dev = dev_label

### Model Training

In [30]:
import sklearn_crfsuite

from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn.model_selection import train_test_split

In [31]:
# Initialize CRF model
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',  # You can also try 'saga' for larger datasets lbfgs
    c1=0.2,  # L1 regularization
    c2=0.1,  # L2 regularization
    max_iterations=100,  # Maximum number of iterations
    all_possible_transitions=True,
)

In [32]:
# Train the model
crf.fit(X_train, y_train)

In [33]:
# Predict on test data
y_pred = crf.predict(X_test)

# Calculate and display metrics
labels = list(crf.classes_)
labels.remove('O')  # Remove 'O' from evaluation
print(metrics.flat_classification_report(y_test, y_pred, labels=labels))

              precision    recall  f1-score   support

           B       0.69      0.52      0.59       445
           I       0.76      0.69      0.73       482

   micro avg       0.73      0.61      0.66       927
   macro avg       0.73      0.60      0.66       927
weighted avg       0.73      0.61      0.66       927



In [34]:
# save model using pickle file
import pickle

filename = '../models/trained_models/final_crf_model.sav'

pickle.dump(crf, open(filename, 'wb'))

### Model Fine-tuning

In [35]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer

# Define parameter space
params_space = {
    'c1': [0.1, 0.2, 0.5, 1.0],
    'c2': [0.1, 0.2, 0.5, 1.0],
}

# Use RandomizedSearchCV for hyperparameter optimization
rs = RandomizedSearchCV(
    estimator=crf,
    param_distributions=params_space,
    cv=3,
    verbose=1,
    n_iter=10,
    scoring=make_scorer(metrics.flat_f1_score, average='weighted', labels=labels)
)

# Perform search
rs.fit(X_train, y_train)
print("Best hyperparameters:", rs.best_params_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best hyperparameters: {'c2': 0.2, 'c1': 0.2}


In [36]:
# evaluate rs model

# Predict on test data
y_pred = rs.predict(X_test)

# Calculate and display metrics
labels = list(rs.classes_)
labels.remove('O')  # Remove 'O' from evaluation
print(metrics.flat_classification_report(y_test, y_pred, labels=labels))

              precision    recall  f1-score   support

           B       0.68      0.48      0.57       445
           I       0.77      0.67      0.72       482

   micro avg       0.73      0.58      0.65       927
   macro avg       0.73      0.58      0.64       927
weighted avg       0.73      0.58      0.64       927



In [37]:
def inference(model, sentence):
    sentence = get_pos_tag(sentence)
    sentence_features = sentence2features(sentence)
    result =  model.predict([sentence_features])[0]
    return list(zip([s['words'] for s in sentence], result))

In [38]:
# test inference function
sentence = "NLP stands for Natural Language Processing, a machine learning technology that allows computers to understand, interpret, and manipulate human language. NLP is a branch of artificial intelligence (AI) that combines computational linguistics, statistical modeling, machine learning, and deep learning."
inference(crf, sentence)

NameError: name 'pos_tags' is not defined

In [61]:
text = "Natural language processing (NLP) techniques, or NLP tasks, break down human text or speech into smaller parts that computer programs can easily understand. Common text processing and analyzing capabilities in NLP are given below."
inference(crf, text)

[('Natural', 'B'),
 ('language', 'I'),
 ('processing', 'I'),
 ('(', 'O'),
 ('NLP', 'B'),
 (')', 'O'),
 ('techniques', 'O'),
 (',', 'O'),
 ('or', 'O'),
 ('NLP', 'B'),
 ('tasks', 'I'),
 (',', 'O'),
 ('break', 'O'),
 ('down', 'O'),
 ('human', 'O'),
 ('text', 'O'),
 ('or', 'O'),
 ('speech', 'B'),
 ('into', 'O'),
 ('smaller', 'O'),
 ('parts', 'O'),
 ('that', 'O'),
 ('computer', 'O'),
 ('programs', 'O'),
 ('can', 'O'),
 ('easily', 'O'),
 ('understand', 'O'),
 ('.', 'O'),
 ('Common', 'B'),
 ('text', 'I'),
 ('processing', 'I'),
 ('and', 'O'),
 ('analyzing', 'O'),
 ('capabilities', 'O'),
 ('in', 'O'),
 ('NLP', 'B'),
 ('are', 'O'),
 ('given', 'O'),
 ('below', 'O'),
 ('.', 'O')]