In [1]:
# 1. Data Loading and Preprocessing
def load_data_from_folder(folder_path):
    sentences = []
    sentence = []
    
    # Read all .txt files in the folder
    for file in os.listdir(folder_path):
        if file.endswith('.txt'):
            with open(os.path.join(folder_path, file), 'r', encoding='utf-8') as f:
                for line in f:
                    if line.strip():  # Skip empty lines
                        # Split by tabs
                        parts = line.strip().split('\t')
                        if len(parts) == 4:
                            word, pos, ner, cls = parts
                            sentence.append([word, pos, ner, cls])
                    else:
                        if sentence:
                            sentences.append(sentence)
                            sentence = []
    return sentences

# 2. Feature Engineering
def create_features(word, pos, is_first, is_last):
    return {
        'word': word,
        'pos': pos,
        'is_first': is_first,
        'is_last': is_last,
        'is_capitalized': word[0].isupper(),
        'word_length': len(word),
        'prefix-1': word[0],
        'prefix-2': word[:2],
        'suffix-1': word[-1],
        'suffix-2': word[-2:],
    }

In [2]:
!pip install sklearn-crfsuite




[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Using CRF (Conditional Random Fields) which works well for sequence labeling
from sklearn_crfsuite import CRF

# Initialize CRF model
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

In [4]:
import os
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score
import numpy as np

def prepare_data(sentences):
    X = []
    y = []
    
    for sentence in sentences:
        sentence_features = []
        sentence_labels = []
        
        for i, (word, pos, ner, _) in enumerate(sentence):
            features = create_features(
                word=word,
                pos=pos,
                is_first=(i == 0),
                is_last=(i == len(sentence)-1)
            )
            sentence_features.append(features)
            sentence_labels.append(ner)
            
        X.append(sentence_features)
        y.append(sentence_labels)
    
    return X, y

# Load data
train_sentences = load_data_from_folder('train')
test_sentences = load_data_from_folder('test')
eval_sentences = load_data_from_folder('eval')

# Prepare data
X_train, y_train = prepare_data(train_sentences)
X_test, y_test = prepare_data(test_sentences)

# Train model
crf.fit(X_train, y_train)

# Predict
y_pred = crf.predict(X_test)

# Evaluate
f1_macro = flat_f1_score(y_test, y_pred, average='macro')
print(f'F1-macro score: {f1_macro:.4f}')

F1-macro score: nan


In [7]:
train_sentences

[]

In [6]:
y_pred

array([], dtype=object)