In [29]:
def prefix_extract(word):
    """Return the longest known English prefix that *word* starts with.

    Matching is case-sensitive against a fixed list of lowercase prefixes.

    Args:
        word: Token text to inspect.

    Returns:
        ``None`` when *word* is empty/falsy, the matching prefix when one
        exists, otherwise the placeholder ``'@@'``.
    """
    if not word:
        return None
    prefixes = ["anti", "dis", "extra", "inter", "pre", "re", "sub", "un", "in", "im", "ir", "il", "over", "under", "trans", "mis", "non", "co", "com", "con", "de", "auto", "bio", "geo", "psycho"]

    # Take the longest match instead of the first one in list order: with
    # first-match, "un" shadowed "under" and "co" shadowed "com"/"con", so
    # those entries could never be returned.
    return max(
        (prefix for prefix in prefixes if word.startswith(prefix)),
        key=len,
        default='@@',
    )

def suffix_extract(word):
    """Return the longest known English suffix that *word* ends with.

    Matching is case-sensitive against a fixed list of lowercase suffixes.

    Args:
        word: Token text to inspect.

    Returns:
        ``None`` when *word* is empty/falsy, the matching suffix when one
        exists, otherwise the placeholder ``'@@'``.
    """
    if not word:
        return None
    suffixes = ["able", "ible", "ation", "ment", "ness", "ity", "ty", "ly", "ing", "ed", "ize", "ise", "ful", "less", "ous", "ive", "al", "er", "or", "ism", "ist", "ship", "hood", "th", "en", "ify", "ward", "wise"]

    # Take the longest match instead of the first one in list order: with
    # first-match, "ise" shadowed "wise", so "wise" could never be returned.
    return max(
        (suffix for suffix in suffixes if word.endswith(suffix)),
        key=len,
        default='@@',
    )

In [30]:
file_path = '../data/WSJ_02-21.pos-chunk'

# `sentences` collects finished sentences; `sentence` accumulates the
# tab-separated field lists of the sentence currently being read.
sentences = []
sentence = []

with open(file_path, 'r') as f:
    for line in f:
        stripped = line.strip()
        if stripped:
            # One token per non-blank line; keep its fields as a list.
            sentence.append(stripped.split('\t'))
        elif sentence:
            # A blank line closes the sentence in progress.
            sentences.append(sentence)
            sentence = []

# Flush the final sentence when the file does not end with a blank line.
if sentence:
    sentences.append(sentence)


import nltk
# Shared English Snowball stemmer instance; stem() delegates to it.
stemmer = nltk.stem.SnowballStemmer('english')

def stem(word):
    """Return the stem of *word* as produced by the module-level ``stemmer``."""
    stemmed = stemmer.stem(word)
    return stemmed

def is_capitalized(word):
    """Return True when the first character of *word* is uppercase.

    An empty string yields False; slicing is used instead of ``word[0]`` so
    that an empty token does not raise IndexError.
    """
    return word[:1].isupper()

def extract_features(sentences):
    """Build one feature dict per token of BIO-labelled sentences.

    Args:
        sentences: Iterable of sentences; each sentence is a sequence of
            ``(word, pos, bio_tag)`` triples.

    Returns:
        A flat list of ``(feature_dict, bio_tag)`` pairs in token order, with
        a ``('NEWLINE', None)`` marker appended after every sentence. Context
        features whose position falls outside the sentence are ``"@@"``.
    """
    pad = "@@"

    def pick(items, j):
        # Item j of a per-sentence column, or the pad marker out of range.
        return items[j] if 0 <= j < len(items) else pad

    features = []
    for sentence in sentences:
        size = len(sentence)
        words = [token[0] for token in sentence]
        poses = [token[1] for token in sentence]
        tags = [token[2] for token in sentence]
        # Stem each word once per sentence; the context features below reuse
        # these instead of re-stemming the same word for every neighbour.
        stems = [stem(w) for w in words]

        for i, (word, pos, bio_tag) in enumerate(sentence):
            # PREVIOUS_TAG / NEXT_TAG are read from the labels in the input.
            # NOTE(review): confirm the consumer of these features can supply
            # them at prediction time.
            feature = {
                'WORD': word,
                'STEM': stems[i],
                'POS': pos,
                'LENGTH': len(word),
                'PREFIX': prefix_extract(word),
                'SUFFIX': suffix_extract(word),
                # Relative position of the token within its sentence.
                'POSITION': round(i / size, 2),
                'CAPITALIZED': is_capitalized(word),
                'PREVIOUS_TAG': pick(tags, i - 1),
                'PREVIOUS_POS': pick(poses, i - 1),
                'PREVIOUS_WORD': pick(words, i - 1),
                'PREVIOUS_STEM': pick(stems, i - 1),
                'PREVIOUS_2_POS': pick(poses, i - 2),
                'PREVIOUS_2_WORD': pick(words, i - 2),
                'PREVIOUS_2_STEM': pick(stems, i - 2),
                'PREVIOUS_3_STEM': pick(stems, i - 3),
                'NEXT_TAG': pick(tags, i + 1),
                'NEXT_POS': pick(poses, i + 1),
                'NEXT_WORD': pick(words, i + 1),
                'NEXT_STEM': pick(stems, i + 1),
                'NEXT_2_POS': pick(poses, i + 2),
                'NEXT_2_WORD': pick(words, i + 2),
                'NEXT_2_STEM': pick(stems, i + 2),
            }
            features.append((feature, bio_tag))
        # Marker telling the writer to emit a sentence-separating blank line.
        features.append(('NEWLINE', None))
    return features

features = extract_features(sentences)

# Forward slashes are accepted on Windows and POSIX alike; the previous
# backslash path only resolved to ../bin on Windows and was inconsistent with
# the forward-slash input path.
with open('../bin/a.features', 'w') as f:
    # The file starts with an empty line (behaviour kept from the original).
    f.write("\n")
    for feature, bio_tag in features:
        if feature == 'NEWLINE':
            # Blank line between sentences.
            f.write('\n')
            continue
        # The word is written bare; every other feature as key=value. Each
        # cell is followed by a tab and the BIO tag closes the line.
        cells = [
            f"{value}" if key == 'WORD' else f"{key}={value}"
            for key, value in feature.items()
        ]
        f.write("".join(f"{cell}\t" for cell in cells) + f"{bio_tag}\n")

In [31]:
# Alternate input: file_path = '../data/WSJ_23.pos'
file_path = '../data/WSJ_24.pos'

# `sentences` collects finished sentences; `sentence` accumulates the
# tab-separated field lists of the sentence currently being read.
sentences = []
sentence = []

with open(file_path, 'r') as f:
    for line in f:
        stripped = line.strip()
        if stripped:
            # One token per non-blank line; keep its fields as a list.
            sentence.append(stripped.split('\t'))
        elif sentence:
            # A blank line closes the sentence in progress.
            sentences.append(sentence)
            sentence = []

# Flush the final sentence when the file does not end with a blank line.
if sentence:
    sentences.append(sentence)

def stem(word):
    """Return the stem of *word* as produced by the module-level ``stemmer``."""
    stemmed = stemmer.stem(word)
    return stemmed

def is_capitalized(word):
    """Return True when the first character of *word* is uppercase.

    An empty string yields False; slicing is used instead of ``word[0]`` so
    that an empty token does not raise IndexError.
    """
    return word[:1].isupper()

def extract_features(sentences):
    """Build one feature dict per token of unlabeled (word, POS) sentences.

    Args:
        sentences: Iterable of sentences; each sentence is a sequence of
            ``(word, pos)`` pairs.

    Returns:
        A flat list of feature dicts in token order, with the string
        ``'NEWLINE'`` appended after every sentence. Context features whose
        position falls outside the sentence are ``"@@"``.
    """
    pad = "@@"

    def pick(items, j):
        # Item j of a per-sentence column, or the pad marker out of range.
        return items[j] if 0 <= j < len(items) else pad

    features = []
    for sentence in sentences:
        size = len(sentence)
        words = [token[0] for token in sentence]
        poses = [token[1] for token in sentence]
        # Stem each word once per sentence; the context features below reuse
        # these instead of re-stemming the same word for every neighbour.
        stems = [stem(w) for w in words]

        for i, (word, pos) in enumerate(sentence):
            feature = {
                'WORD': word,
                'STEM': stems[i],
                'POS': pos,
                'LENGTH': len(word),
                'PREFIX': prefix_extract(word),
                'SUFFIX': suffix_extract(word),
                # Relative position of the token within its sentence.
                'POSITION': round(i / size, 2),
                'CAPITALIZED': is_capitalized(word),
                'PREVIOUS_POS': pick(poses, i - 1),
                'PREVIOUS_WORD': pick(words, i - 1),
                'PREVIOUS_STEM': pick(stems, i - 1),
                'PREVIOUS_2_POS': pick(poses, i - 2),
                'PREVIOUS_2_WORD': pick(words, i - 2),
                'PREVIOUS_2_STEM': pick(stems, i - 2),
                'NEXT_POS': pick(poses, i + 1),
                'NEXT_WORD': pick(words, i + 1),
                'NEXT_STEM': pick(stems, i + 1),
                'NEXT_2_POS': pick(poses, i + 2),
                'NEXT_2_WORD': pick(words, i + 2),
                'NEXT_2_STEM': pick(stems, i + 2),
            }
            features.append(feature)
        # Marker telling the writer to emit a sentence-separating blank line.
        features.append('NEWLINE')
    return features

features = extract_features(sentences)

# Forward slashes are accepted on Windows and POSIX alike; the previous
# backslash path only resolved to ../bin on Windows and was inconsistent with
# the forward-slash input path.
with open('../bin/b.features', 'w') as f:
    # The file starts with an empty line (behaviour kept from the original).
    f.write("\n")
    for feature in features:
        if feature == 'NEWLINE':
            # Blank line between sentences.
            f.write('\n')
            continue
        # The word is written bare; every other feature as key=value. Each
        # cell is followed by a tab, so lines end with a trailing tab.
        cells = [
            f"{value}" if key == 'WORD' else f"{key}={value}"
            for key, value in feature.items()
        ]
        f.write("".join(f"{cell}\t" for cell in cells) + "\n")