In [13]:
%load_ext autoreload
%autoreload 2

In [14]:
import scipy
import numpy as np
import pandas as pd
import os,sys,inspect
import re

In [15]:
# The skseq library must be previously installed. Here we assume it is in '../skseq'
sys.path.insert(0,'../') 
import skseq

from skseq.sequences.sequence import Sequence
from skseq.sequences.sequence_list import SequenceList
from skseq.sequences.label_dictionary import LabelDictionary
import skseq.sequences.structured_perceptron as spc
import skseq.sequences.extended_feature as exfc

from utils.utils import build_sequence_list, evaluate_corpus

In [9]:
df_train = pd.read_csv('data/train_data_ner.csv')
tag_dict = LabelDictionary(label_names=set(df_train.tags))

# Generate dictionary of words for training dataset
word_dict_train = LabelDictionary(label_names=set(df_train.words))

# Build sequence list
train_seq = build_sequence_list(df_train[df_train.sentence_id.isin(range(20))], word_dict_train, tag_dict)

# Define features
# feature_mapper = skseq.sequences.id_feature.IDFeatures(train_seq)

feature_mapper = exfc.ExtendedFeatures(train_seq) 
feature_mapper.build_features()

inv_feature_dict = {word: pos for pos, word in feature_mapper.feature_dict.items()}

In [10]:
id_seq = 6
print(train_seq[id_seq])
print(train_seq[id_seq].to_words(train_seq))

5992/4 23033/4 31839/4 27200/4 1785/4 2978/4 25132/4 
Some/O 1,27,000/O people/O are/O known/O dead/O ./O 


In [8]:
# without extendend features
feature_mapper.get_sequence_features(train_seq[id_seq])

([[0]],
 [[3], [3], [3], [3], [3], [3]],
 [[28]],
 [[132], [133], [106], [47], [134], [135], [27]])

In [11]:
# with extended features
feature_mapper.get_sequence_features(train_seq[id_seq])

([[0]],
 [[4], [4], [4], [4], [4], [4]],
 [[31]],
 [[146, 2], [147], [120], [52], [148], [149], [30]])

In [12]:
inv_feature_dict[2]

'init_caps::O'

Other features are identified by starting with **uppercased**, **suffix**, **preffix** etc...

- **uppercased:** when they contain the current word with an uppercased letter
    - Example: **``uppercased::noun``** is a feature stating that current word is uppercased and the current tag is a noun.

- **prefix:** when the current word contains a certain prefix.
    - Example: prefix:Eli::noun

- **suffix:** when the current word contains a certain suffix.
    - Example: suffix:ing:verb
    
| Conditions to be met for some of the most typical POS features     |    Name      |
| ----------------                                | ----------------    |
| $x_i=w_j ,\,\,  w_j \text{ is uppercased } ,\,\,  y_i=c_k$                 | Upper case features|
| $x_i=w_j ,\,\,  w_j \text{ contains digit} ,\,\,  y_i=c_k$                 | Digit features|
| $x_i=w_j ,\,\,  w_j \text{ contains hyphen} ,\,\,  y_i=c_k$                 | Hyphen features|
| $x_i=w_j ,\,\,  w_j[0:i] \in P_{set}  \forall i \in \{1,2,3\}  ,\,\,  y_i=c_k$                 | Prefix features|
| $x_i=w_j ,\,\,  w_j[-i] \in S_{set}  \forall i \in \{1,2,3\}  ,\,\,  y_i=c_k$                 | Suffix features|

In [34]:
# Geo: country
# Tim: time (day, month, year, recent, today...)
# Org: organization (US, United states, al-Qaida)
# Per: Person (Mr, Ms, prime minister, alba)
# Art: facebook, english, ... ??
# GPE: nationality
# EVE: event (olympic games, ramadan...)
# NAT: natural event (ebola, hurricane, aids, katrina

In [None]:
df_train.loc[df_train.tags == 'B-tim','words'].value_counts().head(150).tail(50)

<b> HMM-like emission features </b>
* X HMM-like emission

<b> Small features </b>
* X Initialism
* X Is digit
* X 1 digit
* X 2 digits
* X 4 digits
* X digits + s
* X digits + st
* X digits + th
* X Is floating point
* X Word contains dot
* X Word contains hypend
* X Word contains apostrophe
* X First letter is uppercase
* X All letters are uppercase
* X Mixed case.
* X word length (1-10,10-15,>15)
* X Word between quotes
* Change y to lowercase once uppercase info is stored in a variable?

<b> Local knowledge </b>
* wi, wi+1, wi-1
* more extensive local kwnoledge for capitalized words ??
* suffixes 
* prefixes
* lemmatizers
* stemmers

<b> POS tags </b>

<b> Words clustering </b>
* with word2vec

<b> Gazetteer </b>
* wikipedia articles


In [None]:
def add_emission_features(self, sequence, pos, y, features):
    x = sequence.x[pos]
    
    # Words surrounding x
    #if pos != 0:
    #    x_pre = sequence.x[pos-1]
    #if pos != len(sequence):
    #    x_post = sequence.x[pos+1]
        
    # Get tag name from ID.
    y_name = self.dataset.y_dict.get_label_name(y)
    # Get word name from ID.
    if isinstance(x, str):
        x_name = x
    else:
        x_name = self.dataset.x_dict.get_label_name(x)
        #x_pre = self.dataset.x_dict.get_label_name(x_pre)
        #x_post = self.dataset.x_dict.get_label_name(x_post)
        
    word = str(x_name)
    
    # HMM-LIKE EMISSION FEATURES
    feat_name = f"id:{word}::{y_name}" # Generate feature name.
    feat_id = self.add_feature(feat_name) # Get feature ID from name.
    # Append feature.
    if feat_id != -1:
        features.append(feat_id)
    
    # WORD STRUCTURE FEATURES
    # Feature: first letter is capitalized
    if word.istitle():
        feat_name = f"uppercased:first::{y_name}"
        feat_id = self.add_feature(feat_name) 

        if feat_id != -1:
            features.append(feat_id)

    # Feature: all letters are uppercase
    if word.isupper():
        feat_name = f"uppercased:all::{y_name}"
        feat_id = self.add_feature(feat_name)

        if feat_id != -1:
            features.append(feat_id)

    # Feature: is digit
    if str.isdigit(word):
        feat_name = f"digit::{y_name}"
        feat_id = self.add_feature(feat_name)

        if feat_id != -1:
            features.append(feat_id)
        
        # Features: digits of length 1, 2 or 4 
        for i in [1, 2, 4]:
            if len(word) == i:
                feat_name = f"digit:{str(i)}::{y_name}"
                feat_id = self.add_feature(feat_name)
                if feat_id != -1:
                    features.append(feat_id)                  
    
    # Feature: is floating point number
    try:
        float(word)

        feat_name = f"float::{y_name}"
        feat_id = self.add_feature(feat_name)

        if feat_id != -1:
            features.append(feat_id)
    except ValueError:
        pass


    # Feature: word contains dot
    if len(word) > 1 and '.' in word:
        feat_name = f"has_dot::{y_name}"
        feat_id = self.add_feature(feat_name)

        if feat_id != -1:
            features.append(feat_id)

    # Features: word length
    for i in range(1, 11):
        if len(word) == i:
            feat_name = f"length:{str(i)}::{y_name}"
            feat_id = self.add_feature(feat_name)

            if feat_id != -1:
                features.append(feat_id)
                
    if len(word) > 10 & len(word) <= 15:
        feat_name = f"length:10-15::{y_name}"
        feat_id = self.add_feature(feat_name)
        if feat_id != -1:
            features.append(feat_id)
            
    if len(word) > 15:
        feat_name = f"length:gt15::{y_name}"
        feat_id = self.add_feature(feat_name)
        if feat_id != -1:
            features.append(feat_id) 
            
    # Features: detect several patterns
    patterns = [r"(\w\.){2,}", r"\d+s", r"\d+st", r"\d+nd", r"\d+rd", r"\d+th", r"\-", r"(^\'\w+\'$)|(^\"\w+\"$)", "([A-Z]+[a-z]+)|([a-z]+[A-Z]+)", r"\'"]
    labels = ["initialism","digit:s","digit:st","digit:nd","digit:rd","digit:th","hypend","quote","uppercased:mixed","apostrophe"]
    for pattern, label in zip(patterns, labels):
        regx = re.compile(pattern)

        if bool(regx.match(word)):
            print(f"{label}::{y_name}")
            
    # CONTEXTUAL FEATURES        

    return features


* wi, wi+1, wi-1
* more extensive local kwnoledge for capitalized words ??
* suffixes 
* prefixes
* lemmatizers
* stemmers    