In [3]:
import nltk
import sklearn
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [26]:
import gensim

# Read data

In [7]:
def read_file(file_name):
    """Read a labelled, blank-line-delimited token file.

    Every non-empty line holds a token and its label separated by a tab;
    an empty line terminates the current sentence.

    Args:
        file_name: Path of the file to read.

    Returns:
        A ``(data, labels)`` tuple of parallel lists: ``data[i]`` is the list
        of tokens of sentence ``i`` and ``labels[i]`` the matching labels.

    Raises:
        IndexError: If a non-empty line contains no tab character.
    """
    data, labels = [], []
    tokens, tags = [], []

    # The ``with`` block closes the file; no explicit close() is needed.
    with open(file_name) as f:
        for line in f.read().splitlines():
            if line != '':
                fields = line.split('\t')
                tokens.append(fields[0])
                tags.append(fields[1])
            else:
                # Blank line: the sentence collected so far is complete.
                data.append(tokens)
                labels.append(tags)
                tokens, tags = [], []

    # Keep the final sentence when the file does not end with a blank line;
    # previously it was silently dropped.
    if tokens:
        data.append(tokens)
        labels.append(tags)

    return data, labels

In [8]:
def read_no_labels(test_file_name):
    """Read an unlabelled, blank-line-delimited token file.

    Every non-empty line is taken verbatim as one token; an empty line
    terminates the current sentence.

    Args:
        test_file_name: Path of the file to read.

    Returns:
        A list of sentences, each a list of token strings.
    """
    data = []
    tokens = []

    # The ``with`` block closes the file; no explicit close() is needed.
    with open(test_file_name) as f:
        for line in f.read().splitlines():
            if line != '':
                tokens.append(line)
            else:
                # Blank line: the sentence collected so far is complete.
                data.append(tokens)
                tokens = []

    # Keep the final sentence when the file does not end with a blank line;
    # previously it was silently dropped.
    if tokens:
        data.append(tokens)

    return data

In [32]:
# Load the labelled training and development sets; read_file returns the
# sentences (lists of tokens) and the parallel lists of labels.
train_data, train_labels = read_file('data/train/train.txt')
dev_data, dev_labels = read_file('data/dev/dev.txt')

In [33]:
# Sanity check: show the first token of the first training sentence.
train_data[0][0]

'@paulwalk'

In [34]:
# Load the unlabelled test sentences (one token per line, no label column).
test_data = read_no_labels('data/test/test.nolabels.txt')

# Feature Engineering

## Word2Vec embeddings and POS tagging

In [31]:
# Load Google's pre-trained Word2Vec model.
# binary=True because the .bin file is read in the binary word2vec format.
# The resulting global `model` is the lookup table used by get_word_vec.
model = gensim.models.KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin', binary=True)

In [60]:
def get_word_vec(word):
    """Look up the embedding of ``word`` in the global ``model``.

    Returns ``model[word]`` when the lookup succeeds. When it raises
    ``KeyError``, a list of 300 ``'NULL'`` strings is returned instead.
    """
    try:
        vector = model[word]
    except KeyError:
        # Placeholder for words the lookup cannot resolve.
        vector = ['NULL'] * 300
    return vector

In [36]:
# Replace each token list with NLTK's (token, POS tag) pairs.
# NOTE: the inputs are overwritten in place, so re-running this cell without
# reloading the data would feed the already tagged pairs back into pos_tag.
train_data = [nltk.pos_tag(sentence) for sentence in train_data]
dev_data = [nltk.pos_tag(sentence) for sentence in dev_data]
test_data = [nltk.pos_tag(sentence) for sentence in test_data]

## Word features

In [71]:
def Word2Features(sentence, pos):
    """Build the feature dict for the token at index ``pos`` of ``sentence``.

    ``sentence`` is a sequence of (word, POS tag) pairs. The result merges,
    in this order: features of the current word, its word-vector features,
    features of the previous word (or the BOS marker for the first token),
    features of the next word (or the EOS marker for the last token), and
    the POS-tag features.
    """
    word = sentence[pos][0]
    is_first = pos <= 0
    is_last = pos >= len(sentence) - 1

    features = {}
    features.update(current_word_features(word))
    features.update(w2vfeatuers(word))

    # Left context, or a sentence-start marker when there is none.
    features.update(
        begin_of_sentence() if is_first
        else prev_word_features(sentence[pos - 1][0])
    )

    # Right context, or a sentence-end marker when there is none.
    features.update(
        end_of_sentence() if is_last
        else next_word_features(sentence[pos + 1][0])
    )

    features.update(tag_features(sentence, pos))
    return features
        
def current_word_features(word):
    """Return the surface-form features of the current token.

    The dict holds the lowercased word, its last 4/3/2 characters, and the
    results of ``isupper``, ``istitle`` and ``isdigit``.
    (A constant ``'bias': 1.0`` feature is currently disabled.)
    """
    features = {'lower': word.lower()}

    # Trailing character n-grams, longest first.
    for key, length in (('suffix_4', 4), ('suffix_3', 3), ('suffix_2', 2)):
        features[key] = word[-length:]

    # Orthographic shape flags.
    for key, check in (
        ('isupper', word.isupper),
        ('istitle', word.istitle),
        ('isdigit', word.isdigit),
    ):
        features[key] = check()

    return features

def w2vfeatuers(word):
    """Turn the word vector of ``word`` into one feature per component.

    Component ``i`` of ``get_word_vec(word)`` is stored under the key
    ``'wv_value<i>'``.
    """
    vector = get_word_vec(word)
    return {
        'wv_value' + str(position): component
        for position, component in enumerate(vector)
    }

def prev_word_features(word):
    """Return the features describing the token to the left of the current one."""
    return dict(
        prev_lower=word.lower(),
        prev_istitle=word.istitle(),
        prev_isupper=word.isupper(),
    )

def next_word_features(word):
    """Return the features describing the token to the right of the current one."""
    features = {}
    features['next_lower'] = word.lower()
    features['next_istitle'] = word.istitle()
    features['next_isupper'] = word.isupper()
    return features

def tag_features(sentence, pos):
    """Return POS-tag unigram, bigram and trigram features around ``pos``.

    ``sentence`` is a sequence of (word, POS tag) pairs. A missing left or
    right neighbour is represented by the string ``'NULL'``.
    """
    current = sentence[pos][1]
    previous = 'NULL' if pos <= 0 else sentence[pos - 1][1]
    following = 'NULL' if pos >= len(sentence) - 1 else sentence[pos + 1][1]

    return {
        'pos[0]': current,
        'pos[-1]|pos[0]': '|'.join((previous, current)),
        'pos[0]|pos[+1]': '|'.join((current, following)),
        'pos[-1]|pos[0]|pos[+1]': '|'.join((previous, current, following)),
    }

def begin_of_sentence():
    """Return the marker feature used for the first token of a sentence."""
    return dict(BOS=True)

def end_of_sentence():
    """Return the marker feature used for the last token of a sentence."""
    return dict(EOS=True)

In [72]:
# One feature dict per token, grouped by sentence; the gold labels are used as-is.
X_train = [
    [Word2Features(sentence, idx) for idx in range(len(sentence))]
    for sentence in train_data
]
y_train = train_labels

X_dev = [
    [Word2Features(sentence, idx) for idx in range(len(sentence))]
    for sentence in dev_data
]
y_dev = dev_labels

In [73]:
X_train[0][0]

{'BOS': True,
 'isdigit': False,
 'istitle': False,
 'isupper': False,
 'lower': '@paulwalk',
 'next_istitle': True,
 'next_isupper': False,
 'next_lower': 'it',
 'pos[-1]|pos[0]': 'NULL|VB',
 'pos[-1]|pos[0]|pos[+1]': 'NULL|VB|PRP',
 'pos[0]': 'VB',
 'pos[0]|pos[+1]': 'VB|PRP',
 'suffix_2': 'lk',
 'suffix_3': 'alk',
 'suffix_4': 'walk',
 'wv_value0': 'NULL',
 'wv_value1': 'NULL',
 'wv_value10': 'NULL',
 'wv_value100': 'NULL',
 'wv_value101': 'NULL',
 'wv_value102': 'NULL',
 'wv_value103': 'NULL',
 'wv_value104': 'NULL',
 'wv_value105': 'NULL',
 'wv_value106': 'NULL',
 'wv_value107': 'NULL',
 'wv_value108': 'NULL',
 'wv_value109': 'NULL',
 'wv_value11': 'NULL',
 'wv_value110': 'NULL',
 'wv_value111': 'NULL',
 'wv_value112': 'NULL',
 'wv_value113': 'NULL',
 'wv_value114': 'NULL',
 'wv_value115': 'NULL',
 'wv_value116': 'NULL',
 'wv_value117': 'NULL',
 'wv_value118': 'NULL',
 'wv_value119': 'NULL',
 'wv_value12': 'NULL',
 'wv_value120': 'NULL',
 'wv_value121': 'NULL',
 'wv_value122': 'NU

In [74]:
# One feature dict per token of every test sentence (no labels available).
X_test = [
    [Word2Features(sentence, idx) for idx in range(len(sentence))]
    for sentence in test_data
]

In [75]:
%%time
# L-BFGS training; c1 and c2 are the L1 and L2 regularisation coefficients.
crf_params = dict(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    # Also generate transition features for label pairs unseen in training.
    all_possible_transitions=True,
)
crf = sklearn_crfsuite.CRF(**crf_params)
crf.fit(X_train, y_train)

CPU times: user 34.8 s, sys: 2.05 s, total: 36.9 s
Wall time: 38 s


In [76]:
# Labels to score: every class the CRF learned except 'O'.
# Filtering (rather than list.remove) does not raise ValueError when the
# training data happens to contain no 'O' label.
labels = [label for label in crf.classes_ if label != 'O']
labels

['B', 'I']

In [77]:
# Predict label sequences for the training sentences themselves.
y_train_pred = crf.predict(X_train)

In [78]:
# Weighted F1 on the development set, restricted to the classes in `labels`.
y_dev_pred = crf.predict(X_dev)
metrics.flat_f1_score(
    y_dev,
    y_dev_pred,
    average='weighted',
    labels=labels,
)

0.58742668427362776

In [79]:
%%time
# Predict label sequences for the unlabelled test sentences.
y_test_pred = crf.predict(X_test)

CPU times: user 3.9 s, sys: 1.93 s, total: 5.83 s
Wall time: 7.38 s


In [81]:
# Write the training-set predictions: one label per line, with an empty line
# after each sentence. The `with` block closes the file even if a write fails.
with open("output-train", 'w') as f:
    for label_sentence in y_train_pred:
        for label_word in label_sentence:
            f.write(label_word + '\n')
        f.write('\n')