In [1]:
import nltk
import sklearn
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import gensim



# Read data

### Read input data

In [2]:
def read_file(file_name):
    with open(file_name, encoding='utf-8') as f:
        data = []
        data_temp = []
        labels = []
        labels_temp = []

        for line in f.read().splitlines():
            if line != '':  
                a = line.split('\t')
                data_temp.append(a[0])
                labels_temp.append(a[1])
            else:
                data.append(data_temp)
                labels.append(labels_temp)
                data_temp = []
                labels_temp = []

    f.close()
    return data, labels

In [3]:
def read_no_labels(test_file_name):
    with open(test_file_name, encoding='utf-8') as f:
        data = []
        data_temp = []

        for line in f.read().splitlines():
            if line != '':  
                data_temp.append(line)
            else:
                data.append(data_temp)
                data_temp = []
    f.close()
    return data

In [4]:
train_data, train_labels = read_file('data/train/train.txt')
dev_data, dev_labels = read_file('data/dev/dev.txt')

In [5]:
train_data[0][0]

'@paulwalk'

In [6]:
test_data = read_no_labels('data/test/test.nolabels.txt')

## Read external sources

In [48]:
with open("data/train/eng.list", encoding='utf-8') as f:
    eng_data = []
    eng_labels = []

    for line in f.read().splitlines():
        if line != '':  
            temp = line.split(' ')
            eng_data.append(temp[1:])
            eng_labels.append(['B'] + ['I'] * (len(temp)-2))
f.close()

# Feature Engineering

## Word Embedding

In [7]:
# Load Google's pre-trained Word2Vec model.
model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)

In [8]:
def get_word_vec(word):
    try:
        return model[word]
    except KeyError:
        return ['NULL']*300

## POS & Chunk tagging

In [9]:
grammar= r"""
  NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
  PP: {<IN><NP>}               # Chunk prepositions followed by NP
  VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
  CLAUSE: {<NP><VP>}           # Chunk NP, VP
  """
cp=nltk.RegexpParser(grammar);

def chunck_tag(sentence):
    tree = cp.parse(sentence)
    return nltk.chunk.tree2conlltags(tree.flatten())

In [10]:
train_data = [chunck_tag(nltk.pos_tag(train_data[i])) for i in range(len(train_data))]
dev_data = [chunck_tag(nltk.pos_tag(dev_data[i])) for i in range(len(dev_data))]
test_data = [chunck_tag(nltk.pos_tag(test_data[i])) for i in range(len(test_data))]

## Word Feature

In [38]:
def Word2Features(sentence, pos):
    features = {}
    features.update(current_word_features(sentence[pos][0]))
    features.update(w2vfeatuers(sentence[pos][0]))
    if pos > 0:
        features.update(prev_word_features(sentence[pos-1][0]))
    else:
        features.update(begin_of_sentence())
    
    if pos < len(sentence)-1:
        features.update(next_word_features(sentence[pos+1][0]))
    else:
        features.update(end_of_sentence())
        
    features.update(tag_features(sentence,pos))
#     features.update(chunk_features(sentence,pos))
    return features

def w2vfeatuers(word):
    w2vfeatures = {}
    for index, letter in enumerate(get_word_vec(word)):
        w2vfeatures.update({'wv_value'+str(index): letter})
    return w2vfeatures

def current_word_features(word):
    return {
        'bias': 1.0,
        'lower': word.lower(),
        'suffix_4': word[-4:],
        'suffix_3': word[-3:],
        'suffix_2': word[-2:],
        'isupper': word.isupper(),
        'istitle': word.istitle(),
        'isdigit': word.isdigit(),
    }


def prev_word_features(word):
    return {
        'prev_lower': word.lower(),
        'prev_istitle': word.istitle(),
        'prev_isupper': word.isupper(),
    }

def next_word_features(word):
    return {
        'next_lower': word.lower(),
        'next_istitle': word.istitle(),
        'next_isupper': word.isupper(),
    }

def tag_features(sentence, pos):
    pos_features = {'pos[0]': sentence[pos][1]}
    prev_prev_pos_tag = sentence[pos-2][1] if pos > 1 else 'START'
    prev_pos_tag = sentence[pos-1][1] if pos > 0 else 'START'
    next_pos_tag = sentence[pos+1][1] if pos < len(sentence)-1 else 'END'
    next_next_pos_tag = sentence[pos+2][1] if pos < len(sentence)-2 else 'END'
    pos_features.update({
#         'pos[-2]': prev_prev_pos_tag,
#         'pos[-1]': prev_pos_tag,
#         'pos[+1]': next_pos_tag,
#         'pos[+2]': next_next_pos_tag,
        'pos[-2]|pos[-1]': prev_prev_pos_tag + '|' + prev_pos_tag,
        'pos[-1]|pos[0]': prev_pos_tag + '|' + sentence[pos][1],
        'pos[0]|pos[+1]': sentence[pos][1] + '|' + next_pos_tag,
        'pos[+1]|pos[+2]': next_pos_tag + '|' + next_next_pos_tag,
#         'pos[-2]|pos[-1]|pos[0]': prev_prev_pos_tag + '|' + prev_pos_tag + '|' + sentence[pos][1],
#         'pos[-1]|pos[0]|pos[+1]': prev_pos_tag + '|' + sentence[pos][1] + '|' + next_pos_tag,
#         'pos[0]|pos[+1]|pos[+2]': sentence[pos][1] + '|' + next_pos_tag + '|' + next_next_pos_tag,        
    })
    return pos_features

def chunk_features(sentence, pos):
    chunk_features = {'chunk[0]': sentence[pos][2]}
    prev_chunk_tag = sentence[pos-1][2] if pos > 0 else 'NULL'
    next_chunk_tag = sentence[pos+1][2] if pos < len(sentence)-1 else 'NULL'
    chunk_features.update({
        'chunk[-1]': prev_chunk_tag,
        'chunk[+1]': next_chunk_tag,
        'chunk[-1]|chunk[0]': prev_chunk_tag + '|' + sentence[pos][2],
        'chunk[0]|chunk[+1]': sentence[pos][2] + '|' + next_chunk_tag,
    })
    return chunk_features

def begin_of_sentence():
    return {'BOS': True}

def end_of_sentence():
    return {'EOS': True}

In [39]:
X_train = [[Word2Features(s, pos) for pos in range(len(s))] for s in train_data]
y_train = train_labels

X_dev = [[Word2Features(s, pos) for pos in range(len(s))] for s in dev_data]
y_dev = dev_labels

In [35]:
X_train[0][0]

{'BOS': True,
 'bias': 1.0,
 'isdigit': False,
 'istitle': False,
 'isupper': False,
 'lower': '@paulwalk',
 'next_istitle': True,
 'next_isupper': False,
 'next_lower': 'it',
 'pos[+1]': 'PRP',
 'pos[+1]|pos[+2]': 'PRP|VBZ',
 'pos[+2]': 'VBZ',
 'pos[-1]': 'START',
 'pos[-1]|pos[0]': 'START|VB',
 'pos[-2]': 'START',
 'pos[-2]|pos[-1]': 'START|START',
 'pos[0]': 'VB',
 'pos[0]|pos[+1]': 'VB|PRP',
 'wv_value0': 'NULL',
 'wv_value1': 'NULL',
 'wv_value10': 'NULL',
 'wv_value100': 'NULL',
 'wv_value101': 'NULL',
 'wv_value102': 'NULL',
 'wv_value103': 'NULL',
 'wv_value104': 'NULL',
 'wv_value105': 'NULL',
 'wv_value106': 'NULL',
 'wv_value107': 'NULL',
 'wv_value108': 'NULL',
 'wv_value109': 'NULL',
 'wv_value11': 'NULL',
 'wv_value110': 'NULL',
 'wv_value111': 'NULL',
 'wv_value112': 'NULL',
 'wv_value113': 'NULL',
 'wv_value114': 'NULL',
 'wv_value115': 'NULL',
 'wv_value116': 'NULL',
 'wv_value117': 'NULL',
 'wv_value118': 'NULL',
 'wv_value119': 'NULL',
 'wv_value12': 'NULL',
 'wv_val

In [40]:
%%time
X_test = [[Word2Features(s, pos) for pos in range(len(s))] for s in test_data]

Wall time: 16.3 s


# Hyperparameter Tunning

In [17]:
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV



In [None]:
%%time
# define fixed parameters and parameters to search
crf_CV = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=100, 
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.05),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, 
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf_CV, params_space, 
                        cv=3, 
                        verbose=1, 
                        n_jobs=-1, 
                        n_iter=50, 
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [None]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

# Training

In [41]:
%%time
crf_final = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.01, 
    c2=0.01, 
    max_iterations=100, 
    all_possible_transitions=True
)
crf_final.fit(X_train, y_train)

Wall time: 35.4 s


In [42]:
labels = list(crf_final.classes_)
labels.remove('O')
labels

['B', 'I']

# Testing

Run on the Training set

In [43]:
y_train_pred = crf_final.predict(X_train)
metrics.flat_f1_score(y_train, y_train_pred, 
                      average='weighted', labels=labels)

0.99434308860544485

Run on the Dev set

In [44]:
y_dev_pred = crf_final.predict(X_dev)
metrics.flat_f1_score(y_dev, y_dev_pred, 
                      average='weighted', labels=labels)

0.59239858140628943

In [45]:
print(metrics.flat_classification_report(
    y_dev, y_dev_pred, labels=labels, digits=3
))

             precision    recall  f1-score   support

          B      0.801     0.519     0.630       459
          I      0.667     0.440     0.530       273

avg / total      0.751     0.489     0.592       732



In [45]:
%%time
y_test_pred = crf_final.predict(X_test)

Wall time: 510 ms


## Generate Output file

In [21]:
def generate_output(pred, outputfile):
    f = open(outputfile,'w')
    for label_sentence in pred:
        for label_word in label_sentence:
            f.write(label_word + '\n')
        f.write("\n")
    f.close()

In [46]:
generate_output(y_dev_pred, "output-dev")

In [176]:
from collections import Counter
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

Top positive:
6.567432 B        lower:twitter
3.792135 B        lower:facebook
2.826828 B        lower:pope
2.580548 B        lower:youtube
2.514104 B        suffix_3:mas
2.424522 B        lower:ipad
2.371110 B        lower:iphone
2.160594 O        next_lower:bless
2.149650 I        prev_lower:fashion
2.096999 B        lower:ipod
2.092677 B        suffix_4:alds
1.999379 B        lower:taylor
1.999376 B        prev_lower:go
1.989796 B        lower:uk
1.939972 O        lower:and
1.896921 B        lower:steve
1.896921 B        suffix_4:teve
1.877813 B        suffix_4:enny
1.870439 B        suffix_2:GP
1.849041 I        lower:day
1.838438 O        lower:prison
1.820803 O        prev_lower:gaye
1.802747 B        lower:chicago
1.797612 I        prev_lower:dj
1.791227 O        lower:free
1.762508 I        suffix_2:rk
1.753413 I        prev_lower:stylez
1.753413 I        next_lower:a.k.a.
1.740830 B        suffix_2:DA
1.725471 O        BOS


In [185]:
print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])


Top negative:
-1.371758 O        pos[+1]|pos[+2]:FW|NNP
-1.391221 O        lower:itunes
-1.391221 O        suffix_4:unes
-1.394320 O        suffix_3:lds
-1.410080 O        suffix_2:na
-1.416634 O        pos[0]:NNPS
-1.420462 O        next_lower:full
-1.429323 O        prev_lower:then
-1.430705 O        pos[0]|pos[+1]|pos[+2]:NN|,|CC
-1.434475 O        next_lower:was
-1.518657 O        istitle
-1.524179 B        pos[-1]|pos[0]|pos[+1]:CC|NNP|NNP
-1.534428 O        pos[+1]|pos[+2]:.|JJ
-1.558676 O        suffix_2:LE
-1.571161 O        pos[-1]|pos[0]|pos[+1]:.|NNS|.
-1.583378 B        prev_istitle
-1.591711 O        suffix_2:ix
-1.609285 O        next_lower:live
-1.628820 O        suffix_4:pope
-1.637103 O        suffix_3:Pad
-1.648974 O        suffix_4:eber
-1.651023 B        suffix_2:ay
-1.654816 O        next_lower:fans
-1.663851 O        suffix_3:ube
-1.717274 O        suffix_2:HD
-1.719929 O        next_lower:!!!!!!!
-1.838786 O        suffix_2:GP
-1.859041 O        suffix_3:ndo
-1.