In [118]:
import string
def contains_letter(w):
    for l in w:
        if l in l.lower() in 'abcdefghijklmnopqrstuvwxyz':
            return True
    return False
#sentence: [w1, w2, ...], index: the index of the word
def features(sentence, index):
    features = {}
    word = sentence[index]
    prevw = '' if index == 0 else sentence[index - 1]
    nextw = '' if index == len(sentence) - 1 else sentence[index + 1]
    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': word[0].upper() == word[0],
#         'is_all_caps': word.upper() == word,
#         'is_all_lower': word.lower() == word,
        'prefix-1': word[0],
        'prefix-2': word[:2],# some specific signs
        'suffix-1': word[-1],# y
        'suffix-2': word[-2:],#ed, ly
        'suffix-3': word[-3:],#ing
        'suffix-4': word[-4:],
        'digit_start': word[0] in '0123456789',
        'vowel_last': word[-1] in 'aeiouy',
        'vowels': ''.join([i for i in word if i in 'aeiou']),
        'is_punctuation': word in string.punctuation,
#         'prev_word': prevw,
#         'next_word': nextw,
#         'prev-suffix-2': prevw[-2:],
        'prev-suffix-3': prevw[-3:],
#         'next-suffix-2': nextw[-2:],
        'next-suffix-3': nextw[-3:],        
        'contains_letter': contains_letter(word),
        'is_numeric': word.isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }
 
import pprint 
pprint.pprint(features(['This', 'is', 'a', 'sentence', '234,53'], 3))
 

{'capitals_inside': False,
 'contains_letter': True,
 'digit_start': False,
 'is_capitalized': False,
 'is_first': False,
 'is_last': False,
 'is_numeric': False,
 'is_punctuation': False,
 'next-suffix-3': ',53',
 'prefix-1': 's',
 'prefix-2': 'se',
 'prev-suffix-3': 'a',
 'suffix-1': 'e',
 'suffix-2': 'ce',
 'suffix-3': 'nce',
 'suffix-4': 'ence',
 'vowel_last': True,
 'vowels': 'eee',
 'word': 'sentence'}


In [119]:
def remove_tags(tagged_sentence):
    return [w for t, w in tagged_sentence] 

In [120]:
def transform_to_sentences(filename):
    raw = open(filename)
    tagget_sentences = []
    sentence = []
    for line in raw:
        if(line == '\n' or line == ''):
            tagget_sentences.append(sentence)
            sentence = []
            continue
        sentence.append(line.strip().split('\t'))
    return tagget_sentences

In [121]:
tags = []
raw = open('pos_train.conll')
sentence = []
for line in raw:
    if(line == '\n' or line.strip() == ''):
        continue
    x, y = line.strip().split('\t')
    if x not in tags:
        tags.append(x)
        print(line)
print(tags)
    

NNP	Antick

NN	post

:	:

CD	2010

(	(

)	)

IN	by

URL	http://bit.ly/as8fvc

RT	RT

USR	@DIVINEDYNASTY

HT	#THEREDROOM

.	.

WRB	when

PRP	i

VBP	compliment

MD	wo

RB	n't

VB	believe

UH	&lt;

VBG	doin

JJ	good

,	,

CC	&

PRP$	my

DT	a

JJS	Most

NNS	things

VBZ	seems

RBR	more

VBN	Thrilled

VBD	increased

TO	to

RP	out

EX	there

POS	's

WP	what

WDT	that

FW	Etc

JJR	darker

''	"

NNPS	kids

SYM	&lt;

RBS	most

VPP	please

O	"..

LS	1

TD	a

['NNP', 'NN', ':', 'CD', '(', ')', 'IN', 'URL', 'RT', 'USR', 'HT', '.', 'WRB', 'PRP', 'VBP', 'MD', 'RB', 'VB', 'UH', 'VBG', 'JJ', ',', 'CC', 'PRP$', 'DT', 'JJS', 'NNS', 'VBZ', 'RBR', 'VBN', 'VBD', 'TO', 'RP', 'EX', 'POS', 'WP', 'WDT', 'FW', 'JJR', "''", 'NNPS', 'SYM', 'RBS', 'VPP', 'O', 'LS', 'TD']


In [122]:
test_sentences = transform_to_sentences('pos_test.conll')
train_sentences = transform_to_sentences('pos_train.conll')
print(test_sentences[0])
print(train_sentences[0])

[['CD', '2'], ['NNS', 'days'], ['IN', 'til'], ['HT', '#houseofmirrors'], ['HT', '#houseofmirrors'], ['HT', '#houseofmirrors'], ['HT', '#houseofmirrors'], ['HT', '#houseofmirrors'], ['HT', '#houseofmirrors'], ['HT', '#houseofmirrors'], ['HT', '#houseofmirrors'], ['.', '!']]
[['NNP', 'Antick'], ['NNP', 'Musings'], ['NN', 'post'], [':', ':'], ['NNP', 'Book-A-Day'], ['CD', '2010'], ['NN', '#'], ['CD', '243'], ['(', '('], ['CD', '10/4'], [')', ')'], [':', '--'], ['NNP', 'Gray'], ['NNP', 'Horses'], ['IN', 'by'], ['NNP', 'Hope'], ['NNP', 'Larson'], ['URL', 'http://bit.ly/as8fvc']]


In [123]:
def transform_to_dataset(tagged_sentences):
    X, y = [], []
 
    for tagged in tagged_sentences:
        for index in range(len(tagged)):
            X.append(features(remove_tags(tagged), index))
            y.append(tagged[index][0])
 
    return X, y

X, y = transform_to_dataset(train_sentences)


In [125]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
 
clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', DecisionTreeClassifier(criterion='entropy'))
])
 
clf.fit(X, y)
 
X_test, y_test = transform_to_dataset(test_sentences)

print("Accuracy:", clf.score(X_test, y_test))


# Accuracy: 0.814005352364 without prev and next

Accuracy: 0.814005352364


In [126]:
print(clf)

Pipeline(memory=None,
     steps=[('vectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('classifier', DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])


In [127]:
from sklearn.metrics import classification_report
print(classification_report(y_test, clf.predict(X_test)))

             precision    recall  f1-score   support

         ''       1.00      1.00      1.00        10
          (       0.67      1.00      0.80         4
          )       1.00      1.00      1.00         6
          ,       1.00      1.00      1.00        44
          .       0.97      0.99      0.98       125
          :       0.94      0.94      0.94        81
         CC       0.90      0.98      0.93        44
         CD       0.74      0.94      0.83        36
         DT       0.92      0.95      0.94       115
         EX       1.00      0.25      0.40         4
         HT       1.00      1.00      1.00        26
         IN       0.93      0.90      0.91       168
         JJ       0.73      0.50      0.60       119
        JJR       0.75      0.75      0.75         4
        JJS       1.00      0.33      0.50         3
         MD       1.00      0.96      0.98        27
         NN       0.70      0.74      0.72       287
        NNP       0.59      0.69      0.63   

  'precision', 'predicted', average, warn_for)


In [128]:
y_pred = clf.predict(X_test)

In [129]:
for m, n, k in zip([x['word'] for x in X_test], y_pred, y_test):
    if (n != k):
        print(m,'f:'+ n,'t:'+ k)

case f:IN t:NN
JV f:NNP t:NN
leads f:NNS t:VBZ
4-1 f:CD t:JJ
scores f:NNS t:VBZ
New f:NNP t:JJ
TheDeAndreWay f:URL t:NNP
The f:NNP t:DT
souljaboytellem-iga.ning.com f:NN t:URL
born f:NN t:VBN
RETWEET f:NNP t:VB
Funniest f:NN t:JJS
heard f:NN t:VBD
Wingo f:UH t:NNP
wood f:NN t:NNP
scared f:VBD t:VBN
came f:NN t:VBD
pandora f:NN t:NNP
tho f:IN t:RB
so f:CC t:IN
yo f:UH t:PRP$
tweet f:VB t:NN
*kisses f:NNP t:VBZ
ass**sneezes f:VBZ t:NN
after* f:NN t:RB
then f:RB t:IN
hp6 f:NN t:NNP
So f:RB t:UH
chi f:VBG t:NNP
whts f:NNP t:WP
Fuck f:JJ t:UH
FRIDAY f:VB t:NNP
wet f:VB t:JJ
cold f:VBD t:JJ
High f:NNP t:JJ
low f:IN t:JJ
Goin f:NNP t:VBG
BEST f:NNP t:JJS
friends f:NNS t:NN
party f:NNP t:NN
As f:UH t:IN
see f:VBP t:VB
taylor f:NN t:NNP
~HAPPY f:NNP t:JJ
TAYLOR f:NN t:NNP
LUVZ f:NNP t:VBP
Hey f:DT t:UH
both f:NN t:DT
drafted f:VBD t:VBN
stared f:VBN t:VBD
live f:VB t:VBP
awaiting f:JJ t:VBG
become f:IN t:VBP
Amber f:VBD t:NNP
Le f:PRP t:NNP
LFW f:NN t:NNP
Be f:VB t:VBP
15th f:CD t:JJ
get f:VB t