In [1]:
import pycrfsuite
import numpy as np
from sklearn.metrics import classification_report

# Prepare data

In [2]:
# Load the tagged corpus as a list of raw text lines; each line still carries
# its trailing newline at this point.
with open("finalDataset/uriPOSTagged.tsv", 'r') as fp:
    data = list(fp)

In [3]:
# Replace every raw line, in place, by its list of tab-separated fields
# (newline characters are stripped from both ends first).
data[:] = [rawLine.strip('\n').split('\t') for rawLine in data]

In [4]:
# Group the flat list of token rows into tweets. A row holding a single empty
# field (i.e. a blank line in the file) marks the end of the current tweet.
tweets = []
currPoint = []

for token in data:
    if len(token) == 1 and token[0] == '':
        # Separator row: close the tweet being built; consecutive separators
        # are ignored because an empty tweet is never appended.
        if len(currPoint) > 0:
            tweets.append(currPoint)
            currPoint = []
    else:
        currPoint.append(token)

# Flush the last tweet when the file does not end with a blank separator
# line; without this its tokens would be silently dropped.
if len(currPoint) > 0:
    tweets.append(currPoint)
    currPoint = []

In [5]:
# Sanity check: number of tweets (token sequences) held in `tweets`.
len(tweets)

909

# Feature functions

In [6]:
def asciiPercentage(s):
    """Return the fraction (0.0 - 1.0) of characters in `s` that are ASCII.

    A character counts as ASCII when its code point is below 128.
    An empty string yields 0.0 instead of raising ZeroDivisionError.
    """
    if not s:
        return 0.0
    asciiCount = sum(1 for ch in s if ord(ch) < 128)
    # float() keeps true division on Python 2 as well as Python 3.
    return asciiCount / float(len(s))

def vowelPercentage(s):
    """Return the fraction (0.0 - 1.0) of characters in `s` that are vowels.

    Only the lowercase letters a, e, i, o, u are counted.
    An empty string yields 0.0 instead of raising ZeroDivisionError.
    """
    # NOTE(review): uppercase vowels are not counted; this matches the
    # original behaviour -- confirm whether that is intended.
    if not s:
        return 0.0
    vowels = "aeiou"
    vowelCount = sum(1 for ch in s if ch in vowels)
    # float() keeps true division on Python 2 as well as Python 3.
    return vowelCount / float(len(s))

# Feature extractor

In [7]:
# Characters kept by _cleanWord; digits, punctuation and non-ASCII characters
# are dropped.
_ASCII_LETTERS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'


def _cleanWord(word):
    """Return `word` lower-cased, with every non-ASCII-letter character removed."""
    return ''.join([ch for ch in word if ch in _ASCII_LETTERS]).lower()


def _addNeighbourFeatures(features, prefix, word):
    """Store the raw and cleaned forms of a neighbouring word under `prefix`."""
    cleaned = _cleanWord(word)
    features[prefix + ':word'] = word
    features[prefix + ':wordClean'] = cleaned
    # Same value as wordClean: the original lower-cased the cleaned word twice.
    features[prefix + ':normalizedWord'] = cleaned


def word2features(sent, i):
    """Build the feature dictionary for the token at position `i` of `sent`.

    Parameters
    ----------
    sent : sequence of token rows; only field 0 (the surface word) of each
        row is read here.
    i : index of the token to describe.

    Returns
    -------
    dict mapping feature name to value:
      * 'word', 'wordClean', 'normalizedWord' -- surface form and its
        letters-only lower-cased form (the last two are identical),
      * 'isTitle', 'wordLength', 'anyCap', 'allCap', 'hasSpecial', 'asciiPer'
        -- shape features of the word,
      * 'prefix1'..'prefix5', 'suffix1'..'suffix5' -- leading/trailing slices,
      * '-1:*' / '+1:*' -- the same word forms for the previous / next token
        (empty strings at the sentence boundary),
      * 'BOS' / 'EOS' -- True when there is no previous / next token.

    An empty word no longer raises: its 'asciiPer' is 0.0.
    """
    word = sent[i][0]
    wordClean = _cleanWord(word)

    # NOTE(review): the original also computed hashtag ('#') and mention ('@')
    # flags but never added them to the feature set; they are left out here so
    # the features stay exactly the same.
    features = {
        'word': word,
        'wordClean': wordClean,
        'normalizedWord': wordClean,
        'isTitle': word.istitle(),
        'wordLength': len(word),
        'anyCap': any(ch.isupper() for ch in word),
        'allCap': word.isupper(),
        # Code points 33..64: ASCII punctuation, digits and '@'.
        'hasSpecial': any(32 < ord(ch) < 65 for ch in word),
        # Guard the empty token, for which the ratio is undefined.
        'asciiPer': asciiPercentage(word) if word else 0.0,
    }

    for n in range(1, 6):
        features['suffix%d' % n] = word[-n:]
        features['prefix%d' % n] = word[:n]

    # Previous token (or beginning-of-sentence marker).
    atStart = not (i > 0)
    _addNeighbourFeatures(features, '-1', '' if atStart else sent[i - 1][0])
    features['BOS'] = atStart

    # Next token (or end-of-sentence marker).
    atEnd = not (i < len(sent) - 1)
    _addNeighbourFeatures(features, '+1', '' if atEnd else sent[i + 1][0])
    features['EOS'] = atEnd

    return features


# Extracting features from Sequences

In [8]:
def sent2features(sent):
    """Return one feature dictionary per token of `sent`, in token order."""
    return [word2features(sent, position) for position in range(len(sent))]

def sent2labels(sent):
    """Return the label of every token row in `sent`, i.e. its field at index 2."""
    return [row[2] for row in sent]

def sent2tokens(sent):
    """Return the surface word of every token row in `sent`, i.e. its field at index 0."""
    return [row[0] for row in sent]

# Training and Testing (5-fold cross-validation)

In [9]:
# Params; obtained from Grid Search

c1 = 0.0001  # coefficient for the L1 penalty, passed to the CRF trainer below
c2 = 0.1  # coefficient for the L2 penalty, passed to the CRF trainer below

In [10]:
k = 5

# Sentences per fold. Floor division keeps `chunk` an integer on both
# Python 2 and Python 3, so it stays usable as a slice bound.
# NOTE: when len(tweets) is not a multiple of k, the last len(tweets) % k
# tweets never fall into a test fold; they are part of every training set.
chunk = len(tweets) // k
results = []  # kept for compatibility; nothing in this cell fills it

# Token-level labels pooled over all folds, used for the final report.
allTestPredictions = []
allTestGroundTruth = []

for fold in range(k):

    print("cross validation %s for c1 : %s c2 : %s" % (fold, c1, c2))

    # Fold `fold` is the test set; everything outside it is the training set.
    foldStart = fold * chunk
    foldEnd = foldStart + chunk
    test_sents = tweets[foldStart:foldEnd]
    train_sents = tweets[:foldStart] + tweets[foldEnd:]

    print("--> Extracting Train Set ...")
    X_train = [sent2features(s) for s in train_sents]
    y_train = [sent2labels(s) for s in train_sents]

    print("--> Extracting Test Set ...")
    X_test = [sent2features(s) for s in test_sents]
    y_test = [sent2labels(s) for s in test_sents]

    print("--> Loading CRF module ...")
    trainer = pycrfsuite.Trainer(verbose=False)

    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)

    trainer.set_params({
        'c1': c1,   # coefficient for L1 penalty
        'c2': c2,  # coefficient for L2 penalty
        'max_iterations': 1000,  # stop earlier

        # include transitions that are possible, but not observed
        'feature.possible_transitions': True,
        'feature.possible_states' : True
    })

    # One model file per fold, written relative to the working directory.
    modelPath = 'pos_uri_' + str(fold)

    print("Training ...")
    trainer.train(modelPath)

    print("Testing ...")
    tagger = pycrfsuite.Tagger()
    tagger.open(modelPath)
    try:
        y_pred = [tagger.tag(xseq) for xseq in X_test]
    finally:
        # Release the model handle instead of leaving one open per fold.
        tagger.close()

    # CRF based classification: flatten the per-sentence label sequences of
    # this fold and add them to the pooled lists. Dedicated loop names are
    # used so the fold index is never overwritten.
    predictedLabels = [label for seq in y_pred for label in seq]
    correctLabels = [label for seq in y_test for label in seq]
    allTestPredictions.extend(predictedLabels)
    allTestGroundTruth.extend(correctLabels)

# Single report over the predictions pooled from all k test folds.
print(" CRF Classification")
print('c1 : %s c2 : %s' % (c1, c2))
print(classification_report(allTestGroundTruth, allTestPredictions, digits=4))

cross validation 0 for c1 : 0.0001 c2 : 0.1
--> Extracting Train Set ...
--> Extracting Test Set ...
--> Loading CRF module ...
Training ...
Testing ...
cross validation 1 for c1 : 0.0001 c2 : 0.1
--> Extracting Train Set ...
--> Extracting Test Set ...
--> Loading CRF module ...
Training ...
Testing ...
cross validation 2 for c1 : 0.0001 c2 : 0.1
--> Extracting Train Set ...
--> Extracting Test Set ...
--> Loading CRF module ...
Training ...
Testing ...
cross validation 3 for c1 : 0.0001 c2 : 0.1
--> Extracting Train Set ...
--> Extracting Test Set ...
--> Loading CRF module ...
Training ...
Testing ...
cross validation 4 for c1 : 0.0001 c2 : 0.1
--> Extracting Train Set ...
--> Extracting Test Set ...
--> Loading CRF module ...
Training ...
Testing ...
 CRF Classification
c1 : 0.0001 c2 : 0.1
             precision    recall  f1-score   support

        ADJ     0.6875    0.6497    0.6681       982
        ADP     0.9147    0.9197    0.9172      1818
        ADV     0.8130    0.6924  