In [67]:
from csv import DictReader, DictWriter

import numpy as np
from numpy import array
from random import shuffle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier

import nltk

kTARGET_FIELD = 'spoiler'
kTEXT_FIELD = 'sentence'

def f(examples):
    ex = []
    for sent in examples:
        newSent = nltk.pos_tag(sent.split())
        ex.append(" ".join([i[0] + ":" + i[1] for i in newSent]))
#         ex.append(" ".join([i[1] for i in newSent]))
    print(ex[0])
    return ex

def get_lists(examples):
    base_examples = list(examples)
    tags_examples = [] # For nltk based features
    for sent in base_examples:
        newSent = list(nltk.pos_tag(sent.split()))
        tags_examples.append(newSent)
    return base_examples, tags_examples
    
def get_pos_only(examples):
    pos_examples = []
    for sent in examples:
        newSent = " ".join(sent[x][1] for x in range(len(sent)))
        #print(newSent)
        pos_examples.append(newSent)
    return pos_examples

def get_words_pos(examples):
    words_pos = []
    for sent in examples:
        newSent = " ".join("%s(%s)" % (sent[x][0].replace(".",""), sent[x][1]) for x in range(len(sent)))
        #print(newSent)
        words_pos.append(newSent)
    return words_pos
        
        
    

class Featurizer:
    def __init__(self):
        self.vectorizer = CountVectorizer(ngram_range=(1,3), token_pattern = "[a-zA-Z]+(\([a-zA-Z]+\))?")

    def train_feature(self, examples):
        base_examples, tags_examples = get_lists(examples)
            
        # Fit examples containing only pos tags
        pos_only = get_pos_only(tags_examples)
        #self.vectorizer.fit(pos_only)
        
        #Fit examples containing word pos-tag pairs
        words_pos = get_words_pos(tags_examples)
        #self.vectorizer.fit(words_pos)
        ex = [" ".join([base_examples[x],pos_only[x],words_pos[x]]) for x in range(len(base_examples))]
        
        #print(ex)
        
        return self.vectorizer.fit_transform(ex)

    def test_feature(self, examples):
        base_examples, tags_examples = get_lists(examples)
        pos_only = get_pos_only(tags_examples)
        words_pos = get_words_pos(tags_examples)
        ex = [" ".join([base_examples[x],pos_only[x],words_pos[x]]) for x in range(len(base_examples))]
        return self.vectorizer.transform(ex)

    def show_top10(self, classifier, categories):
        feature_names = np.asarray(self.vectorizer.get_feature_names())
        if len(categories) == 2:
            top10 = np.argsort(classifier.coef_[0])[-10:]
            bottom10 = np.argsort(classifier.coef_[0])[:10]
            print("Pos: %s" % " - ".join(feature_names[top10]))
            print("Neg: %s" % " - ".join(feature_names[bottom10]))
        else:
            for i, category in enumerate(categories):
                top10 = np.argsort(classifier.coef_[i])[-10:]
                print("%s: %s" % (category, " ".join(feature_names[top10])))


# Cast to list to keep it all in memory
trainset = list(DictReader(open("../data/spoilers/train.csv", 'r')))
test = list(DictReader(open("../data/spoilers/test.csv", 'r')))

# Split training data into train validate and holdout sets
# n = len(trainset)
# shuffle(trainset)
# train = trainset[:8*n//10]
# validate = trainset[6*n//10:8*n//10]
# h_out = trainset[8*n//10:]
# train = trainset[:100]
train = trainset

feat = Featurizer()

labels = []
for line in train:
    if not line[kTARGET_FIELD] in labels:
        labels.append(line[kTARGET_FIELD])

print("Label set: %s" % str(labels))
x_train = feat.train_feature(x[kTEXT_FIELD] for x in train)
x_test = feat.test_feature(x[kTEXT_FIELD] for x in test)
# x_validate = feat.test_feature(x[kTEXT_FIELD] for x in validate)
# x_h_out = feat.test_feature(x[kTEXT_FIELD] for x in h_out)

y_train = array(list(labels.index(x[kTARGET_FIELD])
                     for x in train))

print(len(train), len(y_train))
print(set(y_train))


Label set: ['False', 'True']
(14784, 14784)
set([0, 1])


In [69]:
print(x_test.shape)

(1477, 9166)


In [83]:
# Train classifier
lr = SGDClassifier(loss='log', penalty='l2', shuffle=True)
lr.fit(x_train, y_train)

feat.show_top10(lr, labels)

predictions = lr.predict(x_test)
o = DictWriter(open("predictions.csv", 'w'), ["Id", "spoiler"])
o.writeheader()
for ii, pp in zip([x['Id'] for x in test], predictions):
    d = {'Id': ii, 'spoiler': labels[pp]}
    o.writerow(d)

Pos: (cc) - (vbz) -  (nn) (vbd) -  (nn) - (nn)  - (nnp) -  (nnp) (nn) -   (jj) -    - (rp)
Neg: (nnp) (nnp) -   (nnp) - (nn) -   - (jj) -  (nnp) (nnp) - (vbp) - (jjr) - (nn) (nnp) - (jj) (nns)


In [70]:
pred_validate = lr.predict(x_validate)
y_validate = []
for i in validate:
    if(i["spoiler"] == "True"): y_validate.append(1)
    else: y_validate.append(0)
# print(pred_validate)
# print(y_validate)
count = 0
for i in range(len(validate)):
    if(pred_validate[i] == y_validate[i]): 
        count += 1       
print(float(count)/len(validate))

ValueError: X has 8630 features per sample; expecting 9166

In [56]:
pred_h_out = lr.predict(x_validate)
y_h_out = []
for i in h_out:
    if(i["spoiler"] == "True"): y_h_out.append(1)
    else: y_h_out.append(0)
        
count = 0
for i in range(len(h_out)):
    if(pred_h_out[i] == y_h_out[i]): 
        count += 1       
print(float(count)/len(h_out))

0.523165370308
