In [15]:
import spacy
from sklearn.base import TransformerMixin

# Load the English spaCy pipeline once and share it; BagOfWords.transform
# below reads this module-level `nlp` object to split documents into tokens.
# NOTE(review): 'en' is a shortcut model name -- presumably only resolvable on
# installs that have that link set up; confirm for the target environment.
nlp = spacy.load('en')


class BagOfWords(TransformerMixin):
    """Convert raw text documents into binary bag-of-words dictionaries.

    Each document becomes a dict that maps every distinct, non-whitespace
    token text to ``True``. Token texts are used exactly as produced by the
    tokenizer (no lower-casing), so ``"Python"`` and ``"python"`` are
    separate keys.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Tokenize each document and record which tokens are present.

        Parameters
        ----------
        X : iterable of str
            The documents to convert.

        Returns
        -------
        list of dict
            One ``{token_text: True}`` dict per document, in input order.
        """
        results = []
        for document in X:
            # Only the token boundaries are needed, so run the tokenizer on
            # its own instead of the full pipeline. The previous
            # `nlp(document, tag=False, parse=False, entity=False)` form
            # relies on keyword arguments that newer spaCy releases reject.
            tokens = nlp.tokenizer(document)
            # Skip tokens that are empty or pure whitespace.
            results.append({token.text: True
                            for token in tokens
                            if token.text.strip()})
        return results

In [22]:
from sklearn.feature_extraction import DictVectorizer

In [23]:
from sklearn.naive_bayes import BernoulliNB

In [24]:
import os
# The tweets file and the labels file live in the same folder under the
# user's home directory; build both paths from their base names.
input_filename, labels_filename = (
    os.path.join(os.path.expanduser("~"), "data/datasets", "twitter", basename)
    for basename in ("python_tweets.json", "python_classes.json")
)

In [31]:
import json

# The tweets file holds one JSON object per line; blank lines are skipped and
# only the 'text' field of each record is kept.
# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, so non-ASCII tweet text is read the same way everywhere.
tweets = []
with open(input_filename, encoding='utf-8') as inf:
    for line in inf:
        if not line.strip():
            continue
        tweets.append(json.loads(line)['text'])

# The labels file is read as a single JSON document.
with open(labels_filename, encoding='utf-8') as inf:
    labels = json.load(inf)

# Ensure only classified tweets are loaded
# (assumes the labels line up, in order, with the first len(labels) tweets
# -- TODO confirm against how the labels file was produced).
tweets = tweets[:len(labels)]
assert len(tweets) == len(labels), (
    "found {} tweets for {} labels".format(len(tweets), len(labels)))

In [32]:
from sklearn.pipeline import Pipeline

# Chain the three stages: token-presence dicts -> vectorized features ->
# Bernoulli naive Bayes classifier.
pipeline = Pipeline(
    steps=[
        ('bag-of-words', BagOfWords()),
        ('vectorizer', DictVectorizer()),
        ('naive-bayes', BernoulliNB()),
    ]
)

In [33]:
# cross_val_score is provided by sklearn.model_selection; the older
# sklearn.cross_validation module is no longer shipped with scikit-learn, so
# it is only used as a fallback on installations that predate model_selection.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
import numpy as np

# Cross-validate the whole pipeline on the labelled tweets, scoring by F1.
scores = cross_val_score(pipeline, tweets, labels, scoring='f1')
# We then print out the average of the scores:
print("Score: {:.3f}".format(np.mean(scores)))

Score: 0.684


In [35]:
# Fit the pipeline on every labelled tweet (the cross-validation above only
# scored it) and keep the result as `model` for the inspection cells below.
model = pipeline.fit(tweets, labels)

In [38]:
# Pull the classifier step out of the fitted pipeline by its step name.
nb = model.named_steps['naive-bayes']
# Despite the variable name, this holds the classifier's `feature_log_prob_`
# values; they are passed through np.exp when printed further down.
feature_probabilities = nb.feature_log_prob_

In [39]:
# Negating before sorting orders the indices from highest to lowest value;
# keep the first 50. Row 1 is presumably the positive class -- confirm
# against nb.classes_.
top_features = (-nb.feature_log_prob_[1]).argsort()[:50]

In [40]:
# Pull the vectorizer step out of the fitted pipeline; its `feature_names_`
# is used below to turn feature indices back into token text.
dv = model.named_steps['vectorizer']

In [41]:
# Print rank, token and probability for each selected feature; np.exp undoes
# the log applied to the stored values.
for rank, column in enumerate(top_features):
    print(rank, dv.feature_names_[column], np.exp(feature_probabilities[1, column]))

0 : 0.53125
1 # 0.51875
2 Python 0.4875
3 python 0.40625
4 RT 0.26875
5 in 0.21875
6 - 0.2
7 to 0.19375
8 , 0.1875
9 for 0.175
10 and 0.1375
11 . 0.125
12 ? 0.11875
13 the 0.10625
14 ) 0.10625
15 ( 0.10625
16 of 0.1
17 with 0.1
18 I 0.08125
19 a 0.08125
20 A 0.06875
21 via 0.06875
22 jobs 0.0625
23 ! 0.05625
24 an 0.05625
25 from 0.05
26 How 0.05
27 Data 0.05
28 this 0.05
29 Developer 0.05
30 data 0.05
31 current 0.04375
32 installing 0.04375
33 Top 0.04375
34 by 0.04375
35 library 0.04375
36 status 0.04375
37 30 0.0375
38 And 0.0375
39 C++ 0.0375
40 Tech 0.0375
41 Job 0.0375
42 or 0.0375
43 looking 0.0375
44 3 0.0375
45 [ 0.0375
46 ] 0.0375
47 @shiftkey 0.0375
48 Django 0.0375
49 Engineer 0.0375
