In [86]:
import gensim
from configparser import ConfigParser
from pymongo import MongoClient
import re
import string
import warnings
import os
from tabulate import tabulate

with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    import spacy
    nlp = spacy.load('en')
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

from collections import defaultdict, Counter

os.chdir('/Users/tmoeller/ds/twitter-project/partisan-tweets')
import src.data.aws_ec2_functions as aws

In [64]:
# Make sure AWS Ec2 Instance is running and get public IP address
instance = aws.fetch_instances()[0]

if aws.instance_state(instance) != 'running':
    print('Starting {} instance now'.format(instance.public_ip_address))
    aws.start_instance(instance, safety=False)

In [20]:
punctuations = string.punctuation
config = ConfigParser()
config.read('config.ini')

['config.ini']

In [65]:
client = MongoClient("mongodb://{}:{}@{}/{}".format(
                        config.get('MongoConfig', 'user'),
                        config.get('MongoConfig', 'password'),
                        instance.public_ip_address,
                        config.get('MongoConfig', 'db'),
                        int(config.get('MongoConfig', 'port'))))

db = client.twitter_db

In [136]:
def fetch_train_test(train_group, test_group):
    cursor = db.train_test_dict.find({'group': train_group}, { 'screen_names': 1, '_id': 0 })
    train_group = [doc for doc in cursor]

    cursor = db.train_test_dict.find({'group': test_group}, { 'screen_names': 1, '_id': 0 })
    test_group = [doc for doc in cursor]

    cursor = db.fav_tweets_dict.find( { "name": { "$in": train_group[0]['screen_names'] } } )
    train = [doc for doc in cursor]

    cursor = db.legislator_tweets_dict.find( { "name": { "$in": test_group[0]['screen_names'] } } )
    test = [doc for doc in cursor]

    return train, test


def unlist_tweets(list_tweet_dicts):
    tweets_labels = []

    for dict in list_tweet_dicts:
        for tweet in dict['tweets']:
            tweets_labels.append([tweet, dict['party']])

    return tweets_labels


def clean_tweet(tweet):
    """
    Function to remove urls, numbers and punctuation, and make lowercase
    """
    no_url = re.sub(r'http\S+', '', tweet)
    clean = re.sub(r'[^\w\s]', '', no_url)

    result = ''.join([str(i).replace('\n', ' ').lower() for i in clean if not i.isdigit()])

    return result


def spacy_tokenizer(tweet):
    """
    Utility function to remove stopwords, ignore pronouns and tokenize words before vectorizing
    """
    doc = nlp(tweet)
    tokens = [token.orth_ for token in doc if not token.is_stop]

    return tokens

In [90]:
train_a, test_b = fetch_train_test('train_a', 'test_b')

train_data = unlist_tweets(train_a)
test_data = unlist_tweets(test_b)

In [91]:
print(len(train_data))
print(len(test_data))

143531
13610


In [92]:
clean_train = [clean_tweet(tweet[0]) for tweet in train_data]
clean_test = [clean_tweet(tweet[0]) for tweet in test_data]

y_train = [1 if tweet[1]=='R' else 0 for tweet in train_data]
y_test = [1 if tweet[1]=='R' else 0 for tweet in test_data]

In [61]:
Counter(y_train)

Counter({0: 64598, 1: 78933})

In [25]:
tokenized_train = [spacy_tokenizer(tweet) for tweet in clean_train]
tokenized_test = [spacy_tokenizer(tweet) for tweet in clean_test]

In [121]:
np.shape(np.array(tokenized_train).reshape(-1,1))

(143531, 1)

In [128]:
model = gensim.models.Word2Vec(tokenized_train, size=100, window=5, min_count=5, workers=2)
w2v = {w: vec for w, vec in zip(model.wv.index2word, model.wv.syn0)}

  


In [139]:
tfidf = TfidfVectorizer(analyzer=lambda x: x)
tfidf.fit(np.array(tokenized_train))

TfidfVectorizer(analyzer=<function <lambda> at 0x125c39950>, binary=False,
        decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), norm='l2',
        preprocessor=None, smooth_idf=True, stop_words=None,
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [140]:
max_idf = max(tfidf.idf_)

In [142]:
max_idf

12.18116610527752

In [144]:
len(tfidf.vocabulary_.items())

110672

In [145]:
[(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()][:10]

[('carter', 8.086821543055418),
 ('film', 7.458212883633045),
 ('old', 6.361083174925158),
 ('crime', 6.990990897349186),
 ('drama', 8.763439421664154),
 ('starring', 9.783270832479149),
 ('michael', 7.0691783169209765),
 ('caine', 12.18116610527752),
 ('man', 5.165004952451596),
 ('returned', 8.715430202477794)]

In [152]:
w2v['crime']

array([ 0.65757465, -0.43149385, -0.173158  , -0.05396891,  0.9756762 ,
       -0.56764764,  0.34622863, -0.9952162 ,  0.12716042, -0.6889521 ,
       -0.36966366,  0.23572905, -0.1086284 , -0.02477333,  0.8455168 ,
        0.5322241 ,  0.09200475, -0.14125866, -0.5532329 , -0.21689826,
       -0.26331687, -0.20779441,  0.70469165,  0.43020454,  0.6197007 ,
       -0.7655905 , -0.6719425 ,  0.3089493 ,  0.06360037,  0.4469036 ,
       -0.2455574 , -0.19447382, -0.02673647, -0.17551348,  0.8663641 ,
        0.5635876 ,  0.09866684, -0.41125426,  0.71162885, -0.5757415 ,
       -0.37985703,  0.37381154, -0.26874182, -0.75298285, -0.45720792,
       -0.24957158, -0.5306569 ,  0.15163025,  0.14710161, -0.12933388,
       -0.17411111,  0.36777452,  0.8360838 , -0.50275666,  0.3028484 ,
        0.2657561 ,  0.23719683,  1.0456861 , -0.4118023 , -0.5747189 ,
       -1.2507479 , -0.6502701 ,  0.12593256, -0.45259246, -0.7551309 ,
       -0.36980247,  0.67286927, -0.93774974, -0.08849384, -0.53

In [161]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(word2vec.values())

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])


class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(word2vec.values())

    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [156]:
test = tokenized_train[0]
print(test, '\n')

tf_idf_test = TfidfEmbeddingVectorizer(w2v)
test_fit = tf_idf_test.fit(test)
test_trans = tf_idf_test.transform(test)
print(test_trans)


['carter', 'film', 'old', 'crime', 'drama', 'starring', 'michael', 'caine', 'man', 'returned', 'home', 'investigate'] 

[[ 0.4478975  -0.8819539   0.03672072 ...  1.2280468  -0.3594811
   0.8130759 ]
 [ 0.01542316 -0.22520828  0.03162662 ...  2.3986938  -1.664752
   0.13381721]
 [ 0.7504242  -0.8463364   0.08991772 ...  1.9839908  -0.37392592
   0.17811705]
 ...
 [ 0.67234826 -0.79537976  0.15654385 ...  1.470461   -0.34150916
   0.6871164 ]
 [ 0.4761072  -0.5334214  -0.00780101 ...  2.139413   -1.1525741
   0.8540492 ]
 [ 0.70783883 -0.65458965 -0.31876636 ...  1.3232678  -0.19453858
   0.64163744]]


In [158]:
tf_idf_test2 = TfidfEmbeddingVectorizer(w2v)
embeddings = tf_idf_test2.fit(test).transform(test)

array([[ 0.4478975 , -0.8819539 ,  0.03672072, ...,  1.2280468 ,
        -0.3594811 ,  0.8130759 ],
       [ 0.01542316, -0.22520828,  0.03162662, ...,  2.3986938 ,
        -1.664752  ,  0.13381721],
       [ 0.7504242 , -0.8463364 ,  0.08991772, ...,  1.9839908 ,
        -0.37392592,  0.17811705],
       ...,
       [ 0.67234826, -0.79537976,  0.15654385, ...,  1.470461  ,
        -0.34150916,  0.6871164 ],
       [ 0.4761072 , -0.5334214 , -0.00780101, ...,  2.139413  ,
        -1.1525741 ,  0.8540492 ],
       [ 0.70783883, -0.65458965, -0.31876636, ...,  1.3232678 ,
        -0.19453858,  0.64163744]], dtype=float32)

In [165]:
cross_val_score(ExtraTreesClassifier(n_estimators=200), 
                test_trans, 
                np.array(y_train), 
                cv=3, scoring='roc_auc')

ValueError: Found input variables with inconsistent numbers of samples: [12, 143531]

In [167]:
np.shape(test_trans)

(12, 100)

In [168]:
np.shape(np.array(y_train))

(143531,)

In [125]:
multi_nb = Pipeline([("count_vectorizer", CountVectorizer(analyzer=lambda x: x)), 
                     ("multinomial nb", MultinomialNB())])
gaus_nb_tfidf = Pipeline([("tfidf_vectorizer", TfidfVectorizer(analyzer=lambda x: x)), 
                          ("gaussian nb", GaussianNB())])

In [169]:
whos

Variable                   Type                          Data/Info
------------------------------------------------------------------
BernoulliNB                ABCMeta                       <class 'sklearn.naive_bayes.BernoulliNB'>
ConfigParser               ABCMeta                       <class 'configparser.ConfigParser'>
CountVectorizer            type                          <class 'sklearn.feature_e<...>on.text.CountVectorizer'>
Counter                    type                          <class 'collections.Counter'>
ExtraTreesClassifier       ABCMeta                       <class 'sklearn.ensemble.<...>st.ExtraTreesClassifier'>
GaussianNB                 ABCMeta                       <class 'sklearn.naive_bayes.GaussianNB'>
MeanEmbeddingVectorizer    type                          <class '__main__.MeanEmbeddingVectorizer'>
MongoClient                type                          <class 'pymongo.mongo_client.MongoClient'>
MultinomialNB              ABCMeta                       <class

In [162]:
etree_w2v = Pipeline([("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)), 
                        ("extra trees", ExtraTreesClassifier(n_estimators=200))])
etree_w2v_tfidf = Pipeline([("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)), 
                        ("extra trees", ExtraTreesClassifier(n_estimators=200))])

In [None]:
all_models = [
    ("multi_nb", multi_nb),
    ("w2v", etree_w2v),
    ("w2v_tfidf", etree_w2v_tfidf)
]

unsorted_scores = []
for name, model in all_models:
    name_score = (name, cross_val_score(model, 
                                        np.array(tokenized_train), 
                                        np.array(y_train), 
                                        cv=3, scoring='roc_auc').mean())
    print(name_score)
    unsorted_scores.append(name_score)

scores = sorted(unsorted_scores, key=lambda x: -x[1])

In [None]:
print (tabulate(scores, floatfmt=".4f", headers=("model", 'score')))

## Test models on Legislator data

In [73]:
cursor = db.legislator_tweets_dict.find( {} )
legislators = [doc for doc in cursor]

In [74]:
data = unlist_tweets(legislators)

In [75]:
len(data)

29891

In [82]:
clean = [clean_tweet(tweet[0]) for tweet in data]
y = [1 if tweet[1]=='R' else 0 for tweet in data]

In [77]:
print(len(clean))
clean[:10]

29891


['talofa american samoas cadca group stopped by our dc office to talk about drugfree communities and see the capitol thanks and keep up the good work ',
 'talofa today my colleagues and i in the housevetaffairs held an oversight hearing examining how recent legislation passed by congress has been implemented by the va read my pr here ',
 'talofa im pleased to announce that the epa has awarded american samoa a grant for diesel emissions reduction this is the second grant that we have received from them in two weeks read my pr here  ',
 'as chair of the health amp technology subcommittee cyber security is one of the most important issues facing our small businesses i remain committed in ensuring that our small businesses have the resources they need to protect themselves from cyber attacks ',
 'before your weekend begins check out my top photos from this week in indiana   ',
 'important information for anyone impacted by flooding follow idhs for more updates ',
 'visited mckinney farms i

In [78]:
tokenized = [spacy_tokenizer(tweet) for tweet in clean]

In [79]:
model2 = gensim.models.Word2Vec(tokenized, size=100)
w2v_2 = dict(zip(model2.wv.index2word, model2.wv.syn0))

  


In [107]:
mult_nb = Pipeline([("count_vectorizer", CountVectorizer(analyzer=lambda x: x)), 
                    ("bernoulli nb", MultinomialNB())])
gaus_nb_tfidf = Pipeline([("tfidf_vectorizer", TfidfVectorizer(analyzer=lambda x: x)), 
                          ("bernoulli nb", GaussianNB())])

etree_w2v = Pipeline([("word2vec vectorizer", MeanEmbeddingVectorizer(w2v_2)), 
                        ("extra trees", ExtraTreesClassifier(n_estimators=200))])
etree_w2v_tfidf = Pipeline([("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v_2)), 
                        ("extra trees", ExtraTreesClassifier(n_estimators=200))])

In [110]:
all_models = [
    ("mult_nb", mult_nb),
    ("w2v", etree_w2v),
    ("w2v_tfidf", etree_w2v_tfidf)
]


unsorted_scores = [(name, cross_val_score(model, tokenized, y, cv=3, scoring='roc_auc').mean()) for name, model in all_models]
scores = sorted(unsorted_scores, key=lambda x: -x[1])

TypeError: Singleton array array(0) cannot be considered a valid collection.

In [None]:
print (tabulate(scores, floatfmt=".4f", headers=("model", 'score')))