# Text Processing with spaCy

In [182]:
import gzip
import pickle
import numpy as np
from configparser import ConfigParser
from pymongo import MongoClient
from pprint import pprint
import pandas as pd
import re
import string
punctuations = string.punctuation

# spaCy
import spacy
nlp = spacy.load('en')

# gensim
from gensim import corpora, models, similarities, matutils

# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stopwords 
from sklearn.preprocessing import Normalizer

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Read in tweet data from Mongodb

In [11]:
config = ConfigParser()
config.read('../config.ini')

client = MongoClient("mongodb://{}:{}@{}/{}".format(
                        config.get('MongoConfig', 'user'),
                        config.get('MongoConfig', 'password'),
                        config.get('MongoConfig', 'host'),
                        config.get('MongoConfig', 'db')
                        ), int(config.get('MongoConfig', 'port')))

In [12]:
db = client.twitter_db

In [13]:
db.collection_names()

['Init', 'tweets', 'favorited_tweets']

In [14]:
db.favorited_tweets.count()

695324

In [31]:
cursor = db.favorited_tweets.find({}).limit(10000)

In [32]:
tweet_data = [document  for document in cursor]

In [77]:
rawtext = [tweet['full_text'] for tweet in tweet_data]

rawtext[:10]

["100% correct. Congress must still act. This Administration will end DACA and has the authority to do so -- it just didn't do it legally. Yet. https://t.co/HxiDhioeMY",
 '#DACA renewals continue. Sessions &amp; DOJ overreached to skip Courts of Appeal. Congress must still act. #CleanDreamActNow #DreamActNow #Dreamers https://t.co/jUrwbqNU6p',
 'More courageous young people leading! #DreamActNow #CleanDreamAct https://t.co/WT0OuZbOVR',
 "Philando Castile's mom slams NRA chief LaPierre as a hypocrite - NY Daily News https://t.co/RLpLAw4VzX",
 'From one of the teen activists from #Parkland #BanAssaultWeapons #BoycottNRA https://t.co/RjnUwmCypF',
 '&lt;em&gt;Black Panther&lt;/em&gt; Actor Bambadjan Bamba Is a Dreamer https://t.co/r3CFWoVAAR\n#DreamActNow',
 'As an immigration attorney for 20 years - let\'s be clear: Melania\'s parents got greencards through Melania\'s family petition once she became US Citizen. Pres Trump\'s proposals and critique of "chain migration" would close the door

## Normalize Text

#### Clean, Tokenize and Vectorize (maybe Stem, Lemmatize)

In [135]:
def clean_tweet(tweet):
    """
    Function to remove urls, numbers and punctuation, and make lowercase
    """
    no_url = re.sub(r'http\S+', '', tweet)
    clean = re.sub(r'[^\w\s]', '', no_url)
    
    result = ''.join([i.replace('\n', ' ').lower() for i in clean if not i.isdigit()])

    return result

In [136]:
cleantext = [clean_tweet(i) for i in rawtext]
print(rawtext[1], '\n')
print(newtext[1])

#DACA renewals continue. Sessions &amp; DOJ overreached to skip Courts of Appeal. Congress must still act. #CleanDreamActNow #DreamActNow #Dreamers https://t.co/jUrwbqNU6p 

daca renewals continue sessions amp doj overreached to skip courts of appeal congress must still act cleandreamactnow dreamactnow dreamers 


In [165]:
def spacy_tokenizer(tweet):
    """
    Utility function to remove stopwords, ignore pronouns and tokenize words before vectorizing
    """
    doc = nlp(tweet)
    tokens = [token.orth_ for token in doc if 
              (token.orth_ not in stopwords and token.lemma_ != "-PRON-")]
            
    return tokens

In [193]:
#create vectorizer object to generate feature vectors, we will use custom spacy’s tokenizer
vectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer, 
                             ngram_range=(1,1))

In [194]:
vectorizer.fit(cleantext)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function spacy_tokenizer at 0x144c13e18>, use_idf=True,
        vocabulary=None)

In [195]:
counts = vectorizer.transform(cleantext).transpose()

In [196]:
np.shape(counts)

(16369, 10000)

In [197]:
pd.DataFrame(counts.toarray(), vectorizer.get_feature_names()).head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
,0.14053,0.0,0.0,0.060122,0.0,0.06541,0.083165,0.0,0.097311,0.0,...,0.094963,0.074603,0.081438,0.077199,0.092967,0.086079,0.077577,0.081012,0.092939,0.103286
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [198]:
corpus = matutils.Sparse2Corpus(counts)

In [199]:
id2word = dict((v, k) for k, v in vectorizer.vocabulary_.items())

In [200]:
len(id2word)

16369

In [201]:
with open('../data/processed/word_features.pkl', 'wb') as wf:
    pickle.dump(id2word, wf)

## Dimensionality Reduction and Topic Modeling

### Try LDA

In [202]:
lda = models.LdaModel(corpus=corpus, num_topics=10, id2word=id2word, passes=10)

2018-02-28 18:21:39,190 : INFO : using symmetric alpha at 0.1
2018-02-28 18:21:39,192 : INFO : using symmetric eta at 0.1
2018-02-28 18:21:39,196 : INFO : using serial LDA version on this node
2018-02-28 18:21:39,963 : INFO : running online (multi-pass) LDA training, 10 topics, 10 passes over the supplied corpus of 10000 documents, updating model once every 2000 documents, evaluating perplexity every 10000 documents, iterating 50x with a convergence threshold of 0.001000
2018-02-28 18:21:39,971 : INFO : PROGRESS: pass 0, at document #2000/10000
2018-02-28 18:21:41,113 : INFO : merging changes from 2000 documents into a model of 10000 documents
2018-02-28 18:21:41,199 : INFO : topic #3 (0.100): 0.017*" " + 0.011*"tv" + 0.004*"county" + 0.003*"atlanta" + 0.003*"shooting" + 0.003*"pm" + 0.003*"missing" + 0.003*"police" + 0.003*"flood" + 0.003*"schools"
2018-02-28 18:21:41,204 : INFO : topic #8 (0.100): 0.015*" " + 0.006*"tv" + 0.004*"youtube" + 0.004*"  " + 0.004*"atlanta" + 0.004*"shooti

In [203]:
lda.print_topics()

2018-02-28 18:22:30,161 : INFO : topic #0 (0.100): 0.010*" " + 0.007*"true" + 0.005*"good" + 0.005*"funder" + 0.004*"complicit" + 0.003*"school" + 0.003*"media" + 0.003*"la" + 0.003*"social" + 0.003*"telemundopr"
2018-02-28 18:22:30,166 : INFO : topic #1 (0.100): 0.011*" " + 0.004*"nt" + 0.004*"therickydavila" + 0.004*"nra" + 0.003*"thank" + 0.003*"ca" + 0.003*"war" + 0.003*"que" + 0.003*"support" + 0.003*"wo"
2018-02-28 18:22:30,167 : INFO : topic #2 (0.100): 0.009*" " + 0.005*"msnbc" + 0.004*"dumptrump" + 0.004*"thanks" + 0.004*"rd" + 0.003*"flooding" + 0.003*"democratic" + 0.003*"central" + 0.003*"stormtrack" + 0.003*"workers"
2018-02-28 18:22:30,169 : INFO : topic #3 (0.100): 0.014*" " + 0.005*"billy" + 0.005*"graham" + 0.005*"february" + 0.004*"valley" + 0.004*"cj_disabledvet" + 0.004*"hell" + 0.004*"gold" + 0.004*"weather" + 0.004*"pm"
2018-02-28 18:22:30,170 : INFO : topic #4 (0.100): 0.012*"safetypindaily" + 0.009*" " + 0.004*"like" + 0.003*"red" + 0.003*"season" + 0.003*"ll" +

[(0,
  '0.010*" " + 0.007*"true" + 0.005*"good" + 0.005*"funder" + 0.004*"complicit" + 0.003*"school" + 0.003*"media" + 0.003*"la" + 0.003*"social" + 0.003*"telemundopr"'),
 (1,
  '0.011*" " + 0.004*"nt" + 0.004*"therickydavila" + 0.004*"nra" + 0.003*"thank" + 0.003*"ca" + 0.003*"war" + 0.003*"que" + 0.003*"support" + 0.003*"wo"'),
 (2,
  '0.009*" " + 0.005*"msnbc" + 0.004*"dumptrump" + 0.004*"thanks" + 0.004*"rd" + 0.003*"flooding" + 0.003*"democratic" + 0.003*"central" + 0.003*"stormtrack" + 0.003*"workers"'),
 (3,
  '0.014*" " + 0.005*"billy" + 0.005*"graham" + 0.005*"february" + 0.004*"valley" + 0.004*"cj_disabledvet" + 0.004*"hell" + 0.004*"gold" + 0.004*"weather" + 0.004*"pm"'),
 (4,
  '0.012*"safetypindaily" + 0.009*" " + 0.004*"like" + 0.003*"red" + 0.003*"season" + 0.003*"ll" + 0.002*"daddy" + 0.002*"game" + 0.002*"canada" + 0.002*"redtraccoon"'),
 (5,
  '0.010*"krassenstein" + 0.008*" " + 0.008*"realdonaldtrump" + 0.007*"edkrassen" + 0.005*"irdotnet" + 0.005*"dwyer" + 0.005*"

## Try LSA

In [204]:
lsa = TruncatedSVD(10, algorithm = 'arpack')
dtm_lsa = lsa.fit_transform(counts)
dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)


In [205]:
lsa.explained_variance_ratio_


array([0.00981943, 0.0043201 , 0.00347297, 0.00310564, 0.00301853,
       0.00270269, 0.00267642, 0.00246245, 0.00231297, 0.00224262])

In [206]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [207]:
display_topics(lsa, vectorizer.get_feature_names(), 10)


Topic  0
launching, fouryearold, lordthey, law, lesson, lawed, leahr, freepressfree, launderers, hounddog

Topic  1
destroyiny, detention, dustup, nordic, non, noble, applesgrapes, cohort, conkling, fit

Topic  2
moments, glue, mirror, middleclass, mucha, material, marine, fringe, andrews, mrcalliet

Topic  3
austrias, authorized, auth, authorpmbarrett, attest, aurora, audition, averting, ateam, attract

Topic  4
experienced, cases, cdc, exposure, explode, cavs, categorized, attitude, atlanticresolve, esa

Topic  5
cape, canalearly, callin, branding, bowling, cruel, boycotting, cj_disabledvet, churchs, classless

Topic  6
divertidos, djb, boydmatheson, braves, esa, brady, ericboehlert, erikhalvorsen, cdc, explode

Topic  7
documentary, edgerton, documentaries, community, directs, dip, earl, cmcl, duh, departmentthey

Topic  8
doorwomen, amy_rosenow, educationlibs, conclusion, flavors, concretepark, fantasy, cinco, chrishodgeman, chiefilliniwek

Topic  9
bonhams, crusaders, boom, borou