In [1]:
#####################
#     LOAD DATA     #
#####################

import pandas as pd

def process_tweet(tweet):  
    d = {}
    d['hashtags'] = [hashtag['text'] for hashtag in tweet['entities']['hashtags']]
    d['text'] = tweet['full_text']
    d['user'] = tweet['user']['screen_name']
    d['user_loc'] = tweet['user']['location']
    d['created_at'] = tweet['created_at']
    return d
                    

tweets = pd.read_csv("/home/simi/projects/robot/senators-1-tweets.csv", header=None, names=['hashtags', 'text', 'user', 'user_location', 'created_at'])  
print('num tweets: {}'.format(len(tweets)))

num tweets: 449334


In [15]:
import spacy
from __future__ import unicode_literals

nlp = spacy.load('en_core_web_md')

def tokenize(text):
    lda_tokens = []
    tokens = nlp(unicode(text, 'utf-8'))
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        elif token.orth_.startswith('.@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens


import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma

from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens
    

import random

docs = []
hashtags = []
N = len(tweets) #25000
rand_tweets = random.sample(range(len(tweets)), k=N)
tweets_tb = tweets.values
for i, tw in enumerate(rand_tweets):
    if i % 1000 == 0:
        print('{}% '.format(100./N*i))
    text = tweets_tb[i][1]
    tokens = prepare_text_for_lda(text)
    if random.random() > .999:
        print(tokens)
    docs.append(tokens)
    tags = tweets_tb[i][0]
    tags = unicode(tags, 'utf-8')
    taggs = tags.replace('[', '').replace(']', '').replace('\'', '').split(",")
    hashtags.append([t.strip() for t in taggs])    

[nltk_data] Downloading package wordnet to /home/simi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/simi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0.0% 
0.222551598588% 
[u'SCREEN_NAME', u'thank', u'woman', u'every', u'member', u'military', u'abroad']
[u'please', u'looking', u'solareclipse2017', u'today', u'necessary', u'safety', u'glasses']
[u'place', u'represent', u'conservation', u'space', u'enjoy', u'hunting', u'fishing', u'recreate']
0.445103197176% 
[u'great', u'SCREEN_NAME', u'story', u'folks', u'others', u'hero', u'selflesssaturday']
[u'blackfriday', u'shopping', u'today', u'forget', u'around', u'health', u'insurance', u'enrollment', u'december', u'visit', u'deadline', u'explore', u'option']
0.667654795764% 
0.890206394353% 
[u'SCREEN_NAME', u'SCREEN_NAME', u'beginning', u'process', u'refund', u'thousand', u'veteran', u'tax', u'disability', u'severance', u'please']
[u'SCREEN_NAME', u'thank', u'SCREEN_NAME', u'leadership', u'protectourcoast', u'offshore', u'drilling']
[u'icymi', u'SCREEN_NAME', u'publiclands', u'essential', u'economy']
[u"intro'd", u'SCREEN_NAME', u'ensure', u'arizona', u'border', u'state', u'reimburse', u

[u'boonville', u'site', u'SCREEN_NAME', u'national', u'register', u'historic', u'place', u'missourispotlight']
15.8011634998% 
16.0237150983% 
[u'SCREEN_NAME', u'provisions', u'work', u'together', u'strong', u'going', u'unprecedented', u'mstreforms']
[u'death', u'rise', u'afghanistan', u'still', u'strategy', u'defeat', u'terrorism']
[u'SCREEN_NAME', u'obamacare', u'premium', u'savvy', u'shopper']
[u'SCREEN_NAME', u'russia', u'despite', u'shipping', u'syria', u'remove']
16.2462666969% 
[u'pray', u'pilot', u'crash', u'bagdad', u'arizona', u'appreciate', u'effort', u'first', u'responder', u'SCREEN_NAME', u'entire', u'community']
[u'SCREEN_NAME', u'decline', u'antiterror', u'surveillance', u'paris', u'reopen', u'debate', u'intelligence', u'collection']
16.4688182955% 
16.6913698941% 
[u'SCREEN_NAME', u'inconvenient', u'truth', u'help', u'spark', u'decade', u'climateaction', u'ait10']
[u'SCREEN_NAME', u'surgeon', u'general', u'congress', u'gunviolence', u'research', u'working', u'SCREEN_NAM

29.154259415% 
[u'years', u'price', u'solar', u'system', u'drop', u'cleanenergy']
29.3768110136% 
29.5993626122% 
[u'SCREEN_NAME', u'usmnt', u'worldcup2014']
[u'excite', u'kickoff', u'forest', u'football', u'tonight', u'deacs', u'ncsen', u'ncgop']
29.8219142108% 
[u'deadline', u'health', u'start', u'1/1/2015', u'today', u'getcovered']
[u'SCREEN_NAME', u'working', u'district', u'great', u'partner', u'ohioan']
30.0444658094% 
30.267017408% 
[u'SCREEN_NAME', u'anyone', u'want', u'leave', u'voice', u'message', u'SCREEN_NAME', u'filibuster']
30.4895690066% 
30.7121206052% 
30.9346722038% 
[u'watch', u'today', u'event', u'victim', u'violence', u'newtown', u'alliance', u'honorwithaction', u'endgunviolence']
31.1572238023% 
31.3797754009% 
[u'ozone', u'proof', u'obama', u'admin', u'turning', u'american', u'washington', u'focus']
[u'president', u'going', u'follow', u'science', u'politics', u'keystonexl']
31.6023269995% 
31.8248785981% 
32.0474301967% 
32.2699817953% 
32.4925333939% 
[u'trump', 

44.0652165205% 
44.287768119% 
[u'senator', u'hatch', u'speaking', u'floor', u'momentarily', u'nomination', u'judge', u'brett', u'kavanaugh', u'unite', u'state', u'supreme', u'court', u'utpol']
44.5103197176% 
44.7328713162% 
[u'SCREEN_NAME', u'welcome', u'nomination', u'SCREEN_NAME', u'judge', u'serve', u'district', u'judge', u'idaho', u'SCREEN_NAME', u'legal']
44.9554229148% 
45.1779745134% 
[u'SCREEN_NAME', u'vote', u'repeal', u'netneutrality', u'allow', u'internet', u'service', u'provider', u'internet', u'uneven', u'playing', u'field', u'lane', u'lane', u'others', u'working', u'savetheinternet', u'restore', u'netneutrality']
45.400526112% 
45.6230777106% 
[u'tbt--', u'senator', u'hatch', u'justice', u'bad', u'ginsburg', u'discuss', u'upcoming', u'confirmation', u'process', u'hatch', u'ranking', u'member', u'judiciary', u'committee', u'justice', u'ginsburg', u'would', u'confirm']
45.8456293092% 
46.0681809077% 
[u'exactly', u'since', u'SCREEN_NAME', u'slash', u'bearsears', u'grandst

58.3085188301% 
[u'today', u'pass', u'make', u'meaningful', u'investment', u'transportation', u'infrastructure', u'mtpol']
[u'forward', u'meeting', u'judge', u'garland', u'review', u'credentials', u'ensure', u'uphold', u'constitution', u'scotusnominee', u'mtpol']
58.5310704287% 
58.7536220273% 
[u'join', u'SCREEN_NAME', u'capitol', u'assure', u'alaskan', u'despite', u'little', u'office', u'still']
58.9761736259% 
59.1987252244% 
[u'speaking', u'grappone', u'toyota', u'energyefficiency', u'fast', u'cheap', u'energy', u'need']
59.421276823% 
59.6438284216% 
[u'SCREEN_NAME', u'sensessions', u'deserve', u'hearing', u'deeply', u'concern', u'regard', u'record', u'civil', u'disability', u'right']
[u'SCREEN_NAME', u'fight', u'pinksourcing', u'SCREEN_NAME', u'paycheckfairness', u'doyourjob']
59.8663800202% 
[u'SCREEN_NAME', u'video', u'leahy', u'meet', u'SCREEN_NAME', u'merrick', u'garland', u'doyourjob']
60.0889316188% 
[u'enjoy', u'weekend', u'starkey', u'SCREEN_NAME', u'vineyard', u'SCREEN_N

[u'happy', u'lunarnewyear', u'coloradan', u'american', u'celebrate', u'festive', u'holiday', u'everyone', u'prosperous']
72.7743727383% 
72.9969243369% 
73.2194759355% 
[u'today', u'SCREEN_NAME', u'hearing', u'testimony', u'secdef', u'strategy', u'defeat', u'watch']
73.4420275341% 
[u'believe', u'grave', u'mistake', u'build', u'pipeline', u'little', u'increase', u'energy', u'security', u'nodapl', u'nokxl']
[u'america', u'beautiful', u'information', u'event', u'americarecyclesday']
[u'SCREEN_NAME', u'august', u'celebrate', u'100th', u'anniversary', u'national', u'service', u'\u2014@potus', u'findyourpark']
73.6645791327% 
73.8871307313% 
[u'disclose', u'president', u'campaign', u'leaders', u'opponent', u'disclose', u'friend', u'leaf', u'union']
74.1096823298% 
[u'morning', u'mukasey&amp;chair', u'SCREEN_NAME', u'enforcement', u'cjreform']
74.3322339284% 
[u'frighten', u'polio', u'nuclear', u'weapon', u'everyone', u'worldpolioday']
[u'opportunity', u'speak', u'SCREEN_NAME', u'aap15', u'r

86.1274686536% 
[u'corlyn', u'nurse', u'practitioner', u'eagle', u'point', u'firsthand', u'lifeline', u'oregonian', u'saveourcare', u'oregonacastory']
86.3500202522% 
[u'years', u'today', u'ussnautilus', u'ssn-571', u'become', u'first', u'watercraft', u'reach', u'geographic', u'north', u'delight', u'submarine', u'force', u'museum', u'groton', u'commemorate', u'event', u'learn']
[u'senator', u'hatch', u'speaking', u'floor', u'immigration', u'proposal', u'isquared', u'immigrationinnovation']
86.5725718508% 
86.7951234494% 
[u'appropriation', u'chair', u'patrick', u'leahy', u'senate', u'floor', u'afternoon', u'trumpshutdown', u'leadership', u'chaos', u'inability', u'govern', u'president', u'exactly', u'want', u'trumpshutdown', u'speech']
87.017675048% 
87.2402266465% 
[u'SCREEN_NAME', u'SCREEN_NAME', u'simply', u'would', u'answer', u'question', u'pushing', u'showusthebill']
87.4627782451% 
87.6853298437% 
[u'report', u'outline', u'specific', u'guideline', u'ensure', u'secret', u'deal', u'

[u'coming', u'netneutrality', u'going', u'onemorevote']
[u'trumpbudget', u'zero', u'funding', u'effort', u'clean', u'america', u'important', u'body', u'water', u'chesapeake']
[u'working', u'protect', u'lgbtq', u'individual', u'discrimination', u'education', u'housing', u'credit', u'employment', u'hrcladinner']
98.367806576% 
[u'james', u'madison', u'young', u'delegate', u'continental', u'congress', u'philadelphia', u'vaisforpresidents']
[u'head', u'npcbee', u'phone', u'friend', u'tonight', u'tejas', u'speed', u'npclive']
98.5903581745% 
98.8129097731% 
99.0354613717% 
[u'SCREEN_NAME', u'motivate', u'incident', u'affront', u'share', u'values', u'SCREEN_NAME', u'nohateact', u'strengthen', u'federal']
99.2580129703% 
[u'great', u'discussion', u'group', u'representative', u'world', u'class', u'producer', u'today', u'thank', u'SCREEN_NAME', u'joining', u'conversation', u'nebraskaag', u'farmbill18', u'trade', u'afternoon', u'feedtheworld']
99.4805645689% 
[u'numbers', u'corporation', u'reap'

In [39]:
s = dict()
for tags in hashtags:
    for t in tags:
        if t not in s:
            s[t] = 0
        s[t] += 1
num_tags = list()
for tag, num in s.items():
    num_tags.append((num, tag))
num_tags.sort(reverse=True)

ok_tags = set()
for num, tag in num_tags[:6000]:
    ok_tags.add(tag)

In [41]:
num_lines = 0
with open('my_training_file_{}.txt'.format(N), 'w') as trainer_file:
    for i, doc in enumerate(docs):
        if i > len(docs)*0.9:
            break
            
        for tag in hashtags[i]:
            if tag not in ok_tags:
                continue
            val = '\t'.join([tag, ' '.join(doc)])
            trainer_file.write(val.encode('utf8'))
            trainer_file.write('\n')
            num_lines += 1
            
print(num_lines, " written")
    

(467580, u' written')


In [36]:
with open('my_training_file_25000.txt', 'w') as trainer_file:
    for i, doc in enumerate(docs):
        if i < 1000:
            continue
        for tag in hashtags[i]:
            val = '\t'.join([tag, ' '.join(doc)])
            trainer_file.write(val.encode('utf8'))
            trainer_file.write('\n')

In [44]:
num_lines = 0

with open('my_testing_file_{}.txt'.format(N), 'w') as trainer_file:
    for i, doc in enumerate(docs):
        if i < len(docs)*0.9:
            continue
        for tag in hashtags[i]:
            if tag not in ok_tags:
                continue
            val = '\t'.join([tag, ' '.join(doc)])
            trainer_file.write(val.encode('utf8'))
            trainer_file.write('\n')
            num_lines += 1
            
print(num_lines, " written")


(53100, u' written')


In [49]:
docs_test = []
hashtags_test = []
N = 100
rand_tweets = random.sample(range(len(tweets)), k=N)
for i, tw in enumerate(rand_tweets):
    text = tweets.iloc[i]['text']
    tokens = prepare_text_for_lda(text)
    docs_test.append(tokens)
    tags = tweets.iloc[i]['hashtags']
    tags = unicode(tags, 'utf-8')
    taggs = tags.replace('[', '').replace(']', '').replace('\'', '').split(",")
    hashtags_test.append([t.strip() for t in taggs])    

In [50]:
with open('my_testing_file.txt', 'w') as trainer_file:
    for i, doc in enumerate(docs):
            val = '\t'.join([','.join(hashtags[i]), ' '.join(doc)])
            trainer_file.write(val.encode('utf8'))
            trainer_file.write('\n')