In [13]:
from gensim import corpora, models, similarities
import os
import json



## Tutorial from Gensim

https://radimrehurek.com/gensim/tut1.html

In [14]:
# Load target file and supporting documents

home_dir = "/Users/christopherallison/.virtualenvs/py_twi/streaming_results"

target = 'harper_stream.json'

documents = []

with open(os.path.join(home_dir, target), 'r') as f:
        
    for data in f:
        
        result = json.loads(data)
        
        documents.append(result['text'])

In [20]:
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents]

# Remove words that only occur once
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
        
texts = [[token for token in text if frequency[token] > 1] for text in texts]

from pprint import pprint
pprint(texts)

[['rt', 'but', 'how', 'about', 'but', '…'],
 ['rt',
  '@justintrudeau:',
  'stop',
  'stephen',
  'harper,',
  'we',
  'need',
  'you',
  'vote.',
  'find',
  'your',
  'advance',
  'polling',
  'location:',
  'http://t.co/rtszotp9cc',
  'http://t.co/gofo…'],
 ['vote', 'abc'],
 ['@1planet_people',
  'your',
  'an',
  'idiot.',
  'harper',
  'best',
  'prime',
  'minister',
  'last',
  '40',
  'yrs.',
  'your',
  'regressive',
  'liberal',
  'make',
  'no',
  'sense'],
 ['rt',
  'check',
  'out',
  '@sonamakapoor',
  '@beingsalmankhan',
  "harper's",
  'bazaar',
  'bride'],
 ['i', 'just', 'this', 'please', 'join', 'via', '@leadnowca'],
 ["it's", 'you', 'we', 'have', 'is', 'what'],
 ['rt',
  'so',
  'happy',
  'you',
  'voted!',
  'this',
  'early',
  'turnout',
  'bodes',
  'well',
  'canada!',
  "can't",
  'wait',
  'see',
  'back',
  '&amp;',
  'bring',
  'back',
  'our',
  'de…'],
 ['harper', 'ex'],
 ['rt',
  '@vincenlair:',
  'je',
  'pense',
  'aller',
  'voter',
  'le',
  'visage'

In [21]:
dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/tweets.dict')
print(dictionary)

Dictionary(469 unique tokens: ['ads.', '@abcveterans2015:', 'htt…', 'hates', 'read']...)


In [22]:
print(dictionary.token2id)

{'ads.': 233, '@abcveterans2015:': 306, 'htt…': 180, 'hates': 451, 'read': 174, 'us': 173, 'people': 184, 'since': 454, 'grey.': 456, 'anything': 420, 'country.': 316, 'ahead.': 318, '#c…': 416, 'reach': 129, 'gets': 297, 'at': 121, 'hockey': 352, 'cover-up': 463, 'again': 249, 'investigated': 422, 'dénonce': 78, 'coffin': 230, '@acoyne': 202, 'during': 119, 'hair': 455, '#yegcentre,': 391, 'yet': 250, 'lose': 399, 'government': 181, 'them.': 375, 'divide': 315, 'within': 128, '#heavesteve': 450, 'kidding.': 139, 'https://t.co/id3aqzq2ha': 296, 'liberal': 31, 'couvert': 84, 'know': 216, 'management.': 361, 'on': 178, 'defeat': 446, 'spew': 401, '1': 407, 'seat': 410, 'maybe': 204, 'victory': 302, 'an': 25, 'political': 368, 'conservative': 345, 'u': 412, '4': 280, 'bargained': 156, '@sonamakapoor': 39, 'get': 111, 'him': 113, 'say': 106, 'canadian': 163, 'n': 405, 'report': 253, 'election.': 464, 'let': 110, 'pm': 272, 'canada': 247, 'matching': 149, 'my': 143, '-&gt;': 194, 'behind': 

In [24]:
new_documents = []

with open(os.path.join(home_dir,'trudeau_stream.json')) as f:
    for data in f:
        
        result = json.loads(data)
        
        new_documents.append(result['text'])
    

In [26]:
new_vec = [dictionary.doc2bow(new_document.lower().split())
           for new_document in new_documents]

In [27]:
print(new_vec[0])

[(2, 1), (25, 1), (33, 1), (52, 1), (216, 1), (217, 1), (220, 1), (243, 1), (247, 1), (378, 1), (404, 1)]


In [70]:
class MyCorpus(object):
    def __init__(self, target):
        self.target = target
    
    def __iter__(self):
        for line in open(os.path.join(home_dir, target)):
            result = json.loads(line)
            yield dictionary.doc2bow(line.lower().split())

In [71]:
corpus_memory_friendly = MyCorpus('trudeau_stream.json')

In [72]:
print(corpus_memory_friendly)

<__main__.MyCorpus object at 0x1084bf9b0>


In [73]:
for vector in corpus_memory_friendly:
    print(vector)

[(0, 4), (1, 2), (3, 2), (69, 1), (82, 1), (117, 1), (125, 1), (163, 1), (190, 1), (245, 2), (301, 1), (330, 2)]
[(5, 2), (6, 1), (7, 2), (8, 2), (11, 2), (12, 2), (13, 2), (14, 2), (15, 2), (16, 2), (17, 2), (18, 2), (19, 2), (53, 1), (76, 1), (79, 1), (118, 1), (121, 1), (137, 2), (224, 1), (232, 1), (245, 2), (303, 1), (330, 2), (388, 1)]
[(1, 1), (7, 1), (21, 1), (54, 1), (98, 1), (99, 2), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (108, 1), (109, 1), (110, 1), (111, 1), (112, 1), (113, 1), (114, 1), (116, 1), (117, 1), (173, 1), (245, 2), (330, 1)]
[(13, 2), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (245, 1)]
[(1, 1), (37, 2), (38, 1), (39, 2), (40, 2), (41, 2), (42, 2), (43, 2), (56, 1), (245, 2)]
[(44, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (110, 1), (167, 1), (178, 1), (201, 1), (245, 1), (330, 1)]
[(7, 1), (19, 1), (30, 2), (31, 1), (43, 1), (46, 1), (51, 2), (52, 1)

In [41]:
corpora.MmCorpus.serialize('/tmp/harper.mm', corpus_memory_friendly)

In [43]:
tfidf = models.TfidfModel(corpus_memory_friendly)

In [44]:
doc_bow = [(0, 1), (1, 1)]
print(tfidf[doc_bow])

[(0, 0.7280147807344828), (1, 0.6855614334486175)]


In [45]:
corpus_tfidf = tfidf[corpus_memory_friendly]

In [46]:
for doc in corpus_tfidf:
    print(doc)

[(0, 0.6149952163571941), (1, 0.2895662376967936), (3, 0.48974989702901883), (69, 0.22469815992958578), (82, 0.2733125348233778), (117, 0.1319958656332269), (125, 0.1604334519420953), (163, 0.19626057362071742), (190, 0.22469815992958578), (301, 0.19626057362071742), (330, 0.04691291718393414)]
[(5, 0.2579858343345365), (6, 0.13949777790667178), (7, 0.13980199811615412), (8, 0.3257790688813821), (11, 0.13540101650240843), (12, 0.2918824516079593), (13, 0.17257645968800525), (14, 0.24917776428469365), (15, 0.26783241891384707), (16, 0.3257790688813821), (17, 0.3257790688813821), (18, 0.3257790688813821), (19, 0.20003918436700147), (53, 0.04570998925808034), (76, 0.10764057350563543), (79, 0.12060494786232677), (118, 0.10240879514960625), (121, 0.09357614428619287), (137, 0.22104890584580855), (224, 0.14594122580397964), (232, 0.12899291716726824), (303, 0.08008432227142862), (330, 0.027959285673757907), (388, 0.14594122580397964)]
[(1, 0.13360443226038718), (7, 0.10823125367703144), (21

In [53]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)

In [54]:
corpus_lsi = lsi[corpus_tfidf]

In [55]:
lsi.print_topics(2)

['0.176*"not" + 0.170*"you" + 0.170*"is" + 0.164*"on" + 0.162*"harper" + 0.147*"my" + 0.146*"stephen" + 0.145*"have" + 0.144*"i" + 0.144*"#elxn42"',
 '0.312*"check" + 0.307*"|" + 0.265*"sweary," + 0.265*"accurate" + 0.265*"report" + 0.254*"angry" + 0.253*"be" + 0.249*"idiot." + 0.247*"yet" + 0.246*"fucking"']

In [56]:
for doc in corpus_lsi:
    print(doc)

[(0, 0.18752961869475435), (1, -0.088852408672850908), (2, 0.03322281030190187), (3, -0.019603435899490342), (4, 0.028639248423830522), (5, -0.044754104874815785), (6, 0.01553417011482125), (7, -0.11794967415817087), (8, 0.042335724066109436), (9, -0.12171620729596146), (10, 0.064409478897927031), (11, -0.13904398081476094), (12, 0.025178426456792826), (13, -0.062696802386713518), (14, 0.01040337623106633), (15, 0.1143568165721409), (16, -0.11182264595081581), (17, 0.036867571285630819), (18, -0.0095756357664235452), (19, 0.0067527260210240156), (20, -0.03087083407210825), (21, 0.10360915528200335), (22, -0.015989655519532849), (23, 0.094452105798811747), (24, -0.011606700359981322), (25, 0.011921491965819025), (26, -0.046404331126884267), (27, 0.063594433714442261), (28, -0.10177607141484493), (29, 0.133183954048743), (30, -0.11093025203432727), (31, 0.1977247237205805), (32, -0.070239815495669336), (33, 0.024690431051703651), (34, -0.15047979330255096), (35, 0.13920911768854266), (36

In [57]:
lsi.save('/tmp/model.lsi')
lsi = models.LsiModel.load('/tmp/model.lsi')

In [58]:
lsi

<gensim.models.lsimodel.LsiModel at 0x108444908>

In [61]:
lsi.print_topics(10)

['0.176*"not" + 0.170*"you" + 0.170*"is" + 0.164*"on" + 0.162*"harper" + 0.147*"my" + 0.146*"stephen" + 0.145*"have" + 0.144*"i" + 0.144*"#elxn42"',
 '0.312*"check" + 0.307*"|" + 0.265*"sweary," + 0.265*"accurate" + 0.265*"report" + 0.254*"angry" + 0.253*"be" + 0.249*"idiot." + 0.247*"yet" + 0.246*"fucking"',
 '-0.234*"campaign" + -0.207*"trudeau" + 0.199*"-" + -0.180*"against" + -0.170*"stephen" + -0.163*"by" + -0.146*"polling" + -0.146*"location:" + -0.146*"advance" + -0.146*"vote."',
 '-0.254*"conservative" + -0.250*"against" + 0.214*"trudeau" + -0.195*"#cdnpoli" + 0.184*"harper," + -0.182*"vote" + 0.177*"location:" + 0.177*"vote." + 0.177*"advance" + 0.177*"polling"',
 '-0.315*"-" + -0.182*"harper." + -0.182*"go" + -0.173*"ads" + -0.173*"collapses" + -0.173*"brand" + -0.173*"desperate" + 0.170*"have" + -0.164*"say" + -0.161*"|"',
 '0.311*"|" + 0.290*"have" + 0.241*"make" + 0.236*"my" + 0.214*"@minjoeoliver" + 0.210*"everything" + 0.196*"@go4darrell:" + 0.183*"you" + -0.151*"not" + 