In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from collections import defaultdict
import gensim

In [3]:
# Location of the labeled training reviews (a .tsv file), relative to the working directory.
train_path = "data/movie/labeledTrainData.tsv"

In [4]:
# Load the tab-separated training file; the first row supplies the column names.
# quoting=3 is csv.QUOTE_NONE: double quotes are treated as ordinary characters,
# so quotes inside the review text do not confuse the parser.
train = pd.read_csv(train_path, header=0, delimiter="\t", quoting=3)

In [5]:
# Cache for the stopword set so it is built once, not once per review.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, building it on first use."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def review_to_words(raw_review, stops=None):
    """Turn one raw review into a space-separated string of meaningful words.

    Steps: strip HTML markup, replace every non-letter with a space,
    lower-case, split on whitespace, and drop stopwords.

    Parameters
    ----------
    raw_review : str
        Review text, possibly containing HTML.
    stops : collection of str, optional
        Words to discard. Defaults to the NLTK English stopword list,
        which is cached after the first call.

    Returns
    -------
    str
        The remaining lower-case words joined by single spaces.
    """
    if stops is None:
        stops = _english_stopwords()

    # Remove HTML. The parser is named explicitly: the recorded run warned that
    # none was given and suggested "lxml", so results stay the same and the
    # warning goes away.
    review_text = BeautifulSoup(raw_review, "lxml").get_text()

    # Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    # Convert to lower case, split into individual words
    words = letters_only.lower().split()

    # Drop stopwords and join the rest back into one string separated by a space
    return " ".join(w for w in words if w not in stops)

In [6]:
# Add a cleaned-text column by running every raw review through review_to_words.
# Note: this adds the column to `train` itself rather than creating a new frame.
train["clean_reviews"] = train["review"].map(review_to_words)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [7]:
# Preview only the first rows instead of rendering the entire frame.
train.head(10)

Unnamed: 0,id,sentiment,review,clean_reviews
0,"""5814_8""",1,"""With all this stuff going down at the moment ...",stuff going moment mj ve started listening mus...
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ...",classic war worlds timothy hines entertaining ...
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell...",film starts manager nicholas bell giving welco...
3,"""3630_4""",0,"""It must be assumed that those who praised thi...",must assumed praised film greatest filmed oper...
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ...",superbly trashy wondrously unpretentious explo...
5,"""8196_8""",1,"""I dont know why people think this is such a b...",dont know people think bad movie got pretty go...
6,"""7166_2""",0,"""This movie could have been very good, but com...",movie could good comes way short cheesy specia...
7,"""10633_1""",0,"""I watched this video at a friend's house. I'm...",watched video friend house m glad waste money ...
8,"""319_1""",0,"""A friend of mine bought this film for £1, and...",friend mine bought film even grossly overprice...
9,"""8713_10""",1,"""<br /><br />This movie is full of references....",movie full references like mad max ii wild one...


In [8]:
# Tally how often each cleaned token occurs across all reviews.
freq = defaultdict(int)
for review in train['clean_reviews']:
    # split() with no arguments already ignores leading/trailing whitespace
    for word in review.split():
        freq[word] = freq[word] + 1

In [9]:
# Tokenize each review, keeping only tokens whose corpus-wide count is
# above 5 and below 1,000,000.
texts = [
    [word for word in review.split() if 5 < freq[word] < 1000000]
    for review in train['clean_reviews']
]

In [10]:
# Build the gensim token dictionary from the frequency-filtered token lists.
dictionary = gensim.corpora.Dictionary(texts)

In [11]:
# Persist the dictionary to disk alongside the data files.
dictionary.save('data/movie/train.dict')

In [12]:
class MyCorpus(object):
    """Stream reviews as bag-of-words vectors, one document at a time.

    Parameters
    ----------
    reviews : iterable of str, optional
        Cleaned review strings. Defaults to the notebook-level
        ``train['clean_reviews']`` column.
    id2word : object with a ``doc2bow(tokens)`` method, optional
        Token dictionary. Defaults to the notebook-level ``dictionary``.

    Calling ``MyCorpus()`` with no arguments behaves as before; passing
    arguments lets the same class be used on other data.
    """

    def __init__(self, reviews=None, id2word=None):
        self.reviews = reviews
        self.id2word = id2word

    def __iter__(self):
        # Fall back to the notebook globals at iteration time, so the
        # no-argument form sees whatever `train` / `dictionary` currently are.
        reviews = train['clean_reviews'] if self.reviews is None else self.reviews
        id2word = dictionary if self.id2word is None else self.id2word
        for review in reviews:
            # split() with no arguments already ignores surrounding whitespace
            yield id2word.doc2bow(review.split())

In [13]:
# Sanity check: print only the first bag-of-words vector, then stop iterating.
for vector in MyCorpus():
    print(vector)
    break

[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 1), (10, 1), (11, 2), (12, 1), (13, 2), (14, 1), (15, 3), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 3), (24, 1), (25, 1), (26, 1), (27, 3), (28, 1), (29, 1), (30, 1), (31, 2), (32, 1), (33, 2), (34, 1), (35, 3), (36, 1), (37, 1), (38, 1), (39, 1), (40, 2), (41, 3), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 2), (48, 2), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 2), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 4), (79, 1), (80, 1), (81, 2), (82, 1), (83, 1), (84, 5), (85, 1), (86, 2), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 2), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (105, 11), (106, 1), (107, 1), (108, 1), (109, 1), (110, 1)

In [14]:
# Stream the bag-of-words vectors to disk as an MmCorpus file.
gensim.corpora.MmCorpus.serialize('data/movie/train_corpus.mm', MyCorpus())

In [15]:
# Reopen the serialized corpus from disk for use in model training.
corp = gensim.corpora.MmCorpus('data/movie/train_corpus.mm')

In [16]:
# Fit a 50-topic LDA model on the bag-of-words corpus.
# NOTE(review): no seed is passed, so the topics presumably differ between runs —
# confirm, and pin a seed if the installed gensim supports one.
lda_model = gensim.models.LdaModel(corp, id2word=dictionary, num_topics=50)

In [17]:
# Fit an HDP topic model on the same corpus; no topic count is passed here.
hdp_model = gensim.models.HdpModel(corp, id2word=dictionary)

In [18]:
# Show LDA topics with their highest-weighted words (library defaults for counts).
lda_model.print_topics()

[(18,
  u'0.026*movie + 0.010*film + 0.007*like + 0.007*well + 0.006*one + 0.006*much + 0.006*characters + 0.005*character + 0.005*good + 0.005*book'),
 (11,
  u'0.019*film + 0.012*one + 0.009*jack + 0.005*football + 0.005*first + 0.004*good + 0.004*story + 0.003*time + 0.003*hotel + 0.003*code'),
 (21,
  u'0.009*story + 0.008*sinatra + 0.008*charlie + 0.007*film + 0.007*stewart + 0.007*one + 0.007*frank + 0.007*western + 0.005*also + 0.005*town'),
 (39,
  u'0.019*luke + 0.016*star + 0.012*wars + 0.007*vader + 0.006*emperor + 0.006*jabba + 0.006*han + 0.005*dentist + 0.005*rebel + 0.005*one'),
 (6,
  u'0.047*film + 0.014*story + 0.012*one + 0.008*well + 0.007*films + 0.005*much + 0.005*time + 0.005*first + 0.005*great + 0.005*characters'),
 (38,
  u'0.015*film + 0.014*jane + 0.008*version + 0.008*one + 0.007*story + 0.007*christian + 0.006*rochester + 0.006*victoria + 0.006*novel + 0.005*much'),
 (22,
  u'0.014*film + 0.006*man + 0.005*women + 0.005*also + 0.005*one + 0.004*life + 0.00

In [19]:
# Show up to 50 HDP topics, with the 5 highest-weighted words for each.
hdp_model.print_topics(topics=50, topn=5)

[u'topic 0: 0.014*movie + 0.013*film + 0.009*one + 0.007*like + 0.005*good',
 u'topic 1: 0.020*movie + 0.014*film + 0.009*one + 0.008*like + 0.006*good',
 u'topic 2: 0.022*movie + 0.013*film + 0.009*one + 0.008*like + 0.007*good',
 u'topic 3: 0.004*movie + 0.004*film + 0.003*one + 0.002*bad + 0.002*like',
 u'topic 4: 0.004*movie + 0.003*film + 0.002*great + 0.002*good + 0.002*like',
 u'topic 5: 0.004*film + 0.004*movie + 0.003*one + 0.002*like + 0.002*would',
 u'topic 6: 0.003*film + 0.003*movie + 0.002*one + 0.001*like + 0.001*people',
 u'topic 7: 0.003*film + 0.002*movie + 0.002*one + 0.001*good + 0.001*even',
 u'topic 8: 0.003*movie + 0.003*film + 0.001*one + 0.001*like + 0.001*well',
 u'topic 9: 0.002*movie + 0.001*film + 0.001*good + 0.001*one + 0.001*joe',
 u'topic 10: 0.002*film + 0.001*story + 0.001*one + 0.001*kelly + 0.001*iberia',
 u'topic 11: 0.002*movie + 0.001*like + 0.001*film + 0.001*celeste + 0.001*show',
 u'topic 12: 0.002*film + 0.002*movie + 0.001*raj + 0.001*one + 

In [20]:
# Fit a TF-IDF model on the bag-of-words corpus.
tfidf_model = gensim.models.TfidfModel(corp)

In [21]:
class MyCorpusTfidf(object):
    """Stream reviews as TF-IDF weighted vectors, one document at a time.

    Parameters
    ----------
    reviews : iterable of str, optional
        Cleaned review strings. Defaults to the notebook-level
        ``train['clean_reviews']`` column.
    id2word : object with a ``doc2bow(tokens)`` method, optional
        Token dictionary. Defaults to the notebook-level ``dictionary``.
    model : object supporting ``model[bow]``, optional
        Transformation applied to each bag-of-words vector. Defaults to
        the notebook-level ``tfidf_model``.

    Calling ``MyCorpusTfidf()`` with no arguments behaves as before.
    """

    def __init__(self, reviews=None, id2word=None, model=None):
        self.reviews = reviews
        self.id2word = id2word
        self.model = model

    def __iter__(self):
        # Fall back to the notebook globals at iteration time, so the
        # no-argument form sees their current values.
        reviews = train['clean_reviews'] if self.reviews is None else self.reviews
        id2word = dictionary if self.id2word is None else self.id2word
        model = tfidf_model if self.model is None else self.model
        for review in reviews:
            # split() with no arguments already ignores surrounding whitespace
            yield model[id2word.doc2bow(review.split())]

In [22]:
# Stream the TF-IDF weighted vectors to disk as a second MmCorpus file.
gensim.corpora.MmCorpus.serialize('data/movie/train_corpus_tfidf.mm', MyCorpusTfidf())

In [23]:
# Reopen the serialized TF-IDF corpus from disk.
corp_tfidf = gensim.corpora.MmCorpus('data/movie/train_corpus_tfidf.mm')

In [24]:
# Print the corpus summary (document, feature and non-zero entry counts).
print(corp_tfidf)

MmCorpus(25000 documents, 26079 features, 2404140 non-zero entries)


In [25]:
# Fit a second 50-topic LDA model, this time on the TF-IDF weighted corpus.
# NOTE(review): no seed is passed, so the topics presumably differ between runs — confirm.
lda_model_tfidf = gensim.models.LdaModel(corp_tfidf, id2word=dictionary, num_topics=50)

In [26]:
# Fit a second HDP model on the TF-IDF weighted corpus.
hdp_model_tfidf = gensim.models.HdpModel(corp_tfidf, id2word=dictionary)

In [27]:
# Show topics from the TF-IDF based LDA model for comparison with the bag-of-words one.
lda_model_tfidf.print_topics()

[(27,
  u'0.003*film + 0.002*version + 0.002*musical + 0.002*story + 0.002*great + 0.002*love + 0.002*well + 0.002*also + 0.002*young + 0.002*quite'),
 (49,
  u'0.011*vampire + 0.008*dick + 0.007*foxx + 0.007*hawn + 0.006*vampires + 0.006*goldie + 0.006*tube + 0.006*net + 0.006*tooth + 0.006*twelve'),
 (32,
  u'0.011*ustinov + 0.009*iran + 0.009*files + 0.007*btk + 0.006*hughes + 0.006*outta + 0.006*banter + 0.006*iranian + 0.006*shoulder + 0.006*davies'),
 (11,
  u'0.013*baseball + 0.008*tip + 0.008*fever + 0.006*ww + 0.005*crashing + 0.005*luise + 0.005*nah + 0.004*sox + 0.004*muni + 0.004*fishburne'),
 (38,
  u'0.011*homicide + 0.010*omen + 0.009*bonham + 0.009*carter + 0.008*aunts + 0.008*karen + 0.008*lukas + 0.008*travesty + 0.007*sarcasm + 0.007*animations'),
 (41,
  u'0.009*dont + 0.006*sutherland + 0.006*mafia + 0.005*elvira + 0.005*kathryn + 0.005*paxton + 0.005*campbell + 0.005*bat + 0.005*paris + 0.005*mankind'),
 (35,
  u'0.008*leon + 0.007*puerto + 0.007*spinal + 0.007*ta

In [28]:
# Show up to 50 topics from the TF-IDF based HDP model, 5 highest-weighted words each.
hdp_model_tfidf.print_topics(topics=50, topn=5)

[u'topic 0: 0.002*movie + 0.001*film + 0.001*bad + 0.001*one + 0.001*really',
 u'topic 1: 0.001*movie + 0.001*film + 0.001*bad + 0.001*good + 0.000*people',
 u'topic 2: 0.001*movie + 0.001*film + 0.000*bad + 0.000*arrow + 0.000*see',
 u'topic 3: 0.001*movie + 0.000*film + 0.000*like + 0.000*good + 0.000*us',
 u'topic 4: 0.000*movie + 0.000*film + 0.000*story + 0.000*rostov + 0.000*like',
 u'topic 5: 0.001*movie + 0.000*film + 0.000*little + 0.000*well + 0.000*throwing',
 u'topic 6: 0.000*movie + 0.000*film + 0.000*make + 0.000*like + 0.000*also',
 u'topic 7: 0.000*movie + 0.000*film + 0.000*eat + 0.000*good + 0.000*make',
 u'topic 8: 0.000*film + 0.000*panned + 0.000*layered + 0.000*movie + 0.000*heros',
 u'topic 9: 0.000*movie + 0.000*robinsons + 0.000*film + 0.000*crap + 0.000*slew',
 u'topic 10: 0.000*movie + 0.000*theodore + 0.000*bad + 0.000*film + 0.000*haskell',
 u'topic 11: 0.000*movie + 0.000*tinted + 0.000*lonette + 0.000*around + 0.000*d',
 u'topic 12: 0.000*yearning + 0.000