In [2]:
import pandas as pd
import numpy as np

import re
import string  ## Punctuation Removal Library

import gensim
from gensim import corpora

##Libraries for Visualization
import seaborn as sns
import matplotlib.pyplot as plt 

In [3]:
link = 'C:\\Python Software\\Sentiment Analysis'

data = pd.read_csv(link + '\\' + 'Train review within textblob.csv')
data.head()

Unnamed: 0,data,target
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1


# Explanation for clean_text function

In [4]:
delete_dict = {sp_character: '' for sp_character in string.punctuation}
delete_dict[' '] = ' '

table = str.maketrans(delete_dict) ##Transfer punctations into the numeric value
text1 = str(data['data']).translate(table)

textArr= text1.split()
text2 = ' '.join([w for w in textArr if ( not w.isdigit() and  ( not w.isdigit() and len(w)>3))]) 

print(delete_dict)
print(table)
print(text1)
print(textArr)
print(text2.lower())

{'!': '', '"': '', '#': '', '$': '', '%': '', '&': '', "'": '', '(': '', ')': '', '*': '', '+': '', ',': '', '-': '', '.': '', '/': '', ':': '', ';': '', '<': '', '=': '', '>': '', '?': '', '@': '', '[': '', '\\': '', ']': '', '^': '', '_': '', '`': '', '{': '', '|': '', '}': '', '~': '', ' ': ' '}
{33: '', 34: '', 35: '', 36: '', 37: '', 38: '', 39: '', 40: '', 41: '', 42: '', 43: '', 44: '', 45: '', 46: '', 47: '', 58: '', 59: '', 60: '', 61: '', 62: '', 63: '', 64: '', 91: '', 92: '', 93: '', 94: '', 95: '', 96: '', 123: '', 124: '', 125: '', 126: '', 32: ' '}
0        Bromwell High is a cartoon comedy It ran at t
1        Homelessness or Houselessness as George Carli
2        Brilliant overacting by Lesley Ann Warren Be
3        This is easily the most underrated film inn th
4        This is not the typical Mel Brooks film It wa
                                                       
24995    Towards the end of the movie I felt it was to
24996    This is the kind of movie that my e

In [5]:
def clean_text(text): 
    delete_dict = {sp_character: '' for sp_character in string.punctuation} 
    delete_dict[' '] = ' ' 
    table = str.maketrans(delete_dict)
    text1 = text.translate(table)
    #print('cleaned:'+text1)
    textArr= text1.split()
    text2 = ' '.join([w for w in textArr if ( not w.isdigit() and  ( not w.isdigit() and len(w)>3))]) 
    
    return text2.lower()

In [6]:
data['data'] = data['data'].apply(clean_text)

data.data

0        bromwell high cartoon comedy same time some ot...
1        homelessness houselessness george carlin state...
2        brilliant overacting lesley warren best dramat...
3        this easily most underrated film brooks cannon...
4        this typical brooks film much less slapstick t...
                               ...                        
24995    towards movie felt technical felt like classro...
24996    this kind movie that enemies content watch tim...
24997    descent last night stockholm film festival hug...
24998    some films that pick pound turn rather good 23...
24999    this dumbest films ever seen rips nearly ever ...
Name: data, Length: 25000, dtype: object

In [7]:
from nltk.corpus import stopwords

Stopwords = set(stopwords.words('english'))

def remove_stopwords(text):
    text1 = text.split(' ')
    text2 = ' '.join([word for word in text1 if word not in Stopwords])
    return text2

data['data'] = data['data'].apply(remove_stopwords)
data['data']

0        bromwell high cartoon comedy time programs sch...
1        homelessness houselessness george carlin state...
2        brilliant overacting lesley warren best dramat...
3        easily underrated film brooks cannon sure flaw...
4        typical brooks film much less slapstick movies...
                               ...                        
24995    towards movie felt technical felt like classro...
24996    kind movie enemies content watch time bloody t...
24997    descent last night stockholm film festival hug...
24998    films pick pound turn rather good 23rd century...
24999    dumbest films ever seen rips nearly ever type ...
Name: data, Length: 25000, dtype: object

In [8]:
import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatization(texts,allowed_postags=['NOUN', 'ADJ']): 
    output = []
    for sent in texts:
        doc = nlp(sent) 
        output.append([token.lemma_ for token in doc if token.pos_ in allowed_postags ])
    return output

In [9]:
text_list = data['data'].tolist()
print(text_list[1])
tokenized_reviews = lemmatization(text_list)
print(tokenized_reviews[1])

homelessness houselessness george carlin stated issue years never plan help street considered human everything going school work vote matter people think homeless lost cause worrying things racism iraq pressuring kids succeed technology elections inflation worrying theyll next streetsbr given live streets month without luxuries home entertainment sets bathroom pictures wall computer everything treasure like homeless goddard bolts lessonbr brooks directs stars bolt plays rich everything world deciding make sissy rival jeffery tambor live streets thirty days without luxuries bolt succeeds wants future project making buildings bets bolt thrown street bracelet monitor every move cant step sidewalk given nickname pepto vagrant written forehead bolt meets characters including woman name molly lesley warren exdancer divorce losing home pals sailor howard morris fumes teddy wilson already used streets theyre survivors bolt isnt used reaching mutual agreements like rich fight flight kill killed

In [10]:
dictionary = corpora.Dictionary(tokenized_reviews) ## Mapping between words and their position IDs, formatting is dictionary 
doc_term_matrix = [dictionary.doc2bow(rev) for rev in tokenized_reviews] # Convert document into the bag-of-words (BoW) format = list of (token_id, token_count) tuples

In [11]:
##Create object for LDA using gensim library

LDA = gensim.models.ldamodel.LdaModel

#Build LDA Model
lda_model = LDA(corpus = doc_term_matrix, id2word = dictionary, num_topics = 10, random_state = 10,
               chunksize = 1000, passes = 50, iterations = 100)


In [15]:
lda_model.get_topic_terms(topicid = 7) ##Given topic ID, the top word ID and word proba associated with this topic

[(6234, 0.02848127),
 (6242, 0.027664414),
 (11543, 0.019096201),
 (10002, 0.012158086),
 (9998, 0.008064832),
 (5194, 0.006314107),
 (3493, 0.006216721),
 (5378, 0.0058359713),
 (9996, 0.0055251056),
 (3451, 0.0054822913)]

In [17]:
lda_model.show_topic(topicid = 7) ##Given topic ID, the top word and word proba assocaited with this topic

[('dragon', 0.02848127),
 ('rubbish', 0.027664414),
 ('dude', 0.019096201),
 ('psychotic', 0.012158086),
 ('helen', 0.008064832),
 ('wanna', 0.006314107),
 ('joey', 0.006216721),
 ('dungeon', 0.0058359713),
 ('fishing', 0.0055251056),
 ('intentional', 0.0054822913)]

In [19]:
lda_model.show_topics(num_topics = 7, num_words = 10)

[(7,
  '0.028*"dragon" + 0.028*"rubbish" + 0.019*"dude" + 0.012*"psychotic" + 0.008*"helen" + 0.006*"wanna" + 0.006*"joey" + 0.006*"dungeon" + 0.006*"fishing" + 0.005*"intentional"'),
 (3,
  '0.018*"cage" + 0.017*"sean" + 0.016*"jerry" + 0.014*"forgettable" + 0.009*"eddie" + 0.009*"drivel" + 0.009*"murphy" + 0.007*"photographer" + 0.007*"flop" + 0.006*"civil"'),
 (5,
  '0.060*"film" + 0.013*"character" + 0.011*"story" + 0.007*"time" + 0.007*"scene" + 0.007*"good" + 0.006*"director" + 0.006*"life" + 0.005*"many" + 0.005*"people"'),
 (4,
  '0.030*"series" + 0.027*"show" + 0.022*"episode" + 0.013*"action" + 0.008*"alien" + 0.008*"original" + 0.008*"star" + 0.007*"season" + 0.006*"soldier" + 0.006*"cartoon"'),
 (6,
  '0.020*"moore" + 0.014*"brook" + 0.011*"logical" + 0.011*"leslie" + 0.010*"clothing" + 0.007*"senior" + 0.006*"policy" + 0.006*"brit" + 0.006*"moneybr" + 0.005*"lunch"'),
 (2,
  '0.014*"voiceover" + 0.009*"camcorder" + 0.008*"greek" + 0.007*"tail" + 0.007*"duck" + 0.006*"file"

In [11]:
lda_model.print_topics()

[(0,
  '0.095*"movie" + 0.024*"good" + 0.018*"time" + 0.013*"film" + 0.013*"thing" + 0.012*"people" + 0.011*"character" + 0.010*"scene" + 0.010*"plot" + 0.010*"bad"'),
 (1,
  '0.038*"musical" + 0.025*"version" + 0.020*"dance" + 0.020*"song" + 0.019*"jane" + 0.017*"number" + 0.014*"ugly" + 0.014*"martial" + 0.013*"art" + 0.008*"tarzan"'),
 (2,
  '0.014*"voiceover" + 0.009*"camcorder" + 0.008*"greek" + 0.007*"tail" + 0.007*"duck" + 0.006*"file" + 0.005*"witchcraft" + 0.005*"marilyn" + 0.005*"surfing" + 0.005*"banter"'),
 (3,
  '0.018*"cage" + 0.017*"sean" + 0.016*"jerry" + 0.014*"forgettable" + 0.009*"eddie" + 0.009*"drivel" + 0.009*"murphy" + 0.007*"photographer" + 0.007*"flop" + 0.006*"civil"'),
 (4,
  '0.030*"series" + 0.027*"show" + 0.022*"episode" + 0.013*"action" + 0.008*"alien" + 0.008*"original" + 0.008*"star" + 0.007*"season" + 0.006*"soldier" + 0.006*"cartoon"'),
 (5,
  '0.060*"film" + 0.013*"character" + 0.011*"story" + 0.007*"time" + 0.007*"scene" + 0.007*"good" + 0.006*"dire

In [12]:
## Measure how good the model is
print('\nPerplexity: ', lda_model.log_perplexity(doc_term_matrix,total_docs=10000))  # a measure of how good the model is. lower the better.


Perplexity:  -9.706454113010418


In [1]:
#Save the model
import pickle
model = 'LDA_10_topics.pkl'

with open(model, 'wb') as f:  
    pickle.dump(lda_model, f)

NameError: name 'lda_model' is not defined

In [20]:
import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, doc_term_matrix, dictionary)
vis