# Sentiment causal analysis of movie reviews (topic modeling)
## Based on 50,000 labeled IMDb movie reviews [dataset](http://ai.stanford.edu/~amaas/data/sentiment/)

# Import necessary dependencies

In [1]:
import pandas as pd
import numpy as np

# Python files with functions
import text_normalizer as tn
import model_evaluation_utils as meu
# ~Python files

# Print NumPy arrays compactly: two decimals, lines wrapped at 80 columns.
np.set_printoptions(linewidth=80, precision=2)

# Load, split (train, test) and normalize the data

In [3]:
# Work on a small slice of the corpus so the notebook runs quickly;
# the train and the test split take the same number of reviews.
train_size = test_size = 500

In [4]:
# Shuffle the rows before slicing. The fixed random_state makes the shuffle,
# and therefore the train/test split and every downstream output, repeatable
# across "Restart Kernel -> Run All".
dataset = (
    pd.read_csv('movie_reviews.csv')
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

reviews = np.array(dataset['review'])
sentiments = np.array(dataset['sentiment'])

# train sets: the first `train_size` shuffled rows
train_reviews = reviews[:train_size]
train_sentiments = sentiments[:train_size]

# test sets: the next `test_size` rows, disjoint from the train slice
test_reviews = reviews[train_size:train_size + test_size]
test_sentiments = sentiments[train_size:train_size + test_size]

# normalized reviews (normalization is delegated to the text_normalizer module)
norm_train_reviews = tn.normalize_corpus(train_reviews)
norm_test_reviews = tn.normalize_corpus(test_reviews)

# peek at data
print(dataset.head(1))

                                              review sentiment
0  There is so much not to like about this show i...  negative


# Extract features from positive and negative reviews

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# consolidate all normalized reviews and pair each one with the label of the
# same row (train slice followed by test slice)
norm_reviews = norm_train_reviews + norm_test_reviews
norm_sentiments = np.concatenate([train_sentiments, test_sentiments])
# a length mismatch would silently attach labels to the wrong reviews
assert len(norm_reviews) == len(norm_sentiments), \
    'normalized reviews and sentiment labels are misaligned'

# one shared TF-IDF configuration so both sentiment classes are vectorized
# with identical settings
tfidf_params = dict(
    min_df=0.05,
    max_df=0.95,
    ngram_range=(1, 1),
    sublinear_tf=True,
    use_idf=True,
    binary=False)

# get tfidf features for positive reviews
positive_reviews = [
    review for review, sentiment in zip(norm_reviews, norm_sentiments)
    if sentiment == 'positive'
]
ptvf = TfidfVectorizer(**tfidf_params)
ptvf_features = ptvf.fit_transform(positive_reviews)

# get tfidf features for negative reviews (separate vectorizer, fitted on the
# negative reviews only)
negative_reviews = [
    review for review, sentiment in zip(norm_reviews, norm_sentiments)
    if sentiment == 'negative'
]
ntvf = TfidfVectorizer(**tfidf_params)
ntvf_features = ntvf.fit_transform(negative_reviews)

print('Positive features shape: ', ptvf_features.shape,
      ' Negative features shape: ', ntvf_features.shape)

Positive features shape:  (498, 342)  Negative features shape:  (502, 337)


+ The `min_df=0.05` and `max_df=0.95` parameters drop terms that appear in fewer than 5% or in more than 95% of the reviews; this removes features that occur too rarely or too often and speeds up the topic modeling.

In [13]:
import pyLDAvis # for building interactive visualizations of topic models
import pyLDAvis.sklearn
from sklearn.decomposition import NMF
import topic_model_utils as tmu

# Number of topics requested from each NMF model built below.
total_topics = 10
# Switch pyLDAvis to notebook mode so prepared visualizations are displayed
# in the output cells.
pyLDAvis.enable_notebook()

## Display and visualize topics for positive reviews

In [66]:
def my_display_topics(weights_matrix, pos_feature_names, words=15):
    """Print the highest-weighted feature names of every topic.

    Parameters
    ----------
    weights_matrix : iterable of 1-D weight sequences
        One row of term weights per topic.
    pos_feature_names : array-like of str
        Feature names; flattened and paired positionally with each row's
        weights.
    words : int, default 15
        Maximum number of names printed per topic.

    Prints one NumPy string array per topic, names ordered by descending
    weight. Returns None.
    """
    vocabulary = np.array(pos_feature_names).flatten()
    for topic_weights in weights_matrix:
        # sorted() is stable, so equally weighted terms keep vocabulary order
        ranked = sorted(
            zip(topic_weights, vocabulary),
            key=lambda pair: pair[0],
            reverse=True)
        # stacking the (weight, name) pairs gives a 2-column array;
        # column 1 holds the names
        ranked_table = np.vstack(ranked)
        print(ranked_table[:words, 1])

In [58]:
# Factorize the positive-review TF-IDF matrix into `total_topics` topics.
# NOTE(review): `alpha=` on NMF and `get_feature_names()` belong to the older
# scikit-learn API this notebook was run with; newer releases appear to use
# `alpha_W`/`alpha_H` and `get_feature_names_out()` instead - confirm before
# upgrading the environment.
pos_nmf = NMF(
    n_components=total_topics,
    random_state=42,
    alpha=0.1,
    l1_ratio=0.2,
).fit(ptvf_features)

# topic-by-term weight matrix and the vocabulary of the positive vectorizer
pos_weights = pos_nmf.components_
pos_feature_names = ptvf.get_feature_names()

# collect each topic's terms with their weights via the project helper,
# then print 15 terms per topic without the weights
pos_topics = tmu.get_topics_terms_weights(pos_weights, pos_feature_names)
tmu.print_topics_udf(
    topics=pos_topics,
    total_topics=total_topics,
    num_terms=15,
    display_weights=False,
)
# Alternative: print the top terms with the notebook-local helper instead:
# my_display_topics(pos_weights, pos_feature_names)

Topic #1 without weights
['man', 'life', 'young', 'no', 'come', 'woman', 'know', 'take', 'become', 'one', 'family', 'get', 'kill', 'new', 'wife']

Topic #2 without weights
['movie', 'see', 'watch', 'good', 'not', 'go', 'like', 'time', 'want', 'think', 'get', 'never', 'one', 'year', 'first']

Topic #3 without weights
['role', 'play', 'performance', 'well', 'film', 'cast', 'good', 'excellent', 'give', 'actor', 'year', 'also', 'great', 'superb', 'work']

Topic #4 without weights
['series', 'episode', 'show', 'tv', 'watch', 'not', 'dvd', 'first', 'something', 'air', 'great', 'get', 'boy', 'everyone', 'could']

Topic #5 without weights
['funny', 'comedy', 'laugh', 'character', 'make', 'show', 'not', 'movie', 'great', 'good', 'support', 'enjoy', 'girl', 'script', 'lot']

Topic #6 without weights
['character', 'story', 'interesting', 'not', 'beautiful', 'wonderful', 'give', 'rather', 'something', 'relationship', 'viewer', 'lead', 'also', 'life', 'actor']

Topic #7 without weights
['love', 'fi

In [55]:
# Build the interactive pyLDAvis view of the positive-review topic model from
# the fitted NMF model, its TF-IDF matrix and the vectorizer that produced it.
# R=15 is forwarded to pyLDAvis - presumably the number of terms listed per
# topic; confirm against the pyLDAvis documentation.
pyLDAvis.sklearn.prepare(pos_nmf, ptvf_features, ptvf, R=15)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]
  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


## Display and visualize topics for negative reviews

In [64]:
# Factorize the negative-review TF-IDF matrix into `total_topics` topics.
# NOTE(review): `alpha=` on NMF and `get_feature_names()` belong to the older
# scikit-learn API this notebook was run with; newer releases appear to use
# `alpha_W`/`alpha_H` and `get_feature_names_out()` instead - confirm before
# upgrading the environment.
neg_nmf = NMF(
    n_components=total_topics,
    random_state=42,
    alpha=0.1,
    l1_ratio=0.2,
).fit(ntvf_features)

# topic-by-term weight matrix and the vocabulary of the negative vectorizer
neg_weights = neg_nmf.components_
neg_feature_names = ntvf.get_feature_names()

# collect each topic's terms with their weights via the project helper,
# then print 15 terms per topic without the weights
neg_topics = tmu.get_topics_terms_weights(neg_weights, neg_feature_names)
tmu.print_topics_udf(
    topics=neg_topics,
    total_topics=total_topics,
    num_terms=15,
    display_weights=False,
)
# Alternative: print the top terms with the notebook-local helper instead:
# my_display_topics(neg_weights, neg_feature_names)

Topic #1 without weights
['film', 'may', 'minute', 'though', 'two', 'come', 'lead', 'well', 'one', 'time', 'start', 'around', 'man', 'director', 'take']

Topic #2 without weights
['movie', 'bad', 'could', 'watch', 'good', 'not', 'see', 'say', 'ever', 'first', 'make', 'thing', 'even', 'think', 'get']

Topic #3 without weights
['show', 'kid', 'back', 'guy', 'get', 'go', 'watch', 'old', 'think', 'day', 'want', 'look', 'use', 'really', 'like']

Topic #4 without weights
['life', 'story', 'make', 'man', 'movie', 'not', 'love', 'many', 'hour', 'time', 'get', 'family', 'know', 'no', 'like']

Topic #5 without weights
['character', 'really', 'seem', 'like', 'lot', 'not', 'care', 'version', 'scene', 'people', 'main', 'much', 'find', 'also', 'lack']

Topic #6 without weights
['original', 'series', 'sequel', 'story', 'nothing', 'fan', 'part', 'credit', 'never', 'already', 'good', 'still', 'even', 'line', 'least']

Topic #7 without weights
['comedy', 'joke', 'funny', 'laugh', 'bad', 'not', 'woman', 

In [63]:
# Build the interactive pyLDAvis view of the negative-review topic model from
# the fitted NMF model, its TF-IDF matrix and the vectorizer that produced it.
# R=15 is forwarded to pyLDAvis - presumably the number of terms listed per
# topic; confirm against the pyLDAvis documentation.
pyLDAvis.sklearn.prepare(neg_nmf, ntvf_features, ntvf, R=15)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]
  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))
