# TF-IDF and Cosine Similarity
Using the movie reviews dataset from Python's nltk package

In [1]:
import nltk
nltk.download('movie_reviews')
nltk.download('punkt')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Paula\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Paula\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [2]:
from nltk.corpus import movie_reviews
print(movie_reviews.readme())

Sentiment Polarity Dataset Version 2.0
Bo Pang and Lillian Lee

http://www.cs.cornell.edu/people/pabo/movie-review-data/

Distributed with NLTK with permission from the authors.


Introduction

This README v2.0 (June, 2004) for the v2.0 polarity dataset comes from
the URL http://www.cs.cornell.edu/people/pabo/movie-review-data .


What's New -- June, 2004

This dataset represents an enhancement of the review corpus v1.0
described in README v1.1: it contains more reviews, and labels were
created with an improved rating-extraction system.


Citation Info 

This data was first used in Bo Pang and Lillian Lee,
``A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization 
Based on Minimum Cuts'',  Proceedings of the ACL, 2004.

@InProceedings{Pang+Lee:04a,
  author =       {Bo Pang and Lillian Lee},
  title =        {A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization Based on Minimum Cuts},
  booktitle =    "Proceedings of the ACL",
  year =      

In [4]:
# Store reviews in an array
reviews=[]

for fileid in movie_reviews.fileids():
    reviews.append((movie_reviews.words(fileid)))
    
# Display the raw text of the first review
reviews[0]

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

**Pre-processing: lower case, remove stop words and stemming**

In [5]:
# Transform document words to lower case
for i in range(len(reviews)):
    reviews[i] = [word.lower() for word in reviews[i]]
    
reviews[0]

['plot',
 ':',
 'two',
 'teen',
 'couples',
 'go',
 'to',
 'a',
 'church',
 'party',
 ',',
 'drink',
 'and',
 'then',
 'drive',
 '.',
 'they',
 'get',
 'into',
 'an',
 'accident',
 '.',
 'one',
 'of',
 'the',
 'guys',
 'dies',
 ',',
 'but',
 'his',
 'girlfriend',
 'continues',
 'to',
 'see',
 'him',
 'in',
 'her',
 'life',
 ',',
 'and',
 'has',
 'nightmares',
 '.',
 'what',
 "'",
 's',
 'the',
 'deal',
 '?',
 'watch',
 'the',
 'movie',
 'and',
 '"',
 'sorta',
 '"',
 'find',
 'out',
 '.',
 '.',
 '.',
 'critique',
 ':',
 'a',
 'mind',
 '-',
 'fuck',
 'movie',
 'for',
 'the',
 'teen',
 'generation',
 'that',
 'touches',
 'on',
 'a',
 'very',
 'cool',
 'idea',
 ',',
 'but',
 'presents',
 'it',
 'in',
 'a',
 'very',
 'bad',
 'package',
 '.',
 'which',
 'is',
 'what',
 'makes',
 'this',
 'review',
 'an',
 'even',
 'harder',
 'one',
 'to',
 'write',
 ',',
 'since',
 'i',
 'generally',
 'applaud',
 'films',
 'which',
 'attempt',
 'to',
 'break',
 'the',
 'mold',
 ',',
 'mess',
 'with',
 'your'

In [6]:
import string
# Remove punctuation from documents
# Create a new list of words by removing punctuation from reviews
for i in range(len(reviews)):
    reviews[i] = [word for word in reviews[i] if word not in string.punctuation]
    
reviews[0]

['plot',
 'two',
 'teen',
 'couples',
 'go',
 'to',
 'a',
 'church',
 'party',
 'drink',
 'and',
 'then',
 'drive',
 'they',
 'get',
 'into',
 'an',
 'accident',
 'one',
 'of',
 'the',
 'guys',
 'dies',
 'but',
 'his',
 'girlfriend',
 'continues',
 'to',
 'see',
 'him',
 'in',
 'her',
 'life',
 'and',
 'has',
 'nightmares',
 'what',
 's',
 'the',
 'deal',
 'watch',
 'the',
 'movie',
 'and',
 'sorta',
 'find',
 'out',
 'critique',
 'a',
 'mind',
 'fuck',
 'movie',
 'for',
 'the',
 'teen',
 'generation',
 'that',
 'touches',
 'on',
 'a',
 'very',
 'cool',
 'idea',
 'but',
 'presents',
 'it',
 'in',
 'a',
 'very',
 'bad',
 'package',
 'which',
 'is',
 'what',
 'makes',
 'this',
 'review',
 'an',
 'even',
 'harder',
 'one',
 'to',
 'write',
 'since',
 'i',
 'generally',
 'applaud',
 'films',
 'which',
 'attempt',
 'to',
 'break',
 'the',
 'mold',
 'mess',
 'with',
 'your',
 'head',
 'and',
 'such',
 'lost',
 'highway',
 'memento',
 'but',
 'there',
 'are',
 'good',
 'and',
 'bad',
 'ways',

In [7]:
from nltk.corpus import stopwords
nltk.download("stopwords")
# Remove stopwords from the corpus
stopwords_english = stopwords.words('english')
 
# Create a new list of words by removing stopwords from all_words
for i in range(len(reviews)):
    reviews[i] = [word for word in reviews[i] if word not in stopwords_english]
    
reviews[0] 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Paula\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


['plot',
 'two',
 'teen',
 'couples',
 'go',
 'church',
 'party',
 'drink',
 'drive',
 'get',
 'accident',
 'one',
 'guys',
 'dies',
 'girlfriend',
 'continues',
 'see',
 'life',
 'nightmares',
 'deal',
 'watch',
 'movie',
 'sorta',
 'find',
 'critique',
 'mind',
 'fuck',
 'movie',
 'teen',
 'generation',
 'touches',
 'cool',
 'idea',
 'presents',
 'bad',
 'package',
 'makes',
 'review',
 'even',
 'harder',
 'one',
 'write',
 'since',
 'generally',
 'applaud',
 'films',
 'attempt',
 'break',
 'mold',
 'mess',
 'head',
 'lost',
 'highway',
 'memento',
 'good',
 'bad',
 'ways',
 'making',
 'types',
 'films',
 'folks',
 'snag',
 'one',
 'correctly',
 'seem',
 'taken',
 'pretty',
 'neat',
 'concept',
 'executed',
 'terribly',
 'problems',
 'movie',
 'well',
 'main',
 'problem',
 'simply',
 'jumbled',
 'starts',
 'normal',
 'downshifts',
 'fantasy',
 'world',
 'audience',
 'member',
 'idea',
 'going',
 'dreams',
 'characters',
 'coming',
 'back',
 'dead',
 'others',
 'look',
 'like',
 'dead

In [8]:
# Remove numbers from the corpus
for i in range(len(reviews)):
    reviews[i] = [word for word in reviews[i] if not word.isdigit()]
    
reviews[0] 

['plot',
 'two',
 'teen',
 'couples',
 'go',
 'church',
 'party',
 'drink',
 'drive',
 'get',
 'accident',
 'one',
 'guys',
 'dies',
 'girlfriend',
 'continues',
 'see',
 'life',
 'nightmares',
 'deal',
 'watch',
 'movie',
 'sorta',
 'find',
 'critique',
 'mind',
 'fuck',
 'movie',
 'teen',
 'generation',
 'touches',
 'cool',
 'idea',
 'presents',
 'bad',
 'package',
 'makes',
 'review',
 'even',
 'harder',
 'one',
 'write',
 'since',
 'generally',
 'applaud',
 'films',
 'attempt',
 'break',
 'mold',
 'mess',
 'head',
 'lost',
 'highway',
 'memento',
 'good',
 'bad',
 'ways',
 'making',
 'types',
 'films',
 'folks',
 'snag',
 'one',
 'correctly',
 'seem',
 'taken',
 'pretty',
 'neat',
 'concept',
 'executed',
 'terribly',
 'problems',
 'movie',
 'well',
 'main',
 'problem',
 'simply',
 'jumbled',
 'starts',
 'normal',
 'downshifts',
 'fantasy',
 'world',
 'audience',
 'member',
 'idea',
 'going',
 'dreams',
 'characters',
 'coming',
 'back',
 'dead',
 'others',
 'look',
 'like',
 'dead

In [9]:
# Prepare document set for stemming
reviews = [' '.join(i) for i in reviews]

In [10]:
# Define function for tokenizing documents
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = []
    for item in tokens:
        stems.append(nltk.PorterStemmer().stem(item))
    return stems

**TF-IDF**

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Build TF-IDF matrix
tfidf = TfidfVectorizer(tokenizer=tokenize)
movie_tfidf = tfidf.fit_transform(reviews)

In [12]:
from scipy.sparse import find
# Examine non-zero entries in TF-IDF matrix
row_inds, col_inds, vals = find(movie_tfidf[1:6,1:10])
vals

array([], dtype=float64)

**Cosine Similarity**

In [13]:
from scipy.spatial.distance import cosine
import numpy as np
# Calculate pair-wise cosine similarities between first 200 documents
movie_cosine = np.empty((200, 200))
for ii in range(0, movie_cosine.shape[0]):
    for jj in range(ii, movie_cosine.shape[0]):
        movie_cosine[ii, jj] = cosine(movie_tfidf.getcol(ii).toarray(), 
                                      movie_tfidf.getcol(jj).toarray())
        movie_cosine[jj, ii] = movie_cosine[ii, jj]

print(movie_cosine[1:6,1:6])

[[0.         0.8922739  1.         1.         1.        ]
 [0.8922739  0.         0.19796248 0.19796248 0.49547518]
 [1.         0.19796248 0.         0.         0.65378604]
 [1.         0.19796248 0.         0.         0.65378604]
 [1.         0.49547518 0.65378604 0.65378604 0.        ]]
