# 1. Import and read dataset

In [1]:
# Import modules
import numpy as np
import pandas as pd
import nltk

# Fix NumPy's global random seed (same motivation as before: reproducibility)
np.random.seed(5)

# Load a 100-row slice of the Wikipedia movie-plots CSV: file line 0 (the
# header) is kept, file lines 1-9999 are skipped, and the next 100 rows are read
rows_to_skip = range(1, 10000)
movies_df = pd.read_csv(
    'datasets/wiki_movie_plots_deduped.csv',
    skiprows=rows_to_skip,
    nrows=100,
)

# Report how many rows were actually read
movie_count = len(movies_df)
print("Number of movies loaded: %s " % (movie_count))

# Preview the first few rows
movies_df.head()

Number of movies loaded: 100 


Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1982,Parasite,American,Charles Band,"Demi Moore, Robert Glaudini",horror,https://en.wikipedia.org/wiki/Parasite_(film),"In the near future, an atomic disaster has red..."
1,1982,Partners,American,James Burrows,"John Hurt, Ryan O'Neal",comedy,https://en.wikipedia.org/wiki/Partners_(1982_f...,After a series of murders in Los Angeles's gay...
2,1982,Personal Best,American,Robert Towne,"Mariel Hemingway, Scott Glenn, Patrice Donnelly","drama, sports",https://en.wikipedia.org/wiki/Personal_Best_(f...,Chris Cahill is a young athlete who competes u...
3,1982,Poltergeist,American,Tobe Hooper,"Craig T. Nelson, Beatrice Straight, Dominique ...",horror,https://en.wikipedia.org/wiki/Poltergeist_(198...,Steven and Diane Freeling live a quiet life in...
4,1982,Porky's,American,Bob Clark,"Kim Cattrall, Mark Herrier, Wyatt Knight",comedy,https://en.wikipedia.org/wiki/Porky%27s,A group of Florida high school students plan o...


# 2. Tokenization

In [2]:
import nltk

user_input = "In Antarctica in 1982, a helicopter pursues a sled dog to an American research station, firing at the dog and dropping grenades without success. The helicopter lands and the researchers witness one of the two occupants accidentally blow up the helicopter and himself with a grenade. The remaining man shoots at the dog and shouts at the Americans in Norwegian, but they are unable to understand him. He is shot dead in self-defense by station commander Garry. The American helicopter pilot, R.J. MacReady, and Dr. Copper leave to investigate the Norwegian base. Among the charred ruins and frozen corpses, they find the burnt corpse of a malformed humanoid, which they transfer to the American station. Their biologist, Blair, performs an autopsy on the remains and finds a normal set of human organs."

# Split the paragraph into a list of sentences
sent_tokenized = list(nltk.sent_tokenize(user_input))

# Split only the first sentence into word-level tokens
words_tokenized = list(nltk.word_tokenize(sent_tokenized[0]))

# The regex module is needed for the letter filter below
import re

# Keep a token only if it contains at least one ASCII letter; this drops
# pure punctuation and purely numeric tokens
filtered = list(filter(lambda token: re.search('[a-zA-Z]', token), words_tokenized))

# Show the surviving tokens
filtered

['In',
 'Antarctica',
 'in',
 'a',
 'helicopter',
 'pursues',
 'a',
 'sled',
 'dog',
 'to',
 'an',
 'American',
 'research',
 'station',
 'firing',
 'at',
 'the',
 'dog',
 'and',
 'dropping',
 'grenades',
 'without',
 'success']

# 3. Stemming

In [4]:
# Bring in NLTK's Snowball stemmer implementation
from nltk.stem.snowball import SnowballStemmer

# Build a Snowball stemmer configured for English
stemmer = SnowballStemmer("english")

# Show the tokens as they are before stemming
print("Without stemming: ", filtered)

# Reduce every token in filtered to its stem, keeping the original order
stemmed_words = list(map(stemmer.stem, filtered))

# Show the same tokens after stemming for comparison
print("After stemming:   ", stemmed_words)

Without stemming:  ['In', 'Antarctica', 'in', 'a', 'helicopter', 'pursues', 'a', 'sled', 'dog', 'to', 'an', 'American', 'research', 'station', 'firing', 'at', 'the', 'dog', 'and', 'dropping', 'grenades', 'without', 'success']
After stemming:    ['in', 'antarctica', 'in', 'a', 'helicopt', 'pursu', 'a', 'sled', 'dog', 'to', 'an', 'american', 'research', 'station', 'fire', 'at', 'the', 'dog', 'and', 'drop', 'grenad', 'without', 'success']


# 4. Tokenize and Stem together

In [5]:
# Define a function to perform both stemming and tokenization
def tokenize_and_stem(text):
    """Tokenize ``text`` into words and return the list of their stems.

    The text is split into sentences and then into word tokens with NLTK.
    Tokens that contain no ASCII letter are discarded, and each remaining
    token is stemmed with the module-level ``stemmer`` object.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    list
        Stems of the kept tokens, in their original order of appearance.
    """
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sentence):
            # Skip tokens without any letter (punctuation, bare numbers)
            if not re.search('[a-zA-Z]', token):
                continue
            stems.append(stemmer.stem(token))
    return stems

# Hard-wrapped copy of the same synopsis text that is stored in `user_input`.
# NOTE(review): nothing in this cell reads `user_input_copy`; the call below
# passes `user_input` instead — confirm whether this copy is still needed.
user_input_copy = """In Antarctica in 1982, a helicopter pursues a sled dog to an American research station, firing at the dog and 
dropping grenades without success. The helicopter lands and the researchers witness one of the two occupants accidentally blow 
up the helicopter and himself with a grenade. The remaining man shoots at the dog and shouts at the Americans in Norwegian, 
but they are unable to understand him. He is shot dead in self-defense by station commander Garry. The American helicopter 
pilot, R.J. MacReady, and Dr. Copper leave to investigate the Norwegian base. Among the charred ruins and frozen corpses, 
they find the burnt corpse of a malformed humanoid, which they transfer to the American station. Their biologist, Blair, 
performs an autopsy on the remains and finds a normal set of human organs."""

# Run the whole paragraph through the sentence -> word -> letter-filter -> stem pipeline
words_stemmed = tokenize_and_stem(user_input)
# Show the resulting list of stems
print(words_stemmed)

['in', 'antarctica', 'in', 'a', 'helicopt', 'pursu', 'a', 'sled', 'dog', 'to', 'an', 'american', 'research', 'station', 'fire', 'at', 'the', 'dog', 'and', 'drop', 'grenad', 'without', 'success', 'the', 'helicopt', 'land', 'and', 'the', 'research', 'wit', 'one', 'of', 'the', 'two', 'occup', 'accident', 'blow', 'up', 'the', 'helicopt', 'and', 'himself', 'with', 'a', 'grenad', 'the', 'remain', 'man', 'shoot', 'at', 'the', 'dog', 'and', 'shout', 'at', 'the', 'american', 'in', 'norwegian', 'but', 'they', 'are', 'unabl', 'to', 'understand', 'him', 'he', 'is', 'shot', 'dead', 'in', 'self-defens', 'by', 'station', 'command', 'garri', 'the', 'american', 'helicopt', 'pilot', 'r.j.', 'macreadi', 'and', 'dr.', 'copper', 'leav', 'to', 'investig', 'the', 'norwegian', 'base', 'among', 'the', 'char', 'ruin', 'and', 'frozen', 'corps', 'they', 'find', 'the', 'burnt', 'corps', 'of', 'a', 'malform', 'humanoid', 'which', 'they', 'transfer', 'to', 'the', 'american', 'station', 'their', 'biologist', 'blair',

# 5. Create TfidfVectorizer

In [6]:
# Import TfidfVectorizer to create TF-IDF vectors
from sklearn.feature_extraction.text import TfidfVectorizer

# Configure the TF-IDF vectorizer that converts plot text into weighted term features.
# NOTE(review): the built-in English stop-word list is not stemmed, while the
# custom tokenizer emits stems; scikit-learn may warn about this mismatch — confirm.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=tokenize_and_stem,  # custom sentence/word tokenizer that also stems
    stop_words='english',         # drop scikit-learn's built-in English stop words
    ngram_range=(1,3),            # build unigrams, bigrams and trigrams
    min_df=0.2,                   # ignore terms found in fewer than 20% of documents
    max_df=0.8,                   # ignore terms found in more than 80% of documents
    max_features=200000,          # upper bound on vocabulary size
    use_idf=True,                 # apply inverse-document-frequency weighting
)

# 6. Fit and transform with TfidfVectorizer

In [7]:
# Collect the text of the "Plot" column, one entry per movie
plot_texts = list(movies_df["Plot"])

# Learn the vocabulary from the plots and encode each plot as a TF-IDF row vector
tfidf_matrix = tfidf_vectorizer.fit_transform(plot_texts)

# Matrix dimensions: (number of plots, number of retained terms)
print(tfidf_matrix.shape)



(100, 89)


# 7. Import KMeans and create clusters



1    39
2    24
4    20
3    11
0     6
Name: cluster, dtype: int64