In [1]:
import pandas as pd
import numpy as np
import nltk
import os
import re
import contractions
import emoji
import json
import time

from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# For tokenization
nltk.download('punkt')

# For removing stopwords
nltk.download('stopwords')

# For lemmatization
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
"""
This is a dataset which contains information of the top 100 most popular animation movie based on the IMDB movie rating website

This is a dataset scraped from IMDB movie rating website for features such as:
    -> Movie name
    -> Year of screening
    -> Genre
    -> Director
    -> Star

The dataset is combined with another feature scraped from Wikipedia which is the:
    -> Plot summary

"""

# Read dataset CSV and print first 5 examples

base_wd = os.getcwd()

csv_path = os.path.join("data", "movieData.csv")

df = pd.read_csv(csv_path)

print(df.head(5))

   No                   Movie Name  Year  \
0   1                Inside Out: 2  2024   
1   2                Incredibles 2  2018   
2   3  The Super Mario Bros. Movie  2023   
3   4                The Lion King  2019   
4   5                 Finding Dory  2016   

                                               Genre  \
0  Animation, Adventure, Comedy, Drama, Family, F...   
1                       Animation, Action, Adventure   
2                       Animation, Adventure, Comedy   
3                        Animation, Adventure, Drama   
4                       Animation, Adventure, Comedy   

                                        Plot Summary  \
0  Two years after her move to San Francisco,[d] ...   
1  The Parr family (alias The Incredibles) and Lu...   
2  Italian-American brothers Mario and Luigi oper...   
3  In the Pride Lands of Tanzania, a pride of lio...   
4  Dory, the regal blue tang, gets separated from...   

                                       Director  \
0         

In [4]:
print(f"Number of movies: {len(df)}\n")

print("Example movie:")
print(df.iloc[0])

Number of movies: 100

Example movie:
No                                                              1
Movie Name                                          Inside Out: 2
Year                                                         2024
Genre           Animation, Adventure, Comedy, Drama, Family, F...
Plot Summary    Two years after her move to San Francisco,[d] ...
Director                                              Kelsey Mann
Star                  Amy Poehler, Maya Hawke, Kensington Tallman
Name: 0, dtype: object


In [5]:
## Remove Movie No which is irrelevant for data

df = df.drop('No', axis='columns')

In [6]:
## Check for any missing values in the dataset

def nan_counter(feature_name: str, df: pd.DataFrame):
    nan_count = df[feature_name].isnull().sum()

    print(f"Missing values in {feature_name}: {nan_count}")

nan_counter("Movie Name", df)
nan_counter("Year", df)
nan_counter("Genre", df)
nan_counter("Plot Summary", df)
nan_counter("Director", df)
nan_counter("Star", df)

Missing values in Movie Name: 0
Missing values in Year: 0
Missing values in Genre: 0
Missing values in Plot Summary: 0
Missing values in Director: 0
Missing values in Star: 0


In [7]:
## Creating a short description of dataset

print(df.describe())

              Year
count   100.000000
mean   2009.920000
std      12.595237
min    1937.000000
25%    2006.000000
50%    2012.000000
75%    2017.000000
max    2024.000000


In [8]:
## Remove newline characters in the dataset

df = df.replace('\n',' ', regex=True)

In [9]:
## Save new dataframe as csv

base_wd = os.getcwd()

df_csv_path = os.path.join(base_wd, "data", "cleanedAnimation.csv")

df.to_csv(df_csv_path)

In [10]:
## Create a huge array to store BagOfWords of each movie information

df_str = []

for i in range(len(df)):
    temp_str = ""
    for j in range(1, len(df.iloc[0])): # Exclude the name of movies to be included
         temp_str = temp_str + str(df.iloc[i][j]) + " "
    
    df_str.append(temp_str)

print(f"\nNumber of movies: {len(df_str)}")
print(f"\nExample of movie information string: \n{df_str[0]}")


Number of movies: 100

Example of movie information string: 
2024 Animation, Adventure, Comedy, Drama, Family, Fantasy Two years after her move to San Francisco,[d] 13-year-old Riley Andersen is entering high school. Her personified emotions Joy, Sadness, Fear, Disgust, and Anger now oversee a newly formed element of Riley's mind called her "Sense of Self", which houses memories and feelings that shape Riley's beliefs. Joy, aiming to fill the Sense of Self with only good memories, has created a mechanism that launches bad memories to the back of Riley's mind. Riley and her best friends, Bree and Grace, are invited to a three-day ice hockey camp, where Riley hopes to qualify for her new school's team, the Fire Hawks. In Headquarters, a "puberty alarm" from the previous movie goes off the night before camp, and several mind workers abruptly upgrade the emotion console, leaving Headquarters in disarray. The emotions find that Riley now overreacts to any inputs they make to the console. F

  temp_str = temp_str + str(df.iloc[i][j]) + " "


In [11]:
## Create preprocess pipeline for the text strings per movie

def preprocess_text(text_str: str):

    """
    Remove characters that are not characters of 
        --> a - z
        --> A - Z
        --> 0 - 9
    """

    clean_text = re.sub(r'[^a-zA-Z0-9\s]', '', text_str) 

    # Turns all uppercase alphabets to lowercase
    clean_text = clean_text.lower()

    # Tokenize string of text into individual units
    tokenized_text = word_tokenize(clean_text)

    # Remove stopwords which provide little to none useful information
    stop_words = set(stopwords.words('english'))
    
    filtered_text = [token for token in tokenized_text if token not in stop_words]

    # Lemmatization of tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = []

    for token in filtered_text:
        lemmatized_text.append(lemmatizer.lemmatize(token))
    
    # Process contractions (words with apostrophe) by replacing them with words of similar meaning
    expanded_text = [contractions.fix(token) for token in lemmatized_text]

    # Handle emojis and emoticons
    emoji_clean_text = [emoji.demojize(token) for token in expanded_text]

    # Remove "None" type tokens from the list
    final_text = [token for token in emoji_clean_text if token != None]

    # Rejoin into a BoW for each token in a movie information
    string_BoW = " ".join(final_text)

    return string_BoW

In [12]:
## Example of Text Preprocessing

print(f"Before:\n {df_str[0]}\n")

print(f"After:\n {preprocess_text(df_str[0])}")

Before:
 2024 Animation, Adventure, Comedy, Drama, Family, Fantasy Two years after her move to San Francisco,[d] 13-year-old Riley Andersen is entering high school. Her personified emotions Joy, Sadness, Fear, Disgust, and Anger now oversee a newly formed element of Riley's mind called her "Sense of Self", which houses memories and feelings that shape Riley's beliefs. Joy, aiming to fill the Sense of Self with only good memories, has created a mechanism that launches bad memories to the back of Riley's mind. Riley and her best friends, Bree and Grace, are invited to a three-day ice hockey camp, where Riley hopes to qualify for her new school's team, the Fire Hawks. In Headquarters, a "puberty alarm" from the previous movie goes off the night before camp, and several mind workers abruptly upgrade the emotion console, leaving Headquarters in disarray. The emotions find that Riley now overreacts to any inputs they make to the console. Four new emotions Anxiety, Envy, Embarrassment, and En

In [13]:
## Preprocess BagOfWords

preprocessed_bow = []

for i in tqdm(range(0, len(df_str)), desc="Preprocessing BagOfWords"):
    processed_tokens = preprocess_text(df_str[i])

    if processed_tokens == None:
        print(f"{i}\n")
        continue
    
    preprocessed_bow.append(processed_tokens)

print("\nBoW Preprocessing Completed")

Preprocessing BagOfWords: 100%|██████████| 100/100 [00:01<00:00, 87.91it/s]


BoW Preprocessing Completed





In [14]:
## Save processed BoW as json format

base_wd = os.getcwd()

jsonfile_name = "ori_BoW.json"

jsonfile_path = os.path.join(base_wd, "data", jsonfile_name)

In [15]:
with open(jsonfile_path, "w") as jsonFile:
    json.dump(preprocessed_bow, jsonFile)

print("BagOfWords array has been converted into a json file")

BagOfWords array has been converted into a json file


In [16]:
## Load BoW from json file

if os.path.isfile(jsonfile_path):
    with open(jsonfile_path, 'r') as file:
        bagOfWords = json.load(file)

        print(bagOfWords[:3])
else:
    print("\nJson file is not found in directory. Please save it first.")

['2024 animation adventure comedy drama family fantasy two year move san franciscod 13yearold riley andersen entering high school personified emotion joy sadness fear disgust anger oversee newly formed element riley mind called sense self house memory feeling shape riley belief joy aiming fill sense self good memory created mechanism launch bad memory back riley mind riley best friend bree grace invited threeday ice hockey camp riley hope qualify new school team fire hawk headquarters puberty alarm previous movie go night camp several mind worker abruptly upgrade emotion console leaving headquarters disarray emotion find riley overreacts input make console four new emotion anxiety envy embarrassment ennui arrive clash original emotion approach particular joy want riley fun camp anxiety focus winning spot team making new friend especially riley learns bree grace attending different high school joy control riley inadvertently get camper trouble strict camp director coach robert anxiety d

In [17]:
## Extract features from BagOfWords

# Create a vectorizer object
vectorizer = CountVectorizer()

vectorizer.fit(bagOfWords)

# Encode document

vector = vectorizer.transform(bagOfWords)

print(f"Shape of vector matrix: {vector.shape}")

print(f"\nFirst 3 rows of Vector: \n{vector[:3]}")

Shape of vector matrix: (100, 7864)

First 3 rows of Vector: 
<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 743 stored elements and shape (3, 7864)>
  Coords	Values
  (0, 6)	1
  (0, 62)	1
  (0, 106)	1
  (0, 124)	1
  (0, 204)	1
  (0, 248)	1
  (0, 262)	1
  (0, 304)	1
  (0, 338)	1
  (0, 342)	1
  (0, 350)	1
  (0, 361)	1
  (0, 393)	12
  (0, 394)	1
  (0, 407)	1
  (0, 432)	1
  (0, 472)	1
  (0, 544)	2
  (0, 554)	1
  (0, 616)	4
  (0, 618)	1
  (0, 624)	4
  (0, 711)	1
  (0, 713)	1
  (0, 723)	1
  :	:
  (2, 6988)	1
  (2, 7007)	1
  (2, 7032)	1
  (2, 7062)	1
  (2, 7063)	1
  (2, 7069)	1
  (2, 7121)	1
  (2, 7181)	9
  (2, 7185)	1
  (2, 7198)	1
  (2, 7212)	1
  (2, 7246)	1
  (2, 7278)	1
  (2, 7386)	1
  (2, 7469)	1
  (2, 7472)	1
  (2, 7475)	1
  (2, 7478)	3
  (2, 7482)	1
  (2, 7610)	1
  (2, 7623)	2
  (2, 7638)	1
  (2, 7643)	1
  (2, 7667)	2
  (2, 7780)	1


In [18]:
## Create and save a list of movies name in BoW for easier checking later on:

# Create a huge array to store each movie name
movie_names = []

for i in range(len(df)):

    temp_str = str(df.iloc[i]['Movie Name'])
    
    movie_names.append(temp_str)

print("\nMovie names has been extracted from CSV file.")

# Preprocess movie names

preprocessed_names = []

for i in tqdm(range(0, len(movie_names)), desc="Preprocessing Movie Names"):
    processed_tokens = preprocess_text(movie_names[i])

    preprocessed_names.append(processed_tokens)

print("\nMovie Names Preprocessing Completed.")

# Show example of movie names before and after
print(f"Before:\n {movie_names[0]}\n")

print(f"After:\n {preprocessed_names[0]}")

## Save processed movie names as json format

base_wd = os.getcwd()

jsonfile_name = "names_BoW.json"

jsonfile_path = os.path.join(base_wd, "data", jsonfile_name)

with open(jsonfile_path, "w") as jsonFile:
    json.dump(preprocessed_names, jsonFile)

print("Preprocessed movie names has been converted into a json file")


Movie names has been extracted from CSV file.


Preprocessing Movie Names: 100%|██████████| 100/100 [00:00<00:00, 1570.54it/s]


Movie Names Preprocessing Completed.
Before:
 Inside Out: 2

After:
 inside 2
Preprocessed movie names has been converted into a json file





In [19]:
"""
Since everything is going smoothly, now the plan is to create a python script with the following features:
    --> 1. Ask user to write a sentence about a movie they would like to see
    --> 2. Read the sentence into a string
    --> 3. Preprocess the string into a BoW
    --> 4. Attach the BoW to the array of BoWs that we have saved 
    --> 5. Perform a cosine similarity calculation of the matrix to find out what index of movies has the highest similarity with the description (Top 5)
    --> 6. Extract the top 5 similarity score 
    --> 6. Extract information of the top 5 movies saved in the pandas Dataframe
    --> 7. Display the first movie with all information sided with a confidence score
    --> 8. Ask if this is the movie they want
    --> 9. If not repeat steps 7 and 8 for the rest of the 4 movies until movies recommendation ran out
    --> 10. Loop back to 1.
"""

'\nSince everything is going smoothly, now the plan is to create a python script with the following features:\n    --> 1. Ask user to write a sentence about a movie they would like to see\n    --> 2. Read the sentence into a string\n    --> 3. Preprocess the string into a BoW\n    --> 4. Attach the BoW to the array of BoWs that we have saved \n    --> 5. Perform a cosine similarity calculation of the matrix to find out what index of movies has the highest similarity with the description (Top 5)\n    --> 6. Extract the top 5 similarity score \n    --> 6. Extract information of the top 5 movies saved in the pandas Dataframe\n    --> 7. Display the first movie with all information sided with a confidence score\n    --> 8. Ask if this is the movie they want\n    --> 9. If not repeat steps 7 and 8 for the rest of the 4 movies until movies recommendation ran out\n    --> 10. Loop back to 1.\n'