In [1]:
import pandas as pd
import numpy as np
import nltk
import os
import re
import contractions
import emoji
import json
import time

from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from numpy.dtypes import StringDType
from scipy.sparse import csr_matrix

In [2]:
# For tokenization
nltk.download('punkt')

# For removing stopwords
nltk.download('stopwords')

# For lemmatization
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
base_wd = os.getcwd()

csv_path = os.path.join("data", "animation.csv")

df = pd.read_csv(csv_path)

print(df.head(5))

     movie_id                                         movie_name  year  \
0   tt3915174                       Puss in Boots: The Last Wish  2022   
1   tt6718170                        The Super Mario Bros. Movie  2023   
2  tt26537229  Demon Slayer: Kimetsu No Yaiba - To the Swords...  2023   
3   tt1488589                     Guillermo del Toro's Pinocchio  2022   
4  tt14668630                              Lyle, Lyle, Crocodile  2022   

  certificate  runtime                         genre  rating  \
0          PG  102 min  Animation, Adventure, Comedy     7.9   
1          PG   92 min  Animation, Adventure, Comedy     NaN   
2           R  110 min  Animation, Action, Adventure     6.6   
3          PG  117 min      Animation, Drama, Family     7.6   
4          PG  106 min  Animation, Adventure, Comedy     6.1   

                                         description  \
0  When Puss in Boots discovers that his passion ...   
1  The story of The Super Mario Bros. on their jo...   
2 

In [4]:
"""
This is a dataset scraped from the IMDB movie rating website regarding movies of the 'animation' genre.

This dataset is provided from: https://www.kaggle.com/datasets/rajugc/imdb-movies-dataset-based-on-genre?select=animation.csv 

"""

"\nThis is a dataset scraped from the IMDB movie rating website regarding movies of the 'animation' genre.\n\nThis dataset is provided from: https://www.kaggle.com/datasets/rajugc/imdb-movies-dataset-based-on-genre?select=animation.csv \n\n"

In [5]:
print(f"Number of movies: {len(df)}\n")

print("Example movie:")
print(df.iloc[0])

Number of movies: 8419

Example movie:
movie_id                                               tt3915174
movie_name                          Puss in Boots: The Last Wish
year                                                        2022
certificate                                                   PG
runtime                                                  102 min
genre                               Animation, Adventure, Comedy
rating                                                       7.9
description    When Puss in Boots discovers that his passion ...
director                       Joel Crawford, \r\nJanuel Mercado
director_id                                     /name/nm3150455/
star           Antonio Banderas, \r\nSalma Hayek, \r\nHarvey ...
star_id        /name/nm2591093/,/name/nm0000104/,/name/nm0000...
votes                                                    93143.0
gross(in $)                                          168464485.0
Name: 0, dtype: object


In [6]:
## Removing unnessary features

"""
Important informations about a movie that is memorable and easily identified which users can remember:
    --> Movie name
    --> Year
    --> Genre
    --> Description
    --> Director
    --> Star

We will only save these informations since other fields of data are not that significant for the users.

Hence, we will drop features of:
    --> movie_id
    --> certificate
    --> runtime
    --> rating
    --> director_id
    --> star_id
    --> votes
    --> gross(in $)

"""

new_df = df.drop('movie_id', axis='columns')
new_df = new_df.drop('certificate', axis='columns')
new_df = new_df.drop('runtime', axis='columns')
new_df = new_df.drop('rating', axis='columns')
new_df = new_df.drop('director_id', axis='columns')
new_df = new_df.drop('star_id', axis='columns')
new_df = new_df.drop('votes', axis='columns')
new_df = new_df.drop('gross(in $)', axis='columns')

print("After retaining wanted features: \n")
print(new_df.head(3))

After retaining wanted features: 

                                          movie_name  year  \
0                       Puss in Boots: The Last Wish  2022   
1                        The Super Mario Bros. Movie  2023   
2  Demon Slayer: Kimetsu No Yaiba - To the Swords...  2023   

                          genre  \
0  Animation, Adventure, Comedy   
1  Animation, Adventure, Comedy   
2  Animation, Action, Adventure   

                                         description  \
0  When Puss in Boots discovers that his passion ...   
1  The story of The Super Mario Bros. on their jo...   
2  All the Upper Rank Demons assemble at the Infi...   

                             director  \
0   Joel Crawford, \r\nJanuel Mercado   
1  Aaron Horvath, \r\nMichael Jelenic   
2                      Haruo Sotozaki   

                                                star  
0  Antonio Banderas, \r\nSalma Hayek, \r\nHarvey ...  
1  Chris Pratt, \r\nAnya Taylor-Joy, \r\nCharlie ...  
2  Zach Aguilar, \r\

In [7]:
## Check for any missing values in the dataset

def nan_counter(feature_name: str, df: pd.DataFrame):
    nan_count = df[feature_name].isnull().sum()

    print(f"Missing values in {feature_name}: {nan_count}")

nan_counter("movie_name", new_df)
nan_counter("year", new_df)
nan_counter("genre", new_df)
nan_counter("description", new_df)
nan_counter("director", new_df)
nan_counter("star", new_df)

Missing values in movie_name: 0
Missing values in year: 1369
Missing values in genre: 0
Missing values in description: 0
Missing values in director: 902
Missing values in star: 2849


In [8]:
## Dropping rows of data which are missing values

# Due to the nature of movie recommendations must provide real data, we cannot replace missing data using simple techniques such as imputation technique or forward/backward fill

# As such we must drop them

new_df = new_df.dropna(axis=0)

print(f"Number of movies after cleansing dataset: {len(new_df)}\n")

print(f"Example of movie:\n{new_df.iloc[450]}")

Number of movies after cleansing dataset: 5291

Example of movie:
movie_name                                        Bigfoot Family
year                                                        2020
genre                               Animation, Adventure, Family
description    Follow up to Son of Bigfoot: Father uses his n...
director                        Jeremy Degruson, \r\nBen Stassen
star           Jules Medcraft, \r\nKylian Trouillard, \r\nAle...
Name: 457, dtype: object


In [9]:
## Dropping rows of duplicate values

new_df = new_df.drop_duplicates(subset=['movie_name'])

print(f"Number of movies after cleansing dataset: {len(new_df)}")

Number of movies after cleansing dataset: 5245


In [10]:
## Remove movies that don't have plot description or filler descriptions

new_df = new_df[new_df["description"].str.contains("Add a Plot") == False]

print(f"Number of movies after cleansing dataset: {len(new_df)}")

Number of movies after cleansing dataset: 4521


In [11]:
## Creating a short description of dataset

print(new_df.describe())

                                    movie_name  year      genre  \
count                                     4521  4521       4521   
unique                                    4521    98        151   
top     Dive Olly Dive: A Hero's Magical Quest  2019  Animation   
freq                                         1   306        791   

                                              description   director  \
count                                                4521       4521   
unique                                               4509       3206   
top     Short animation film from the series 'Garabatos'.  Leon Ding   
freq                                                    7         32   

                                                     star  
count                                                4521  
unique                                               4300  
top     Nobuyo Ôyama, \r\nNoriko Ohara, \r\nMichiko No...  
freq                                                   20  


In [30]:
## Save new dataframe as csv

base_wd = os.getcwd()

df_csv_path = os.path.join(base_wd, "data", "cleanedAnimation.csv")

new_df.to_csv(df_csv_path)

In [12]:
## Create a huge array to store BagOfWords of each movie information

df_str = []

for i in range(len(new_df)):
    temp_str = ""
    for j in range(1, len(new_df.iloc[0])): # Exclude the name of movies to be included
         temp_str = temp_str + str(new_df.iloc[i][j]) + " "
    
    df_str.append(temp_str)

print(f"Number of movies: {len(df_str)}")
print(f"\nExample of movie information string: \n{df_str[0]}")

  temp_str = temp_str + str(new_df.iloc[i][j]) + " "


Number of movies: 4521

Example of movie information string: 
2022 Animation, Adventure, Comedy When Puss in Boots discovers that his passion for adventure has taken its toll and he has burned through eight of his nine lives, he launches an epic journey to restore them by finding the mythical Last Wish. Joel Crawford, 
Januel Mercado Antonio Banderas, 
Salma Hayek, 
Harvey Guillén, 
Florence Pugh 


In [None]:
## Create preprocess pipeline for the text strings per movie

def preprocess_text(text_str: str):

    """
    Remove characters that are not characters of 
        --> a - z
        --> A - Z
        --> 0 - 9
    """

    clean_text = re.sub(r'[^a-zA-Z0-9\s]', '', text_str) 

    # Turns all uppercase alphabets to lowercase
    clean_text = clean_text.lower()

    # Tokenize string of text into individual units
    tokenized_text = word_tokenize(clean_text)

    # Remove stopwords which provide little to none useful information
    stop_words = set(stopwords.words('english'))
    
    filtered_text = [token for token in tokenized_text if token not in stop_words]

    # Lemmatization of tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = []

    for token in filtered_text:
        lemmatized_text.append(lemmatizer.lemmatize(token))
    
    # Process contractions (words with apostrophe) by replacing them with words of similar meaning
    expanded_text = [contractions.fix(token) for token in lemmatized_text]

    # Handle emojis and emoticons
    emoji_clean_text = [emoji.demojize(token) for token in expanded_text]

    # Rejoin into a BoW for each token in a movie information
    string_BoW = " ".join(emoji_clean_text)

    return string_BoW

In [14]:
## Example of Text Preprocessing

print(f"Before:\n {df_str[0]}\n")

print(f"After:\n {preprocess_text(df_str[0])}")

Before:
 2022 Animation, Adventure, Comedy When Puss in Boots discovers that his passion for adventure has taken its toll and he has burned through eight of his nine lives, he launches an epic journey to restore them by finding the mythical Last Wish. Joel Crawford, 
Januel Mercado Antonio Banderas, 
Salma Hayek, 
Harvey Guillén, 
Florence Pugh 

After:
 2022 animation adventure comedy pus boot discovers passion adventure taken toll burned eight nine life launch epic journey restore finding mythical last wish joel crawford januel mercado antonio banderas salma hayek harvey guilln florence pugh 


In [22]:
## Preprocess BagOfWords

preprocessed_bow = []

for i in tqdm(range(0, len(df_str)), desc="Preprocessing BagOfWords"):
    processed_tokens = preprocess_text(df_str[i])

    if processed_tokens == None:
        print(f"{i}\n")
        continue
    
    preprocessed_bow.append(processed_tokens)

print("\nBoW Preprocessing Completed")

Preprocessing BagOfWords: 100%|██████████| 4521/4521 [00:02<00:00, 2204.27it/s]


BoW Preprocessing Completed





In [16]:
## Save processed BoW as json format

base_wd = os.getcwd()

jsonfile_name = "ori_BoW.json"

jsonfile_path = os.path.join(base_wd, "data", jsonfile_name)

In [17]:
with open(jsonfile_path, "w") as jsonFile:
    json.dump(preprocessed_bow, jsonFile)

print("BagOfWords array has been converted into a json file")

BagOfWords array has been converted into a json file


In [18]:
## Load BoW from json file

if os.path.isfile(jsonfile_path):
    with open(jsonfile_path, 'r') as file:
        bagOfWords = json.load(file)

        print(bagOfWords[:3])
else:
    print("\nJson file is not found in directory. Please save it first.")

['2022 animation adventure comedy pus boot discovers passion adventure taken toll burned eight nine life launch epic journey restore finding mythical last wish joel crawford januel mercado antonio banderas salma hayek harvey guilln florence pugh ', '2023 animation adventure comedy story super mario bros journey mushroom kingdom aaron horvath michael jelenic chris pratt anya taylorjoy charlie day jack black ', '2023 animation action adventure upper rank demon assemble infinity castle upper six demon defeat haruo sotozaki zach aguilar kira buckland griffin burn ray chase ']


In [19]:
## Extract features from BagOfWords

# Create a vectorizer object
vectorizer = CountVectorizer()

vectorizer.fit(bagOfWords)

# Encode document

vector = vectorizer.transform(bagOfWords)

print(f"Shape of vector matrix: {vector.shape}")

print(f"\nFirst 3 rows of Vector: \n{vector[:3]}")

Shape of vector matrix: (4521, 28787)

First 3 rows of Vector: 
<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 79 stored elements and shape (3, 28787)>
  Coords	Values
  (0, 198)	1
  (0, 590)	2
  (0, 1367)	1
  (0, 1467)	1
  (0, 2468)	1
  (0, 3596)	1
  (0, 4176)	1
  (0, 5619)	1
  (0, 6091)	1
  (0, 7224)	1
  (0, 7969)	1
  (0, 8377)	1
  (0, 9205)	1
  (0, 9349)	1
  (0, 10889)	1
  (0, 11329)	1
  (0, 11419)	1
  (0, 13155)	1
  (0, 13417)	1
  (0, 13524)	1
  (0, 15239)	1
  (0, 15263)	1
  (0, 15565)	1
  (0, 17181)	1
  (0, 18203)	1
  :	:
  (1, 24644)	1
  (1, 24911)	1
  (1, 25467)	1
  (2, 199)	1
  (2, 477)	1
  (2, 590)	1
  (2, 688)	1
  (2, 1367)	1
  (2, 1920)	1
  (2, 4059)	1
  (2, 4175)	1
  (2, 4647)	1
  (2, 4932)	1
  (2, 6715)	1
  (2, 6832)	2
  (2, 10744)	1
  (2, 11325)	1
  (2, 12616)	1
  (2, 14412)	1
  (2, 21223)	1
  (2, 21298)	1
  (2, 23859)	1
  (2, 24227)	1
  (2, 26842)	2
  (2, 28528)	1


In [None]:
## Create and save a list of movies name in BoW for easier checking later on:

# Create a huge array to store each movie name
movie_names = []

for i in range(len(new_df)):

    temp_str = str(new_df.iloc[i]['movie_name'])
    
    movie_names.append(temp_str)

print("\nMovie names has been extracted from CSV file.")

# Preprocess movie names

preprocessed_names = []

for i in tqdm(range(0, len(movie_names)), desc="Preprocessing Movie Names"):
    processed_tokens = preprocess_text(movie_names[i])

    preprocessed_names.append(processed_tokens)

print("\nMovie Names Preprocessing Completed.")

# Show example of movie names before and after
print(f"Before:\n {movie_names[0]}\n")

print(f"After:\n {preprocessed_names[0]}")

## Save processed movie names as json format

base_wd = os.getcwd()

jsonfile_name = "names_BoW.json"

jsonfile_path = os.path.join(base_wd, "data", jsonfile_name)

with open(jsonfile_path, "w") as jsonFile:
    json.dump(preprocessed_names, jsonFile)

print("Preprocessed movie names has been converted into a json file")


Movie names has been extracted from CSV file.


Preprocessing Movie Names: 100%|██████████| 4521/4521 [00:00<00:00, 6067.27it/s]


Movie Names Preprocessing Completed.
Before:
 Puss in Boots: The Last Wish

After:
 pus boot last wish 
Preprocessed movie names has been converted into a json file





In [20]:
"""
Since everything is going smoothly, now the plan is to create a python script with the following features:
    --> 1. Ask user to write a sentence about a movie they would like to see
    --> 2. Read the sentence into a string
    --> 3. Preprocess the string into a BoW
    --> 4. Attach the BoW to the array of BoWs that we have saved 
    --> 5. Perform a cosine similarity calculation of the matrix to find out what index of movies has the highest similarity with the description (Top 5)
    --> 6. Extract the top 5 similarity score 
    --> 6. Extract information of the top 5 movies saved in the pandas Dataframe
    --> 7. Display the first movie with all information sided with a confidence score
    --> 8. Ask if this is the movie they want
    --> 9. If not repeat steps 7 and 8 for the rest of the 4 movies until movies recommendation ran out
    --> 10. Loop back to 1.
"""

'\nSince everything is going smoothly, now the plan is to create a python script with the following features:\n    --> 1. Ask user to write a sentence about a movie they would like to see\n    --> 2. Read the sentence into a string\n    --> 3. Preprocess the string into a BoW\n    --> 4. Attach the BoW to the array of BoWs that we have saved \n    --> 5. Perform a cosine similarity calculation of the matrix to find out what index of movies has the highest similarity with the description (Top 5)\n    --> 6. Extract the top 5 similarity score \n    --> 6. Extract information of the top 5 movies saved in the pandas Dataframe\n    --> 7. Display the first movie with all information sided with a confidence score\n    --> 8. Ask if this is the movie they want\n    --> 9. If not repeat steps 7 and 8 for the rest of the 4 movies until movies recommendation ran out\n    --> 10. Loop back to 1.\n'