In [41]:
import pandas as pd
import nltk
import os

In [42]:
# Directory the kernel is currently running in.
base_wd = os.getcwd()

# Relative path to the dataset CSV inside the "data" folder.
csv_path = os.path.join("data", "animation.csv")
df = pd.read_csv(csv_path)

# Preview the first five rows (head() defaults to n=5).
print(df.head())

     movie_id                                         movie_name  year  \
0   tt3915174                       Puss in Boots: The Last Wish  2022   
1   tt6718170                        The Super Mario Bros. Movie  2023   
2  tt26537229  Demon Slayer: Kimetsu No Yaiba - To the Swords...  2023   
3   tt1488589                     Guillermo del Toro's Pinocchio  2022   
4  tt14668630                              Lyle, Lyle, Crocodile  2022   

  certificate  runtime                         genre  rating  \
0          PG  102 min  Animation, Adventure, Comedy     7.9   
1          PG   92 min  Animation, Adventure, Comedy     NaN   
2           R  110 min  Animation, Action, Adventure     6.6   
3          PG  117 min      Animation, Drama, Family     7.6   
4          PG  106 min  Animation, Adventure, Comedy     6.1   

                                         description  \
0  When Puss in Boots discovers that his passion ...   
1  The story of The Super Mario Bros. on their jo...   
2 

In [43]:
"""
This is a dataset scraped from the IMDB movie rating website regarding movies of the 'animation' genre.

This dataset is provided from: https://www.kaggle.com/datasets/rajugc/imdb-movies-dataset-based-on-genre?select=animation.csv 

"""

"\nThis is a dataset scraped from the IMDB movie rating website regarding movies of the 'animation' genre.\n\nThis dataset is provided from: https://www.kaggle.com/datasets/rajugc/imdb-movies-dataset-based-on-genre?select=animation.csv \n\n"

In [44]:
# Report how many rows the raw dataset holds.
movie_total = len(df)
print(f"Number of movies: {movie_total}\n")

# Show every field of the first record as a sample.
first_movie = df.iloc[0]
print("Example movie:")
print(first_movie)

Number of movies: 8419

Example movie:
movie_id                                               tt3915174
movie_name                          Puss in Boots: The Last Wish
year                                                        2022
certificate                                                   PG
runtime                                                  102 min
genre                               Animation, Adventure, Comedy
rating                                                       7.9
description    When Puss in Boots discovers that his passion ...
director                         Joel Crawford, \nJanuel Mercado
director_id                                     /name/nm3150455/
star           Antonio Banderas, \nSalma Hayek, \nHarvey Guil...
star_id        /name/nm2591093/,/name/nm0000104/,/name/nm0000...
votes                                                    93143.0
gross(in $)                                          168464485.0
Name: 0, dtype: object


In [45]:
## Removing unnecessary features

# Pieces of information about a movie that are memorable and that users can
# easily identify it by:
#     --> movie_name
#     --> year
#     --> genre
#     --> description
#     --> director
#     --> star
#
# Only these are kept; the remaining fields are not significant for the users.
# They are removed in a single drop() call, which builds one new frame instead
# of one intermediate copy per column.
UNWANTED_FEATURES = [
    "movie_id",
    "certificate",
    "runtime",
    "rating",
    "director_id",
    "star_id",
    "votes",
    "gross(in $)",
]

# drop() returns a new frame; the raw `df` is left untouched.
new_df = df.drop(columns=UNWANTED_FEATURES)

print("After retaining wanted features: \n")
print(new_df.head(3))

After retaining wanted features: 

                                          movie_name  year  \
0                       Puss in Boots: The Last Wish  2022   
1                        The Super Mario Bros. Movie  2023   
2  Demon Slayer: Kimetsu No Yaiba - To the Swords...  2023   

                          genre  \
0  Animation, Adventure, Comedy   
1  Animation, Adventure, Comedy   
2  Animation, Action, Adventure   

                                         description  \
0  When Puss in Boots discovers that his passion ...   
1  The story of The Super Mario Bros. on their jo...   
2  All the Upper Rank Demons assemble at the Infi...   

                           director  \
0   Joel Crawford, \nJanuel Mercado   
1  Aaron Horvath, \nMichael Jelenic   
2                    Haruo Sotozaki   

                                                star  
0  Antonio Banderas, \nSalma Hayek, \nHarvey Guil...  
1  Chris Pratt, \nAnya Taylor-Joy, \nCharlie Day,...  
2  Zach Aguilar, \nKira Buck

In [46]:
## Check for any missing values in the dataset

def nan_counter(feature_name: str, df: pd.DataFrame) -> None:
    """Print how many entries of one column are missing.

    Args:
        feature_name: Label of the column in ``df`` to inspect.
        df: Frame that contains the column.

    Returns:
        None. The count is only written to standard output.
    """
    # isna() flags each missing entry; summing the boolean mask counts them.
    missing_mask = df[feature_name].isna()
    print(f"Missing values in {feature_name}: {missing_mask.sum()}")

# Run the missing-value report once for each retained column.
for feature in ("movie_name", "year", "genre", "description", "director", "star"):
    nan_counter(feature, new_df)

Missing values in movie_name: 0
Missing values in year: 1369
Missing values in genre: 0
Missing values in description: 0
Missing values in director: 902
Missing values in star: 2849


In [47]:
## Dropping rows of data which are missing values

# Movie recommendations must be backed by real data, so missing entries cannot
# be filled in with simple techniques such as imputation or forward/backward
# fill. Every row that has at least one missing value is removed instead.
new_df = new_df.dropna(axis="index")

remaining_movies = len(new_df)
print(f"Number of movies after cleansing dataset: {remaining_movies}\n")

# Show the record at position 450 of the filtered frame as a sample.
sample_movie = new_df.iloc[450]
print(f"Example of movie:\n{sample_movie}")

Number of movies after cleansing dataset: 5291

Example of movie:
movie_name                                        Bigfoot Family
year                                                        2020
genre                               Animation, Adventure, Family
description    Follow up to Son of Bigfoot: Father uses his n...
director                          Jeremy Degruson, \nBen Stassen
star           Jules Medcraft, \nKylian Trouillard, \nAlexis ...
Name: 457, dtype: object


In [48]:
## Dropping rows of duplicate values

# Keep only the first row found for each distinct title.
# NOTE(review): matching on the title alone also discards different films that
# happen to share a name (e.g. remakes) — confirm this is intended.
new_df = new_df.drop_duplicates(subset="movie_name", keep="first")

print(f"Number of movies after cleansing dataset: {len(new_df)}")

Number of movies after cleansing dataset: 5245


In [49]:
## Remove movies that have no plot description or only a filler description

# Rows whose description contains this text are dropped.
PLACEHOLDER_TEXT = "Add a Plot"

# regex=False matches the text literally rather than as a pattern.
# na=True marks a missing description as a match, so such a row is excluded
# from the result as well.
has_placeholder = new_df["description"].str.contains(
    PLACEHOLDER_TEXT, regex=False, na=True
)

# Negate the boolean mask with ~ rather than comparing it against False.
new_df = new_df[~has_placeholder]

print(f"Number of movies after cleansing dataset: {len(new_df)}")

Number of movies after cleansing dataset: 4521


In [50]:
## Creating a short description of dataset

# Build the summary table first, then print it as plain text.
dataset_summary = new_df.describe()
print(dataset_summary)

                                    movie_name  year      genre  \
count                                     4521  4521       4521   
unique                                    4521    98        151   
top     Dive Olly Dive: A Hero's Magical Quest  2019  Animation   
freq                                         1   306        791   

                                              description   director  \
count                                                4521       4521   
unique                                               4509       3206   
top     Short animation film from the series 'Garabatos'.  Leon Ding   
freq                                                    7         32   

                                                     star  
count                                                4521  
unique                                               4300  
top     Nobuyo Ôyama, \nNoriko Ohara, \nMichiko Nomura...  
freq                                                   20  
