In [3]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

In [4]:
df = pd.read_csv("10kmovies.csv")

In [5]:
df.head()

Unnamed: 0,id,title,release_date,genres,original_language,vote_average,vote_count,popularity,overview,budget,production_companies,revenue,runtime,tagline
0,758323,The Pope's Exorcist,2023-04-05,"['Horror', 'Mystery', 'Thriller']",English,7.4,619,5089.969,"Father Gabriele Amorth, Chief Exorcist of the ...",18000000,"['Screen Gems', '2.0 Entertainment', 'Jesus & ...",65675816,103,Inspired by the actual files of Father Gabriel...
1,640146,Ant-Man and the Wasp: Quantumania,2023-02-15,"['Action', 'Adventure', 'Science Fiction']",English,6.6,2294,4665.438,Super-Hero partners Scott Lang and Hope van Dy...,200000000,"['Marvel Studios', 'Kevin Feige Productions']",464566092,125,Witness the beginning of a new dynasty.
2,502356,The Super Mario Bros. Movie,2023-04-05,"['Animation', 'Adventure', 'Family', 'Fantasy'...",English,7.5,1861,3935.55,"While working underground to fix a water main,...",100000000,"['Universal Pictures', 'Illumination', 'Ninten...",1121048165,92,
3,868759,Ghosted,2023-04-18,"['Action', 'Comedy', 'Romance']",English,7.2,652,2791.532,Salt-of-the-earth Cole falls head over heels f...,0,"['Skydance Media', 'Apple Studios']",0,120,Finding that special someone can be a real adv...
4,594767,Shazam! Fury of the Gods,2023-03-15,"['Action', 'Comedy', 'Fantasy', 'Adventure']",English,6.8,1510,2702.593,"Billy Batson and his foster siblings, who tran...",125000000,"['New Line Cinema', 'The Safran Company', 'DC ...",133437105,130,Oh. My. Gods.


In [6]:
df.shape

(10000, 14)

In [7]:
df.isnull().sum() #tagline,overview and release date have null values. Not interested in this so will drop them.

id                         0
title                      0
release_date              21
genres                     0
original_language          0
vote_average               0
vote_count                 0
popularity                 0
overview                  77
budget                     0
production_companies       0
revenue                    0
runtime                    0
tagline                 2759
dtype: int64

In [8]:
df.drop(columns = ['release_date','overview','tagline'], inplace = True) # Dropping the columns will null values.

In [9]:
df.isnull().sum()

id                      0
title                   0
genres                  0
original_language       0
vote_average            0
vote_count              0
popularity              0
budget                  0
production_companies    0
revenue                 0
runtime                 0
dtype: int64

In [10]:
df.head()

Unnamed: 0,id,title,genres,original_language,vote_average,vote_count,popularity,budget,production_companies,revenue,runtime
0,758323,The Pope's Exorcist,"['Horror', 'Mystery', 'Thriller']",English,7.4,619,5089.969,18000000,"['Screen Gems', '2.0 Entertainment', 'Jesus & ...",65675816,103
1,640146,Ant-Man and the Wasp: Quantumania,"['Action', 'Adventure', 'Science Fiction']",English,6.6,2294,4665.438,200000000,"['Marvel Studios', 'Kevin Feige Productions']",464566092,125
2,502356,The Super Mario Bros. Movie,"['Animation', 'Adventure', 'Family', 'Fantasy'...",English,7.5,1861,3935.55,100000000,"['Universal Pictures', 'Illumination', 'Ninten...",1121048165,92
3,868759,Ghosted,"['Action', 'Comedy', 'Romance']",English,7.2,652,2791.532,0,"['Skydance Media', 'Apple Studios']",0,120
4,594767,Shazam! Fury of the Gods,"['Action', 'Comedy', 'Fantasy', 'Adventure']",English,6.8,1510,2702.593,125000000,"['New Line Cinema', 'The Safran Company', 'DC ...",133437105,130


In [11]:
df.shape

(10000, 11)

In [12]:
df.genres.unique()

array(["['Horror', 'Mystery', 'Thriller']",
       "['Action', 'Adventure', 'Science Fiction']",
       "['Animation', 'Adventure', 'Family', 'Fantasy', 'Comedy']", ...,
       "['Thriller', 'Drama', 'Music']",
       "['Fantasy', 'Animation', 'Action', 'Adventure', 'Science Fiction', 'Drama', 'Romance']",
       "['TV Movie', 'Fantasy', 'Animation', 'Action', 'Thriller', 'Science Fiction', 'Horror']"],
      dtype=object)

In [13]:
df.head()

Unnamed: 0,id,title,genres,original_language,vote_average,vote_count,popularity,budget,production_companies,revenue,runtime
0,758323,The Pope's Exorcist,"['Horror', 'Mystery', 'Thriller']",English,7.4,619,5089.969,18000000,"['Screen Gems', '2.0 Entertainment', 'Jesus & ...",65675816,103
1,640146,Ant-Man and the Wasp: Quantumania,"['Action', 'Adventure', 'Science Fiction']",English,6.6,2294,4665.438,200000000,"['Marvel Studios', 'Kevin Feige Productions']",464566092,125
2,502356,The Super Mario Bros. Movie,"['Animation', 'Adventure', 'Family', 'Fantasy'...",English,7.5,1861,3935.55,100000000,"['Universal Pictures', 'Illumination', 'Ninten...",1121048165,92
3,868759,Ghosted,"['Action', 'Comedy', 'Romance']",English,7.2,652,2791.532,0,"['Skydance Media', 'Apple Studios']",0,120
4,594767,Shazam! Fury of the Gods,"['Action', 'Comedy', 'Fantasy', 'Adventure']",English,6.8,1510,2702.593,125000000,"['New Line Cinema', 'The Safran Company', 'DC ...",133437105,130


In [14]:
df.corr()

  df.corr()


Unnamed: 0,id,vote_average,vote_count,popularity,budget,revenue,runtime
id,1.0,-0.241569,-0.259859,0.102688,-0.243411,-0.207591,-0.256838
vote_average,-0.241569,1.0,0.253543,0.040162,0.074849,0.149643,0.38844
vote_count,-0.259859,0.253543,1.0,0.069693,0.600121,0.753206,0.288462
popularity,0.102688,0.040162,0.069693,1.0,0.143257,0.148195,0.038973
budget,-0.243411,0.074849,0.600121,0.143257,1.0,0.735239,0.282498
revenue,-0.207591,0.149643,0.753206,0.148195,0.735239,1.0,0.253162
runtime,-0.256838,0.38844,0.288462,0.038973,0.282498,0.253162,1.0


In [15]:
df.dtypes

id                        int64
title                    object
genres                   object
original_language        object
vote_average            float64
vote_count                int64
popularity              float64
budget                    int64
production_companies     object
revenue                   int64
runtime                   int64
dtype: object

In [16]:
data = df.copy()

In [17]:
df.head()

Unnamed: 0,id,title,genres,original_language,vote_average,vote_count,popularity,budget,production_companies,revenue,runtime
0,758323,The Pope's Exorcist,"['Horror', 'Mystery', 'Thriller']",English,7.4,619,5089.969,18000000,"['Screen Gems', '2.0 Entertainment', 'Jesus & ...",65675816,103
1,640146,Ant-Man and the Wasp: Quantumania,"['Action', 'Adventure', 'Science Fiction']",English,6.6,2294,4665.438,200000000,"['Marvel Studios', 'Kevin Feige Productions']",464566092,125
2,502356,The Super Mario Bros. Movie,"['Animation', 'Adventure', 'Family', 'Fantasy'...",English,7.5,1861,3935.55,100000000,"['Universal Pictures', 'Illumination', 'Ninten...",1121048165,92
3,868759,Ghosted,"['Action', 'Comedy', 'Romance']",English,7.2,652,2791.532,0,"['Skydance Media', 'Apple Studios']",0,120
4,594767,Shazam! Fury of the Gods,"['Action', 'Comedy', 'Fantasy', 'Adventure']",English,6.8,1510,2702.593,125000000,"['New Line Cinema', 'The Safran Company', 'DC ...",133437105,130


In [25]:
action_movies = df[df['genres'].str.contains('Action')] # will output rows with action genre in them

In [23]:
action_movies = df[df['genres'] == 'Action']# will output rows with ONLY action genre in them. - there are none.

In [26]:
action_movies

Unnamed: 0,id,title,genres,original_language,vote_average,vote_count,popularity,budget,production_companies,revenue,runtime
1,640146,Ant-Man and the Wasp: Quantumania,"['Action', 'Adventure', 'Science Fiction']",English,6.6,2294,4665.438,200000000,"['Marvel Studios', 'Kevin Feige Productions']",464566092,125
3,868759,Ghosted,"['Action', 'Comedy', 'Romance']",English,7.2,652,2791.532,0,"['Skydance Media', 'Apple Studios']",0,120
4,594767,Shazam! Fury of the Gods,"['Action', 'Comedy', 'Fantasy', 'Adventure']",English,6.8,1510,2702.593,125000000,"['New Line Cinema', 'The Safran Company', 'DC ...",133437105,130
5,76600,Avatar: The Way of Water,"['Science Fiction', 'Adventure', 'Action']",English,7.7,7853,2280.912,460000000,"['20th Century Studios', 'Lightstorm Entertain...",2319331580,192
6,447365,Guardians of the Galaxy Volume 3,"['Science Fiction', 'Adventure', 'Action']",English,8.3,683,2520.308,250000000,"['Marvel Studios', 'Kevin Feige Productions']",289312702,150
...,...,...,...,...,...,...,...,...,...,...,...
9964,560527,The Dude in Me,"['Fantasy', 'Comedy', 'Action', 'Adventure']",Korean,7.9,113,16.282,0,"['The Contents On', 'EchoFilm', 'Merry Christm...",14651772,122
9967,53894,Lupin the Third: The Secret of Twilight Gemini,"['Animation', 'TV Movie', 'Action', 'Comedy']",Japanese,6.6,38,8.161,0,"['Nippon Television Network Corporation', 'VAP...",0,90
9968,8398,The Hitcher,"['Action', 'Horror', 'Crime', 'Thriller']",English,5.9,810,14.272,10000000,"['Intrepid Pictures', 'Platinum Dunes', 'Focus...",25399945,84
9994,94503,Tsubasa RESERVoir CHRoNiCLE: Tokyo Revelations,"['Fantasy', 'Animation', 'Action', 'Adventure'...",Japanese,7.2,6,7.949,0,['Kodansha'],0,75


In [33]:
from itertools import chain

In [34]:
# Flatten the lists and convert to Series
flattened_series = pd.Series(chain.from_iterable(df['genres']))

# Get unique genres
unique_genres = flattened_series.unique()

print(unique_genres)

['[' "'" 'H' 'o' 'r' ',' ' ' 'M' 'y' 's' 't' 'e' 'T' 'h' 'i' 'l' ']' 'A'
 'c' 'n' 'd' 'v' 'u' 'S' 'F' 'm' 'a' 'C' 'R' 'D' 'W' 'V']
