In [103]:
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [150]:
movies_df = pd.read_csv('action.csv', encoding="utf-8")
movies_df.head()


Unnamed: 0,movie_id,movie_name,year,certificate,runtime,genre,rating,description,director,director_id,star,star_id,votes,gross(in $)
0,tt9114286,Black Panther: Wakanda Forever,2022,PG-13,161 min,"Action, Adventure, Drama",6.9,The people of Wakanda fight to protect their h...,Ryan Coogler,/name/nm3363032/,"Letitia Wright, \nLupita Nyong'o, \nDanai Guri...","/name/nm4004793/,/name/nm2143282/,/name/nm1775...",204835.0,
1,tt1630029,Avatar: The Way of Water,2022,PG-13,192 min,"Action, Adventure, Fantasy",7.8,Jake Sully lives with his newfound family form...,James Cameron,/name/nm0000116/,"Sam Worthington, \nZoe Saldana, \nSigourney We...","/name/nm0941777/,/name/nm0757855/,/name/nm0000...",295119.0,
2,tt5884796,Plane,2023,R,107 min,"Action, Thriller",6.5,A pilot finds himself caught in a war zone aft...,Jean-François Richet,/name/nm0724938/,"Gerard Butler, \nMike Colter, \nTony Goldwyn, ...","/name/nm0124930/,/name/nm1591496/,/name/nm0001...",26220.0,
3,tt6710474,Everything Everywhere All at Once,2022,R,139 min,"Action, Adventure, Comedy",8.0,A middle-aged Chinese immigrant is swept up in...,"Dan Kwan, \nDaniel Scheinert",/name/nm3453283/,"Michelle Yeoh, \nStephanie Hsu, \nJamie Lee Cu...","/name/nm3215397/,/name/nm0000706/,/name/nm3513...",327858.0,
4,tt5433140,Fast X,2023,,,"Action, Crime, Mystery",,Dom Toretto and his family are targeted by the...,Louis Leterrier,/name/nm0504642/,"Vin Diesel, \nJordana Brewster, \nTyrese Gibso...","/name/nm0004874/,/name/nm0108287/,/name/nm0879...",,


# Cleaning Data

#checking for missing values
movies_df.isnull()

They are some missing values. For simplicity of this assignment, I am dropping all missing values.

In [27]:
movies_df_dropped = movies_df.dropna()
#checking to see if they are any missing values still.
movies_df_dropped.isnull().values.any()

False

In [29]:
duplicates = movies_df_dropped.duplicated()
duplicates

12       False
14       False
15       False
20       False
23       False
         ...  
33154    False
33808    False
37565    False
38799    False
39178    False
Length: 2672, dtype: bool

They are no duplicates. All good there.

In [155]:
movies_df_dropped.shape

(2672, 14)

This is a small dataset but per instructions, Im gonna make it smaller to about 500 rows and drop some columns for simplicity.

In [157]:
movies = movies_df_dropped.iloc[:500]
movies.shape

(500, 14)

In [161]:
movies = movies.drop(['movie_id','certificate','runtime','rating','director','director_id','star','star_id','votes','gross(in $)'],axis=1)
movies

Unnamed: 0,movie_name,year,genre,description
12,Black Panther,2018,"Action, Adventure, Sci-Fi","T'Challa, heir to the hidden but advanced king..."
14,Top Gun,1986,"Action, Drama",As students at the United States Navy's elite ...
15,Avatar,2009,"Action, Adventure, Fantasy",A paraplegic Marine dispatched to the moon Pan...
20,The Hunger Games,2012,"Action, Adventure, Sci-Fi",Katniss Everdeen voluntarily takes her younger...
23,Dune,2021,"Action, Adventure, Drama",A noble family becomes embroiled in a war for ...
...,...,...,...,...
668,Safe House,2012,"Action, Thriller",A young CIA agent is tasked with looking after...
670,Godzilla,2014,"Action, Adventure, Sci-Fi",The world is beset by the appearance of monstr...
671,The Matrix Reloaded,2003,"Action, Sci-Fi","Freedom fighters Neo, Trinity and Morpheus con..."
673,Hellboy,2004,"Action, Adventure, Fantasy",A demon raised from infancy after being conjur...


I've gotten the dataset down to movie_name, year, genre and description. It is simple enough to carry out this content-based recommendation system.

In [163]:
duplicates = movies[movies.duplicated(subset='movie_name', keep=False)]
duplicates

Unnamed: 0,movie_name,year,genre,description
23,Dune,2021,"Action, Adventure, Drama",A noble family becomes embroiled in a war for ...
67,The Mummy,1999,"Action, Adventure, Fantasy",At an archaeological dig in the ancient city o...
98,Oldboy,2003,"Action, Drama, Mystery",After being kidnapped and imprisoned for fifte...
235,Point Break,1991,"Action, Crime, Thriller",An F.B.I. Agent goes undercover to catch a gan...
244,Red Dawn,1984,"Action, Drama, Thriller",It is the dawn of World War III. In the west m...
256,Dune,1984,"Action, Adventure, Sci-Fi",A Duke's son leads desert warriors against the...
289,Oldboy,2013,"Action, Drama, Mystery","Obsessed with vengeance, a man sets out to fin..."
395,The Mummy,2017,"Action, Adventure, Fantasy",An ancient Egyptian princess is awakened from ...
431,The Magnificent Seven,1960,"Action, Adventure, Western",Seven gunfighters are hired by Mexican peasant...
467,The Magnificent Seven,2016,"Action, Adventure, Western",Seven gunmen from a variety of backgrounds are...


Ran into an issue here, since movie_name is gonna be the output for my recommendation system, having duplicate titles may be confusing. Therefore, I will need a unique identifer... My plan is to add the year to the title to differenitate any movies of the same name. 

In [165]:
# adding the year to the movie_name of duplicate movie_name to provide some uniqueness 
movies['movie_name'] = movies.apply(lambda row: f"{row['movie_name']} ({row['year']})" if row['movie_name'] in duplicates['movie_name'].values else 
row['movie_name'],axis=1)


In [167]:
duplicates

Unnamed: 0,movie_name,year,genre,description
23,Dune,2021,"Action, Adventure, Drama",A noble family becomes embroiled in a war for ...
67,The Mummy,1999,"Action, Adventure, Fantasy",At an archaeological dig in the ancient city o...
98,Oldboy,2003,"Action, Drama, Mystery",After being kidnapped and imprisoned for fifte...
235,Point Break,1991,"Action, Crime, Thriller",An F.B.I. Agent goes undercover to catch a gan...
244,Red Dawn,1984,"Action, Drama, Thriller",It is the dawn of World War III. In the west m...
256,Dune,1984,"Action, Adventure, Sci-Fi",A Duke's son leads desert warriors against the...
289,Oldboy,2013,"Action, Drama, Mystery","Obsessed with vengeance, a man sets out to fin..."
395,The Mummy,2017,"Action, Adventure, Fantasy",An ancient Egyptian princess is awakened from ...
431,The Magnificent Seven,1960,"Action, Adventure, Western",Seven gunfighters are hired by Mexican peasant...
467,The Magnificent Seven,2016,"Action, Adventure, Western",Seven gunmen from a variety of backgrounds are...


In [169]:
movies.head()

Unnamed: 0,movie_name,year,genre,description
12,Black Panther,2018,"Action, Adventure, Sci-Fi","T'Challa, heir to the hidden but advanced king..."
14,Top Gun,1986,"Action, Drama",As students at the United States Navy's elite ...
15,Avatar,2009,"Action, Adventure, Fantasy",A paraplegic Marine dispatched to the moon Pan...
20,The Hunger Games,2012,"Action, Adventure, Sci-Fi",Katniss Everdeen voluntarily takes her younger...
23,Dune (2021),2021,"Action, Adventure, Drama",A noble family becomes embroiled in a war for ...


We can see row 23 is an example that how function worked. Now, let's drop the year column.

In [185]:
movies = movies.drop(['year'],axis=1)
movies = movies.drop(['genre'],axis=1)
movies

Unnamed: 0,movie_name,description
12,Black Panther,t challa heir to the hidden but advanced kingd...
14,Top Gun,as students at the united states navy s elite ...
15,Avatar,a paraplegic marine dispatched to the moon pan...
20,The Hunger Games,katniss everdeen voluntarily takes her younger...
23,Dune (2021),a noble family becomes embroiled in a war for ...
...,...,...
668,Safe House,a young cia agent is tasked with looking after...
670,Godzilla,the world is beset by the appearance of monstr...
671,The Matrix Reloaded,freedom fighters neo trinity and morpheus cont...
673,Hellboy (2004),a demon raised from infancy after being conjur...


The description column along with the other columns need to be standardize, so I will clean the text to ensure it's all in lowercase and there are no punctuations.

In [173]:
def standardized_column_text(column_text):
    column_text = column_text.lower()
    column_text = ''.join([char if char not in string.punctuation else ' ' for char in column_text])
    column_text = ' '.join(column_text.split())
    return column_text

In [177]:
movies['description'] = movies['description'].apply(standardized_column_text)
movies.description

12     t challa heir to the hidden but advanced kingd...
14     as students at the united states navy s elite ...
15     a paraplegic marine dispatched to the moon pan...
20     katniss everdeen voluntarily takes her younger...
23     a noble family becomes embroiled in a war for ...
                             ...                        
668    a young cia agent is tasked with looking after...
670    the world is beset by the appearance of monstr...
671    freedom fighters neo trinity and morpheus cont...
673    a demon raised from infancy after being conjur...
674    bound by a shared destiny a teen bursting with...
Name: description, Length: 500, dtype: object

In [187]:
movies 

Unnamed: 0,movie_name,description
12,Black Panther,t challa heir to the hidden but advanced kingd...
14,Top Gun,as students at the united states navy s elite ...
15,Avatar,a paraplegic marine dispatched to the moon pan...
20,The Hunger Games,katniss everdeen voluntarily takes her younger...
23,Dune (2021),a noble family becomes embroiled in a war for ...
...,...,...
668,Safe House,a young cia agent is tasked with looking after...
670,Godzilla,the world is beset by the appearance of monstr...
671,The Matrix Reloaded,freedom fighters neo trinity and morpheus cont...
673,Hellboy (2004),a demon raised from infancy after being conjur...


# Building Vectors

In [195]:
#Using TF-IDF to convert text to numerical vectors
tfidf_vectors = TfidfVectorizer(stop_words="english")

tfidf_features = tfidf_vectors.fit_transform(movies['description'])


# Computing similarity scores

I'll now use cosine similarity to find the angle between each vector to see how similar each movie descriptions are to each other. 

In [201]:
cosine_simi = cosine_similarity(tfidf_features,tfidf_features)

Now, in order to take the user's input, a function is the best implementation.

# Recommendation System in action

In [209]:
def movie_recommendation_system(input_data,tfidf_vectors=tfidf_vectors,cosine_simi=cosine_simi):
    # converting user input to vectors
    input_data_vectors = tfidf_vectors.transform([input_data])
    #compare similiarity between user's description and descriptions in dataset
    input_simi_scores = cosine_similarity(input_data_vectors, tfidf_features)

    simi_scores = list(enumerate(input_simi_scores[0]))
    simi_scores = sorted(simi_scores,key=lambda x:x[1], reverse=True)
    simi_scores = simi_scores[1:6]
    movie_index = [i[0] for i in simi_scores]
    return movies['movie_name'].iloc[movie_index]
    

    

In [211]:
user_input = "I love thrilling action movies set in space, with a comedic twist."
print(movie_recommendation_system(user_input))

291    George of the Jungle
132                  Aliens
272         The Incredibles
428        Charlie's Angels
191                 Shooter
Name: movie_name, dtype: object
