In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import difflib
path = r'data\netflix_titles.csv'
df = pd.read_csv(path,encoding='latin-1')

In [15]:
# 
col = ['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description']
df = df[col]
df.head(3)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...


In [16]:
print(f'Total null value: \n{df.isnull().sum()}')
df = df.dropna()


Total null value: 
show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


In [17]:
# make combine feature to extract to vector 
content_feature = ['type','director','description','cast']
df['combined_feature'] = df['type'] + ' ' + \
                         df['director'] + ' ' + \
                         df['description'] + ' ' + \
                         df['cast']
df = df.reset_index(drop=True)

In [None]:
# transfer features to vector 
vectorize = TfidfVectorizer(stop_words='english',max_features=10000)
vector_matrix = vectorize.fit_transform(df['combined_feature'])

In [None]:
# save movie index to indices
df['title_lower'] = df['title'].str.lower().str.strip()
indices = pd.Series(df.index, index=df['title_lower']).drop_duplicates()

In [20]:
def content_based_filter(title,top=5):
    idx = indices[title]

    # calculate cosine similarity
    cosine_sim = linear_kernel(vector_matrix[idx], vector_matrix).flatten()
    sim_score = list(enumerate(cosine_sim))

    #sort similarity
    sim_score_sorted = sorted(sim_score, key=lambda x: x[1], reverse=True)
    sim_score_sorted = sim_score_sorted[1:top+1]
    # get top n movie recommend
    movie_index = [i[0] for i in sim_score_sorted]

    return df.iloc[movie_index][['title','description','director','cast','duration']]

In [None]:

name = input('enter name: ').lower().strip()

# find closest movie name from data that match input
all_titles = df['title_lower'].tolist()
closest_match = difflib.get_close_matches(name,all_titles,n=1,cutoff=0.8)
print(closest_match)

if closest_match:
    print(content_based_filter(closest_match))
else:
    print('cant find movie name')


['strange voices']
                                      title  \
1666  Betty White: First Lady of Television   
433                     The Yeti Adventures   
3304                   The Ryan White Story   
3224         Interview with a Serial Killer   
555             Secret Magic Control Agency   

                                            description  \
1666  This documentary on actress and television pro...   
433   An explorer and a detective set off for the sn...   
3304  After contracting HIV from a tainted blood tre...   
3224  In this jailhouse interview, Arthur Shawcross,...   
555   Hansel and Gretel of fairy tale fame â now a...   

                                 director  \
1666                      Steve Boettcher   
433   Pierre Greco, Nancy Florence Savard   
3304                        John Herzfeld   
3224                   Christopher Martin   
555                     Aleksey Tsitsilin   

                                                   cast duration  
1666 