### PROBLEM STATEMENT
### Build a content-based movie recommender system using natural language processing. The function should take a movie title as input and return the top 3 recommended movies.

In [175]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [176]:
# Load the Netflix titles dataset from a CSV file in the working directory
# and preview the first five rows.
df=pd.read_csv("NETFLIX TITLES.csv")
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81088285,Movie,The Mayo Clinic,"Ken Burns, Christopher Loren Ewers, Erik Ewers",Peter Coyote,United States,"April 19, 2019",2018,TV-14,116 min,Documentaries,A look at how a world-renowned medical institu...
1,81077597,Movie,I Am,Onir,"Juhi Chawla, Rahul Bose, Nandita Das, Sanjay S...","India, Japan","March 4, 2019",2010,TV-MA,106 min,"Dramas, Independent Movies, International Movies",Four individuals in modern India grapple with ...
2,1150871,Movie,Love Jones,Theodore Witcher,"Larenz Tate, Nia Long, Isaiah Washington, Lisa...",United States,"November 1, 2019",1997,R,109 min,"Comedies, Dramas, Independent Movies","In this urban romantic comedy set in Chicago, ..."
3,20077944,Movie,Ghayal,Rajkumar Santoshi,"Sunny Deol, Meenakshi Sheshadri, Amrish Puri, ...",India,"December 31, 2019",1990,TV-14,163 min,"Action & Adventure, Dramas, International Movies","Framed for his older brother's murder, a boxer..."
4,80223779,Movie,Marriage Story,Noah Baumbach,"Scarlett Johansson, Adam Driver, Laura Dern, A...","United States, United Kingdom","December 6, 2019",2019,R,137 min,Dramas,Academy Award-nominated filmmaker Noah Baumbac...


In [177]:
# Summarise column dtypes and non-null counts to see which columns have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4675 entries, 0 to 4674
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       4675 non-null   int64 
 1   type          4675 non-null   object
 2   title         4675 non-null   object
 3   director      3232 non-null   object
 4   cast          4236 non-null   object
 5   country       4326 non-null   object
 6   date_added    4669 non-null   object
 7   release_year  4675 non-null   int64 
 8   rating        4667 non-null   object
 9   duration      4675 non-null   object
 10  listed_in     4675 non-null   object
 11  description   4675 non-null   object
dtypes: int64(2), object(10)
memory usage: 438.4+ KB


In [178]:
## Import Rake for keyword extraction, plus the count vectorizer and cosine-similarity metric
from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [179]:
# Keep only the columns considered for the recommender's text features.
# .copy() makes the result an independent frame, so the columns added and
# rewritten in later cells do not act on a slice of the originally loaded data
# (which can raise SettingWithCopyWarning, silenced here by the global
# warnings filter).
df = df[['title', 'type', 'director', 'rating', 'listed_in', 'description']].copy()


In [180]:
# Preview the frame after narrowing it to the selected columns.
df.head()

Unnamed: 0,title,type,director,rating,listed_in,description
0,The Mayo Clinic,Movie,"Ken Burns, Christopher Loren Ewers, Erik Ewers",TV-14,Documentaries,A look at how a world-renowned medical institu...
1,I Am,Movie,Onir,TV-MA,"Dramas, Independent Movies, International Movies",Four individuals in modern India grapple with ...
2,Love Jones,Movie,Theodore Witcher,R,"Comedies, Dramas, Independent Movies","In this urban romantic comedy set in Chicago, ..."
3,Ghayal,Movie,Rajkumar Santoshi,TV-14,"Action & Adventure, Dramas, International Movies","Framed for his older brother's murder, a boxer..."
4,Marriage Story,Movie,Noah Baumbach,R,Dramas,Academy Award-nominated filmmaker Noah Baumbac...


In [181]:
# Extract RAKE keywords from every description into a new 'Key_words' column.
# The column is built with Series.map rather than by assigning to the rows
# yielded by df.iterrows(): pandas documents that those rows may be copies, so
# writes to them are not guaranteed to reach the DataFrame.
r = Rake()


def _extract_key_words(description):
    """Return the words RAKE picks out of ``description`` as a list."""
    r.extract_keywords_from_text(description)
    # get_word_degrees() returns a dict keyed by word; only the words
    # themselves are needed for the bag of words, not their scores.
    return list(r.get_word_degrees().keys())


df['Key_words'] = df['description'].map(_extract_key_words)

In [182]:
# Replace missing director values with an empty string so the string
# operations applied to this column later (split / lower) do not fail on NaN.
df['director'] = df['director'].fillna('')

In [183]:
# Re-check non-null counts after filling the missing director values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4675 entries, 0 to 4674
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        4675 non-null   object
 1   type         4675 non-null   object
 2   director     4675 non-null   object
 3   rating       4667 non-null   object
 4   listed_in    4675 non-null   object
 5   description  4675 non-null   object
 6   Key_words    4675 non-null   object
dtypes: object(7)
memory usage: 255.8+ KB


In [184]:
def _normalise_tokens(value):
    """Split a comma-separated string into lower-case tokens without spaces.

    Removing the spaces turns each multi-word entry into a single token
    (e.g. "Noah Baumbach" -> "noahbaumbach"), so a full name or genre is
    matched as one unit instead of as its individual words.
    A value that is already a list (the cell was re-run) is normalised again
    instead of failing on a missing ``split`` method.
    """
    parts = value.split(',') if isinstance(value, str) else value
    return [part.lower().replace(' ', '') for part in parts]


# Build the cleaned columns with Series.map instead of assigning to the rows
# yielded by df.iterrows(); those rows may be copies, so writes to them are
# not guaranteed to reach the DataFrame.
df['listed_in'] = df['listed_in'].map(_normalise_tokens)
df['director'] = df['director'].map(_normalise_tokens)

In [185]:
# Concatenate the token lists of the feature columns into one space-separated
# string per title. Each column's tokens are followed by a single space, which
# keeps the tokens of adjacent columns apart.
columns = ['listed_in', 'director', 'Key_words']

# The column is assigned in one step instead of writing to the rows yielded by
# df.iterrows(); those rows may be copies, so writes to them are not
# guaranteed to reach the DataFrame.
df['Bag_of_words'] = [
    ''.join(' '.join(tokens) + ' ' for tokens in row_tokens)
    for row_tokens in zip(*(df[col] for col in columns))
]

# Only the title and its bag of words are needed from here on.
df = df[['title','Bag_of_words']]

In [187]:
# Turn each title's bag of words into a vector of token counts.
count = CountVectorizer()
count_matrix = count.fit_transform(df['Bag_of_words'])

# Pairwise cosine similarity between all rows: with a single argument the
# matrix is compared against itself, so entry [i, j] scores row i against row j.
cosine_sim = cosine_similarity(count_matrix)
print(cosine_sim)

[[1.         0.         0.         ... 0.         0.         0.        ]
 [0.         1.         0.10814761 ... 0.         0.05129892 0.        ]
 [0.         0.10814761 1.         ... 0.         0.10540926 0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.         0.05129892 0.10540926 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [188]:
# Title lookup used by recommend(): the index of a matching entry is used as a
# row number in cosine_sim. Resetting to a fresh 0..n-1 index guarantees the
# labels are row positions even if df no longer carries its default index.
indices = pd.Series(df['title']).reset_index(drop=True)


In [193]:
# Preview the reduced frame: one title and its bag-of-words string per row.
df.head()

Unnamed: 0,title,Bag_of_words
0,The Mayo Clinic,documentaries kenburns christopherlorenewers e...
1,I Am,dramas independentmovies internationalmovies o...
2,Love Jones,comedies dramas independentmovies theodorewitc...
3,Ghayal,action&adventure dramas internationalmovies ra...
4,Marriage Story,dramas noahbaumbach nominated filmmaker noah b...


In [196]:
def recommend(title, cosine_sim = cosine_sim, top_n = 3):
    """Return the titles most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title to look up in the module-level ``indices`` Series.
    cosine_sim : array-like
        Square similarity matrix whose row/column ``i`` corresponds to the
        ``i``-th title. Defaults to the matrix computed earlier in the notebook.
    top_n : int, default 3
        Number of recommendations to return.

    Returns
    -------
    list of str
        Up to ``top_n`` titles, ordered from most to least similar.

    Raises
    ------
    IndexError
        If ``title`` does not occur in ``indices``.
    """
    matches = indices[indices == title].index
    if len(matches) == 0:
        raise IndexError(f"Title {title!r} was not found in the catalogue")
    # When a title occurs more than once, the first occurrence is used.
    idx = matches[0]

    # Remove the queried title explicitly instead of assuming it sorts first:
    # another entry with the same similarity score could otherwise take its
    # place and the query itself would be returned as a recommendation.
    score_series = (
        pd.Series(cosine_sim[idx])
        .drop(idx)
        .sort_values(ascending = False, kind = 'mergesort')  # stable on ties
    )
    top_indices = score_series.iloc[:top_n].index

    # Look titles up by position; fetch the column once rather than
    # rebuilding a Python list for every recommendation.
    titles = df['title']
    return [titles.iloc[i] for i in top_indices]

In [199]:
# Example query: recommendations for a horror title from the catalogue.
recommend("The Last Exorcism")

['As Above, So Below', 'Lifechanger', '13 Sins']