In [None]:
import json
import os
import re
import warnings
from pathlib import Path
from string import punctuation

# Silence library warnings before the third-party imports run.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from unidecode import unidecode


In [None]:
# Dataset folder. Defaults to the original local location; set the
# MOVIE_DATA_DIR environment variable to run the notebook on another machine.
DATA_DIR = Path(os.environ.get(
    "MOVIE_DATA_DIR",
    r"C:\Users\kannu\OneDrive\Desktop\DataScience\Projects\Movie_Recommender_System\datasets",
))

# Raw inputs: per-movie cast/crew credits and per-movie metadata.
credits = pd.read_csv(DATA_DIR / "tmdb_5000_credits.csv")
movies_data = pd.read_csv(DATA_DIR / "tmdb_5000_movies.csv")

In [None]:
# Preview the first rows of the credits table.
credits.head()

In [None]:
# Preview the first rows of the movies table.
movies_data.head()

### Merging both datasets

In [None]:
# Join on the TMDB id rather than the title: titles are not unique, so a
# title join pairs every same-named movie with every same-named credits row
# and attaches the wrong cast/crew to some of them.
# NOTE(review): assumes credits has `movie_id` and `title` columns (as in the
# Kaggle TMDB 5000 files) — confirm against credits.head().
credits_by_id = credits.drop(columns=['title']).rename(columns={'movie_id': 'id'})
# validate= raises if either side unexpectedly repeats an id.
combined = movies_data.merge(credits_by_id, on='id', validate='one_to_one')

In [None]:
# (rows, columns) of the movies table, for comparison with the merged frame.
movies_data.shape

In [None]:
# (rows, columns) of the credits table.
credits.shape

In [None]:
# (rows, columns) after the merge; more rows than either input means the join key matched more than once.
combined.shape

### Selecting only required features

In [None]:
# Column names, dtypes and non-null counts of the merged frame.
combined.info()

In [None]:
# Keep only the columns used to build the recommender's text features.
# .copy() makes `movies` an independent frame, so the column assignments in
# the following cells do not write into a slice of `combined`.
movies = combined[['id','title','overview','genres','keywords','cast','crew']].copy()

In [None]:
# Preview the selected columns.
movies.head()

In [None]:
# Number of missing values per column.
movies.isna().sum()

In [None]:
# Drop rows with any missing field and renumber the index 0..n-1.
# The similarity matrices built later are positional, and the recommender
# functions look rows up through the DataFrame index, so index labels must
# line up with row positions. Rebinding (instead of inplace=True) also keeps
# the cell safe to re-run.
movies = movies.dropna().reset_index(drop=True)

In [None]:
#from genres we need only the names
#from keywords we also need only the names
#from cast we will pick the real names of the first 3 listed actors
#from crew we will pick the director


In [None]:
def name_extracter(data):
    """Return the `name` of every entry in a JSON list, space-separated.

    `data` is a JSON string encoding a list of objects that each carry a
    "name" key. Spaces inside a name are removed so a multi-word name stays
    one token; the resulting names are joined with single spaces.
    """
    entries = json.loads(data)
    return " ".join(entry['name'].replace(" ", "") for entry in entries)

# Replace the raw JSON in `keywords` with space-separated keyword names.
# Not re-runnable: after this assignment the column no longer holds JSON for name_extracter to parse.
movies['keywords']=movies['keywords'].apply(name_extracter)


In [None]:
# Replace the raw JSON in `genres` with space-separated genre names.
# Not re-runnable: after this assignment the column no longer holds JSON for name_extracter to parse.
movies['genres']=movies['genres'].apply(name_extracter)

In [None]:
def top_3_actors(data):
    """Return the names of the first three entries of a JSON cast list.

    `data` is a JSON string encoding a list of objects with a "name" key.
    Only the first three entries (in list order) are used; spaces inside
    each name are removed and the names are joined with single spaces.
    Fewer than three entries simply yields fewer names.
    """
    cast_entries = json.loads(data)
    # zip against range(3) stops after three entries without slicing.
    leading_names = [
        member['name'].replace(" ", "")
        for _, member in zip(range(3), cast_entries)
    ]
    return " ".join(leading_names)
            

In [None]:
# Replace the raw JSON in `cast` with the names returned by top_3_actors.
# Not re-runnable: after this assignment the column no longer holds JSON to parse.
movies['cast']=movies['cast'].apply(top_3_actors)

In [None]:
def director(data):
    """Return the director's name from a JSON crew list, or None.

    `data` is a JSON string encoding a list of crew objects with "job" and
    "name" keys. The first entry whose job is exactly "Director"
    (case-insensitive) wins; spaces are removed from the returned name.

    Returns None when no entry has that job.
    """
    for member in json.loads(data):
        # Exact match on the job title. A substring test would also accept
        # "Director of Photography", "Art Director", "Casting Director", ...
        # and return whichever of those happens to be listed first.
        job = str(member.get('job') or "").strip().lower()
        if job == "director":
            return member['name'].replace(" ", "")
    return None
        
            


In [None]:
# Replace the raw JSON in `crew` with the name returned by director (None when it finds no match).
# Not re-runnable: after this assignment the column no longer holds JSON to parse.
movies['crew']=movies['crew'].apply(director)

In [None]:
# Show only the first rows rather than the whole frame as cell output.
movies.head()

In [None]:
# Build one text blob per movie from all descriptive fields, separated by
# single spaces. Missing pieces (e.g. `crew` is None when no director was
# found) are treated as empty text; with plain `+` a single missing field
# turns the whole tag into NaN and the movie loses all of its text.
other_tag_columns = ["genres", "keywords", "cast", "crew"]
movies["tags"] = movies["overview"].fillna("").str.cat(
    [movies[column].fillna("") for column in other_tag_columns],
    sep=" ",
)

In [None]:
# English stop-word list from NLTK; clean() filters tokens against it.
stopwords_list=stopwords.words('english')

In [None]:
#Accented character handling

def accented(data):
    """Convert `data` to a string and pass it through `unidecode`.

    The `str()` conversion means non-string values are accepted; they are
    processed as their string form.
    """
    text = str(data)
    return unidecode(text)

def white_space(data):
    """Collapse every run of whitespace in `data` into a single space.

    Tabs and newlines count as whitespace. Leading and trailing runs are
    collapsed to one space as well; they are not stripped.
    """
    # `re` comes from the notebook's import cell instead of being imported
    # again on every call.
    return re.sub(r"\s+", " ", data)

# Step 1: run every tag string through accented().
movies_accented=movies['tags'].apply(accented)
# Step 2: collapse runs of whitespace in the result.
movies_white_space=movies_accented.apply(white_space)

In [None]:
#cleaning
def clean(data):
    """Lower-case `data` and drop punctuation tokens and stop words.

    The text is split with `word_tokenize`. A token is discarded when it
    occurs inside the `punctuation` string, or when its lower-cased form is
    in `stopwords_list`. The surviving tokens are returned lower-cased and
    joined with single spaces.
    """
    kept = []
    for token in word_tokenize(data):
        if token in punctuation:
            continue
        lowered = token.lower()
        if lowered in stopwords_list:
            continue
        kept.append(lowered)
    return " ".join(kept)

# Lower-case every tag string and remove punctuation tokens and stop words.
movies_cleaned=movies_white_space.apply(clean)

In [None]:
#Performing Lemmatization

def stem(data):
    """Lemmatize every token of `data` as a verb and re-join with spaces.

    Despite the name, this performs lemmatization with `WordNetLemmatizer`
    (part-of-speech tag "v"), not stemming. Tokens come from
    `word_tokenize`.
    """
    lemmatizer = WordNetLemmatizer()
    return " ".join(
        lemmatizer.lemmatize(token, "v") for token in word_tokenize(data)
    )

# Lemmatize the cleaned tag strings; this is the text that gets vectorized below.
movies_stem=movies_cleaned.apply(stem)

In [None]:
# Show only the first processed tag strings rather than the whole series.
movies_stem.head()

In [None]:
#Using CountVectorizer (bag-of-words term counts) to turn the text into vectors

In [None]:
# Term counts over at most 2000 vocabulary terms; terms appearing in more
# than 95% of the documents are ignored.
cv=CountVectorizer(max_features=2000,max_df=.95)
# .toarray() replaces the `.A` shorthand, which recent SciPy releases no
# longer provide on sparse matrices (AttributeError).
cv_data=cv.fit_transform(movies_stem).toarray()

In [None]:
# (number of movies, vocabulary size)
cv_data.shape

In [None]:
#Using TF-IDF weights to turn the text into vectors (same vocabulary limits as the count vectors)
tfidf=TfidfVectorizer(max_features=2000,max_df=.95)
# .toarray() replaces the `.A` shorthand, which recent SciPy releases no
# longer provide on sparse matrices (AttributeError).
tfidf_data=tfidf.fit_transform(movies_stem).toarray()

In [None]:
# Dense TF-IDF matrix: one row per movie, one column per vocabulary term.
tfidf_data

In [None]:
# Using Cosine similarity to find top 5 similar movies

In [None]:
# Pairwise cosine similarity between every pair of movies' count vectors.
cv_similar=cosine_similarity(cv_data)

In [None]:
#Cosine similarity between every pair of movies' TF-IDF vectors

tfidf_similar=cosine_similarity(tfidf_data)

In [None]:
# Square matrix: one row and one column per movie.
cv_similar.shape

In [None]:
#cv_similar[i][j] is the similarity between movie i and movie j.
#Row 0 therefore holds movie 0's similarity to every movie, one value per column.
#To find the 5 movies most similar to movie 0, sort that row and take the 5 highest values (excluding the movie itself).

In [None]:
#Recommender using CountVectorizer

def recommender_cv(movie_title, top_n=5):
    """Print the titles of the movies most similar to `movie_title`.

    Similarity scores are read from `cv_similar`, whose row/column i
    corresponds to the i-th row of `movies` by position.

    Parameters
    ----------
    movie_title : str
        Exact title to look up in ``movies["title"]``. When several rows
        share the title, the first one is used.
    top_n : int, default 5
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If no movie has that title.
    """
    matches = np.flatnonzero((movies["title"] == movie_title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Movie title not found: {movie_title!r}")
    # Use the row *position*, not the DataFrame index label: dropped rows
    # leave gaps in the labels, while the similarity matrix is indexed 0..n-1.
    position = int(matches[0])
    scores = np.asarray(cv_similar[position])
    # Highest score first; the stable sort keeps ties in row order.
    ranked = np.argsort(-scores, kind="stable")
    # Remove the queried movie explicitly instead of assuming it sorts first.
    ranked = ranked[ranked != position][:top_n]
    for other in ranked:
        print(movies["title"].iloc[other])
          


    

In [None]:
#Recommender using TF-IDF
def recommender_tfidf(movie_title, top_n=5):
    """Print the titles of the movies most similar to `movie_title`.

    Similarity scores are read from `tfidf_similar`, whose row/column i
    corresponds to the i-th row of `movies` by position.

    Parameters
    ----------
    movie_title : str
        Exact title to look up in ``movies["title"]``. When several rows
        share the title, the first one is used.
    top_n : int, default 5
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If no movie has that title.
    """
    matches = np.flatnonzero((movies["title"] == movie_title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Movie title not found: {movie_title!r}")
    # Use the row *position*, not the DataFrame index label: dropped rows
    # leave gaps in the labels, while the similarity matrix is indexed 0..n-1.
    position = int(matches[0])
    scores = np.asarray(tfidf_similar[position])
    # Highest score first; the stable sort keeps ties in row order.
    ranked = np.argsort(-scores, kind="stable")
    # Remove the queried movie explicitly instead of assuming it sorts first.
    ranked = ranked[ranked != position][:top_n]
    for other in ranked:
        print(movies["title"].iloc[other])
          

In [None]:
# Example: recommendations for 'Batman' from the count-vector similarity.
recommender_cv('Batman')

In [None]:
# Example: recommendations for 'Batman' from the TF-IDF similarity.
recommender_tfidf('Batman')

In [None]:
#