## Content Based Recommender System

### Exploratory data analysis

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Reading the dataset: load the movies and credits CSV files into two DataFrames
movies = pd.read_csv('./DATA/tmdb_5000_movies.csv')
credits = pd.read_csv('./DATA/tmdb_5000_credits.csv')
# Report the (rows, columns) size and preview the first two rows of movies
print(movies.shape)
movies.head(2)

In [None]:
# Size and first two rows of the credits frame
print(credits.shape)
credits.head(2)

In [None]:
# Join the credits columns onto the movies frame, matching rows by title.
# NOTE(review): titles may not be unique; a title shared by several movies
# would pair every matching row of both frames and produce extra rows —
# TODO confirm against the shapes printed here and consider joining on the
# movie id instead.
movies = movies.merge(credits,on='title')
print(movies.shape)
movies.head(2)

In [None]:
# Keeping only useful columns for the recommendation.
# The explicit .copy() makes `movies` an independent frame, so the in-place
# dropna and the column assignments in the following cells act on this frame
# itself rather than on a slice of the merged frame (which is what triggers
# pandas' SettingWithCopyWarning).
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']].copy()
print(movies.shape)
movies.head(2)

In [None]:
# Count the missing values in each column
movies.isnull().sum()

In [None]:
# Drop rows that have any missing value, then renumber the index 0..n-1.
# recommend() uses a row's index label as a row number into `similarity`,
# so labels must equal positions; without the reset, every label after a
# dropped row would be shifted relative to its position.
movies = movies.dropna().reset_index(drop=True)
# Confirm that no missing values remain
movies.isnull().sum()

In [None]:
# handling genres: look at the raw genres value of the first row to see its format
movies.iloc[0]['genres']

In [None]:
# converting str to list
import ast
def convert(text):
    """Extract the ``'name'`` value of every entry in a stringified list.

    Args:
        text: String containing a Python-literal list of dicts, each with a
            ``'name'`` key.

    Returns:
        A list with the ``'name'`` of each entry, in the original order.
    """
    return [entry['name'] for entry in ast.literal_eval(text)]


In [None]:
# Replace each stringified genres value with the plain list of genre names
movies['genres'] = movies['genres'].apply(convert)
movies.head()

In [None]:
# handling keywords: look at the raw keywords value of the first row
movies.iloc[0]['keywords']

In [None]:
# Replace each stringified keywords value with the plain list of keyword names
movies['keywords'] = movies['keywords'].apply(convert)
movies.head()

In [None]:
# handling cast: look at the raw cast value of the first row
movies.iloc[0]['cast']

In [None]:
# Keeping only the top 3 cast
def convert_cast(text, top_n=3):
    """Return the names of the first ``top_n`` entries of a stringified cast list.

    Args:
        text: String containing a Python-literal list of dicts; each of the
            kept entries must have a ``'name'`` key.
        top_n: Maximum number of names to keep, counted from the start of
            the list. Defaults to 3.

    Returns:
        A list with at most ``top_n`` names, in their original order.

    Raises:
        ValueError, SyntaxError: If ``text`` is not a valid Python literal.
        KeyError: If one of the kept entries has no ``'name'`` key.
    """
    # Slice before reading names so the rest of the cast is never visited.
    return [member['name'] for member in ast.literal_eval(text)[:top_n]]

  

In [None]:
# Keep only the names of the first three cast entries of each movie
movies['cast'] = movies['cast'].apply(convert_cast)
movies.head()

In [None]:
# handling crew: look at the raw crew value of the first row
movies.iloc[0]['crew']

In [None]:
def fetch_director(text):
    """Return the name of the first crew entry whose job is ``'Director'``.

    Args:
        text: String containing a Python-literal list of dicts with ``'job'``
            and ``'name'`` keys.

    Returns:
        A one-element list holding that entry's name, or an empty list when
        no entry has the job ``'Director'``.
    """
    for member in ast.literal_eval(text):
        if member['job'] != 'Director':
            continue
        # Only the first director is kept; later ones are never examined.
        return [member['name']]
    return []

In [None]:
# Reduce crew to a list holding the first 'Director' name (empty if there is none)
movies['crew'] = movies['crew'].apply(fetch_director)
movies.head(2)

In [None]:
# handling overview (converting to list): look at the first overview before the split
movies.iloc[0]['overview']

In [None]:
# Split each overview on whitespace into a list of words, so it can later be
# concatenated with the other list-valued columns
movies['overview'] = movies['overview'].apply(lambda x:x.split())
movies.sample(2)

In [None]:
# Check the first overview after the split
movies.iloc[0]['overview']

In [None]:
# Removing spaces inside each entry so a multi-word value becomes one space-free string
# Example: the first string below is the form before, the second the form after
'Kendrick Lamar'
'KendrickLamar'

def remove_space(L):
    """Return a new list with every space character removed from each string.

    Args:
        L: Iterable of strings.

    Returns:
        A list of the same length where each item has all ``" "`` characters
        stripped out; the input is not modified.
    """
    return [item.replace(" ", "") for item in L]

In [None]:
# Strip the spaces from every entry of the four list-valued columns
movies['cast'] = movies['cast'].apply(remove_space)
movies['crew'] = movies['crew'].apply(remove_space)
movies['genres'] = movies['genres'].apply(remove_space)
movies['keywords'] = movies['keywords'].apply(remove_space)

In [None]:
# Spot-check two random rows after removing the spaces
movies.sample(2)

In [None]:
# Concatenating all the list columns into a single 'tags' list per movie
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
movies.sample(2)

In [None]:
# Inspect the combined tags list of the first row
movies.iloc[0]['tags']

In [None]:
# Dropping the extra columns. The explicit .copy() gives an independent
# frame, so the 'tags' assignments in the following cells modify new_movies
# itself rather than a slice of movies (avoids pandas' SettingWithCopyWarning).
new_movies = movies[['movie_id','title','tags']].copy()
new_movies.sample(3)

In [None]:
# Converting each tags list to one space-separated string
new_movies['tags'] = new_movies['tags'].apply(lambda x: " ".join(x))
new_movies.head()

In [None]:
# Converting the tags strings to lower case
new_movies['tags'] = new_movies['tags'].apply(lambda x:x.lower())
new_movies.head()

### Training and Model

In [None]:
# Inspect the tags string of the first movie before stemming
new_movies.iloc[0]['tags']

In [None]:
import nltk
from nltk.stem import PorterStemmer

In [None]:
ps = PorterStemmer()
def stems(text):
    """Stem every whitespace-separated word of ``text`` with the module-level ``ps``.

    Args:
        text: String of words separated by whitespace.

    Returns:
        The stemmed words joined back together with single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())


In [None]:
# Replace every word of each tags string with its stem, then inspect the first result
new_movies['tags'] = new_movies['tags'].apply(stems)
new_movies.iloc[0]['tags']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Word-count vectorizer over the tags: vocabulary capped at 5000 features,
# English stop words excluded
cv = CountVectorizer(max_features=5000,stop_words='english')
# Fit on the tags and convert the result to a dense array (one row per movie)
vector = cv.fit_transform(new_movies['tags']).toarray()
vector[0]

In [None]:
# Compare the shape of the count matrix with the size of the fitted vocabulary
print(vector.shape)
print(len(cv.get_feature_names_out()))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all rows of the count matrix
similarity = cosine_similarity(vector)
similarity.shape

In [None]:
# Index label of the first row titled 'The Lego Movie'.
# NOTE(review): .index[0] is a label, which equals the row position only if
# the index is a gap-free 0..n-1 range — confirm before using it positionally.
new_movies[new_movies['title'] == 'The Lego Movie'].index[0]

In [None]:
def recommend(movie):
    """Print the titles of the five movies most similar to ``movie``.

    Reads the module-level ``new_movies`` frame and ``similarity`` matrix;
    row ``i`` of ``similarity`` is taken to describe the ``i``-th row of
    ``new_movies``.

    Args:
        movie: Exact title to look up in ``new_movies['title']``. When the
            title occurs more than once, the first occurrence is used.

    Raises:
        IndexError: If no row of ``new_movies`` has that title.
    """
    titles = new_movies['title'].tolist()
    # Use the row *position*, not the index label: `similarity` is a plain
    # array, so a label only works while the index is exactly 0..n-1.
    try:
        position = titles.index(movie)
    except ValueError:
        raise IndexError(f"movie {movie!r} not found in new_movies['title']") from None

    ranked = sorted(enumerate(similarity[position]), key=lambda pair: pair[1], reverse=True)
    # Exclude the queried row explicitly instead of assuming it sorts first
    # (another row with an equal score could precede it).
    closest = [pos for pos, _ in ranked if pos != position][:5]
    for pos in closest:
        print(titles[pos])

In [None]:
# Example: print the recommendations for one title
recommend('Spider-Man 2')

In [None]:
import os
import pickle

# Persist the movie table and the similarity matrix for later reuse.
# The target directory is created if missing, and each file is written
# inside a `with` block so its handle is flushed and closed deterministically.
os.makedirs('artifacts', exist_ok=True)
with open('artifacts/movie_list.pkl', 'wb') as movie_file:
    pickle.dump(new_movies, movie_file)
with open('artifacts/similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)