<a href="https://www.kaggle.com/code/priyanshukr7/movie-recommendation-system?scriptVersionId=178995110" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Import The Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from wordcloud import WordCloud

# Import the Dataset

In [None]:
# Load the two CSV files of the tmdb-movie-metadata Kaggle input:
# one with the movie metadata, one with the credits.
movies = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv')
credits = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv') 

In [None]:
movies.head()

In [None]:
movies.shape

In [None]:
credits.head()

# Merge the Datasets

In [None]:
# Join the credits columns onto the movie metadata, matching rows by title
# (merge defaults to an inner join).
# NOTE(review): titles are presumably not unique; a title that occurs more than
# once on both sides produces duplicated rows here. Joining on the movie id
# would avoid that — TODO confirm the id column names in both CSVs.
movies = movies.merge(credits,on='title')

In [None]:
movies.head()

In [None]:
movies.info()

In [None]:
movies.isnull().sum()

In [None]:
# Derive the release year from release_date and store it as a new column.
movies['release_year'] = pd.to_datetime(movies['release_date']).dt.year
# Count movies per year; sort_index() orders the counts by year instead of by
# frequency (the value_counts default).
movie_counts = movies['release_year'].value_counts().sort_index()
# One bar per year, bar height = number of movies released that year.
fig = go.Figure(data=go.Bar(x=movie_counts.index, y=movie_counts.values))
# Dark plot/paper background with white text, plus title and axis labels.
fig.update_layout(
    plot_bgcolor='rgb(17, 17, 17)',  
    paper_bgcolor='rgb(17, 17, 17)',  
    font_color='white', 
    title='Number of Movies Released Each Year',  
    xaxis=dict(title='Year'),  
    yaxis=dict(title='Number of Movies')
)
# Colour the bars red.
fig.update_traces(marker_color='red')
fig.show()

# Most common words in titles

In [None]:
# Collect every original title as an array of strings.
titles = movies['original_title'].values

# Concatenate all titles into one space-separated text for the word cloud.
text = ' '.join(titles)

# Build the word cloud from that text (black background, red colour map).
wordcloud = WordCloud(background_color='black', colormap='Reds').generate(text)

# Render the word cloud as an image without axes.
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
# NOTE(review): the title is drawn in white; it is presumably only readable
# when the figure background is dark — confirm against the rendering theme.
plt.title('Most Common Words in Titles', color='white')
plt.show()

# Choose participating columns

In [None]:
# Keep only the columns used to build the recommender's tag text.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells never act on a slice of the merged table (which may
# trigger pandas' SettingWithCopyWarning).
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']].copy()

In [None]:
movies.head()

# Extract the useful contents from the columns

In [None]:
import ast
def convert(text):
    """Return the 'name' value of every entry in a stringified list of dicts.

    ``text`` is parsed with ``ast.literal_eval``; the parsed value is iterated
    and the ``'name'`` item of each element is collected, in order, into a
    new list.
    """
    return [entry['name'] for entry in ast.literal_eval(text)]

In [None]:
# Drop every row that has a missing value, then renumber the index 0..n-1.
# The similarity matrix built later is addressed by row position, so index
# labels have to equal positions; dropna() on its own leaves gaps in the
# labels wherever a row was removed.
movies = movies.dropna().reset_index(drop=True)

In [None]:
movies['genres'] = movies['genres'].apply(convert)
movies.head()

In [None]:
movies['keywords'] = movies['keywords'].apply(convert)
movies.head()

In [None]:
def convert3(text, limit=3):
    """Return the 'name' value of the first ``limit`` entries of a stringified list.

    Args:
        text: String that ``ast.literal_eval`` parses into an iterable of
            dicts, each having a ``'name'`` key.
        limit: Maximum number of names to return. Defaults to 3, which is the
            value that used to be hard-coded.

    Returns:
        A list with at most ``limit`` names, in their original order.
    """
    names = []
    for position, entry in enumerate(ast.literal_eval(text)):
        # Stop as soon as enough names were collected instead of walking the
        # rest of the entries.
        if position >= limit:
            break
        names.append(entry['name'])
    return names

In [None]:
movies['cast'] = movies['cast'].apply(convert3)
movies.head()

In [None]:
def fetch_director(text):
    """Return the names of all entries whose 'job' is 'Director'.

    ``text`` is parsed with ``ast.literal_eval``; every element is expected to
    provide both a ``'job'`` and a ``'name'`` item. The result is a list and
    may be empty or hold several names.
    """
    crew_entries = ast.literal_eval(text)
    return [member['name'] for member in crew_entries if member['job'] == 'Director']

In [None]:
movies['crew'] = movies['crew'].apply(fetch_director)
movies.head()

# Join the words

In [None]:
def collapse(L):
    """Return a new list in which every string of ``L`` has its spaces removed."""
    return [item.replace(" ", "") for item in L]

In [None]:
# Remove the spaces inside every multi-word entry of the four list columns
# (e.g. "Sam Worthington" -> "SamWorthington"), so each entry stays one token
# when the lists are later joined into a single space-separated string.
movies['cast'] = movies['cast'].apply(collapse)
movies['crew'] = movies['crew'].apply(collapse)
movies['genres'] = movies['genres'].apply(collapse)
movies['keywords'] = movies['keywords'].apply(collapse)

In [None]:
movies.head()

In [None]:
# Turn each overview string into a list of whitespace-separated words, so it
# can be concatenated with the other list-valued columns.
movies['overview'] = movies['overview'].apply(lambda x:x.split())
movies.head()

# Join all the participating columns

In [None]:
# Row by row, concatenate the five list columns into one combined list of tokens.
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [None]:
# The source columns are merged into 'tags' already; keep only what is left
# (movie_id, title and tags) in a new frame.
new = movies.drop(columns=['overview','genres','keywords','cast','crew'])

In [None]:
# Collapse each list of tokens into one space-separated string.
new['tags'] = new['tags'].apply(" ".join)
new.head()

# Stemming

In [None]:
import nltk

In [None]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [None]:
def stem(text):
    """Stem every whitespace-separated word of ``text`` and re-join with spaces.

    Each word is passed through the module-level stemmer ``ps``; the stemmed
    words are returned as one string separated by single spaces.
    """
    return " ".join(ps.stem(token) for token in text.split())

In [None]:
# Replace every word in the tag text by its stem.
new['tags'] = new['tags'].apply(stem)

# Feature Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: vocabulary capped at 5000 features, with
# scikit-learn's built-in English stop-word list removed.
cv = CountVectorizer(max_features=5000,stop_words='english')

In [None]:
# Fit the vocabulary on the tag text and count the words of every movie;
# toarray() converts the result into a dense NumPy array (one row per entry
# of new['tags']).
vector = cv.fit_transform(new['tags']).toarray()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between all rows of `vector`:
# similarity[i][j] is the score between row i and row j.
similarity = cosine_similarity(vector)

In [None]:
similarity

In [None]:
# (position, score) pairs for row 0, best score first; slots 1..5 of that ranking.
sorted(enumerate(similarity[0]), key=lambda pair: pair[1], reverse=True)[1:6]

In [None]:
def recommend(movie):
    """Print the titles of the five movies most similar to ``movie``.

    Looks ``movie`` up in the ``title`` column of the module-level frame
    ``new``, reads that row of the module-level ``similarity`` matrix and
    prints the titles of the five highest-scoring other rows, best first.

    Args:
        movie: Exact title to look up. If several rows share the title, the
            first one is used.

    Raises:
        IndexError: If no row of ``new`` has that title.
    """
    # Row *positions* of the matching title. ``similarity`` is a plain array
    # addressed by position, so the frame's index labels must not be used
    # here: they stop matching positions once rows were dropped from the frame.
    matches = np.flatnonzero((new['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie {movie!r} not found in the title column")
    position = int(matches[0])

    # Rank every other row by similarity, highest first. The queried row is
    # excluded explicitly rather than assumed to sort into first place.
    ranked = sorted(
        (pair for pair in enumerate(similarity[position]) if pair[0] != position),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for row, _score in ranked[:5]:
        print(new.iloc[row].title)

In [None]:
recommend('Batman Begins')

# Exporting the desired files

In [None]:
import pickle

In [None]:
# Serialise the movie table as a plain dict. The context manager guarantees
# the file is flushed and closed as soon as the dump has finished.
with open('movie_dict.pkl', 'wb') as movie_dict_file:
    pickle.dump(new.to_dict(), movie_dict_file)

In [None]:
# Serialise the similarity matrix. The context manager guarantees the file is
# flushed and closed as soon as the dump has finished.
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)