In [82]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Recommendation System**

Recommendation System is a system that seeks to predict or filter preferences according to the user’s choices. Recommendation systems are utilized in a variety of areas including movies, music, news, books, research articles, search queries, social tags, and products in general. Netflix, Amazon, and other companies use recommendation systems to help their users find the right product or movie for them.

**Content-based Filtering**: 
These systems suggest recommendations based on item metadata (movie, product, song, etc.). The main idea is that if a user likes an item, the user will also like items similar to it.

**Collaborative Filtering**: 
These systems make recommendations by grouping users with similar interests. For this approach, item metadata is not required.

**EXPLORATORY DATA ANALYSIS**

In [83]:
#importing libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import plotly.express as px
import plotly.graph_objects as go
import ast
from collections import Counter
import nltk
import wordcloud
from wordcloud import WordCloud

In [84]:
# Load the two TMDB tables: movie metadata and the matching cast/crew credits.
movies_csv = '/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv'
credits_csv = '/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv'

movies = pd.read_csv(movies_csv)
credits = pd.read_csv(credits_csv)

In [85]:
# Preview the first three rows of the movies table.
movies.head(3)

In [86]:
# Preview the first three rows of the credits table.
credits.head(3)

In [87]:
# Report (rows, columns) for each table; the second line must read the
# credits frame, not the movies frame again.
print("shape of movies dataset:", movies.shape)
print("shape of credits dataset:", credits.shape)

In [88]:
# joining the two datasets
# Join on the movie identifier instead of `title`: a title-based join pairs
# every movie with every credits row that shares its title, which duplicates
# rows whenever a title occurs more than once.
# Assumes credits.movie_id carries the same identifier as movies.id.
# credits' own `title` column is dropped first so the result keeps a single
# `title` column, exactly as the title-based join did.
movies = pd.merge(
    left=movies,
    right=credits.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
)

In [89]:
# Make a copy of the merged movies dataset: movies1 is used for the EDA,
# while `movies` itself is kept untouched for the recommender system.
movies1 = movies.copy()

In [90]:
# dropping columns that the EDA does not use
eda_unused_columns = [
    'homepage', 'tagline', 'id', 'overview',
    'status', 'original_title', 'movie_id',
]
movies1 = movies1.drop(columns=eda_unused_columns)

In [91]:
# duration of the data: time between the earliest and the latest release date
release_dates = pd.to_datetime(movies1['release_date'])
movies1['release_date'] = release_dates
print(release_dates.max() - release_dates.min())

In [92]:
# Check the EDA frame after dropping columns and converting release_date.
movies1.head(3)

In [93]:
# tidying up the genres, production_companies and production_countries columns
def func(obj):
    """Return the ``'name'`` value of every entry in a stringified list of dicts.

    ``obj`` is parsed with ``ast.literal_eval``; each resulting entry must
    support ``entry['name']``. The names are returned in their original order.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

In [94]:
# Parse every stringified list column into a plain list of names.
# `genres` has to be included: the genre counter further down iterates over
# each row's genres and would otherwise walk the raw string one character
# at a time.
for column in ('genres', 'production_companies', 'production_countries'):
    movies1[column] = movies1[column].apply(func)



In [95]:
# Inspect the frame after the list-like columns have been parsed.
movies1.head(3)

**time to visualize our data**

In [96]:
# Count how often each genre entry occurs across all movies.
genres = Counter()
for genre_names in movies1['genres']:
    genres.update(genre_names)

Genres = (
    pd.DataFrame.from_dict(genres, orient='index')
    .reset_index()
    .rename(columns={'index': 'Genres', 0: 'Frequency'})
)

# Entries that occur fewer than 200 times share the single label "Others".
is_rare = Genres['Frequency'] < 200
Genres.loc[is_rare, 'Genres'] = 'Others'

fig = px.pie(Genres, values='Frequency', names='Genres', width=800, height=500)
fig.update_layout(
    title="Distribution of Genres",
    legend_title="Genre",
    font={'size': 14},
    template='plotly',
)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

In [97]:
# word cloud for genre, sized by the frequencies held in the `genres` counter
wc = WordCloud(
    width=1600,
    height=900,
    margin=5,
    min_font_size=10,
    background_color='white',
).fit_words(genres)

plt.imshow(wc)
plt.axis('off')
plt.show()

In [98]:
# top production companies
prod = Counter()
for company_names in movies1['production_companies']:
    prod.update(company_names)

# Build a (company, frequency) table and keep the 15 most frequent companies.
movie_prod = (
    pd.DataFrame.from_dict(prod, orient='index')
    .reset_index()
    .rename(columns={'index': 'Production Company', 0: 'Frequency'})
    .sort_values(by=['Frequency'], ascending=False)
    .head(15)
    .reset_index(drop=True)
)
movie_prod.style.background_gradient(cmap='RdBu_r')

In [99]:
# Bar chart of the 15 most frequent production companies, one colour each.
fig = px.bar(
    movie_prod,
    x='Production Company',
    y='Frequency',
    color='Production Company',
    width=1000,
    height=650,
)
fig.update_layout(
    title="Top 15 Production Companies",
    xaxis_title="Production Companies",
    yaxis_title="Frequency",
    legend_title="Production Companies",
    font={'size': 14},
    template='plotly',
)
fig.show()


In [100]:
# The 15 rows with the highest popularity value, re-indexed from 0.
popular = (
    movies1[['title', 'popularity']]
    .sort_values(by='popularity', ascending=False)
    .head(15)
    .reset_index(drop=True)
)

# Marker size and colour both encode the popularity value.
fig = px.scatter(
    popular,
    x='title',
    y='popularity',
    color='popularity',
    size='popularity',
    width=950,
    height=650,
)
fig.update_layout(
    title="Top 15 popular movies of all time",
    xaxis_title="Movies",
    yaxis_title="Popularity",
    legend_title="Popularity",
    font={'size': 14},
    template='plotly',
)
fig.show()


In [101]:
# Number of movies per original language (index = language, column = count).
lang = movies1.groupby('original_language')[['title']].count()

# Fold languages with fewer than 20 movies into one "Others" slice.
# The language label lives in the index, so the index is what gets renamed;
# the `title` count column has to stay numeric for the pie chart.
MIN_MOVIES_PER_LANGUAGE = 20
rare_languages = lang.index[lang['title'] < MIN_MOVIES_PER_LANGUAGE]
lang = lang.rename(index=dict.fromkeys(rare_languages, 'Others'))
# Several rows now share the "Others" label; add their counts together.
lang = lang.groupby(level=0).sum()

fig = px.pie(lang, values='title', names=lang.index, width=800, height=500)
fig.update_layout(
    title="Distribution of Language",
    legend_title="Language",
    font=dict(
        size=14
    )
)
fig.layout.template='plotly'
fig.update_traces(textposition='outside', textinfo='percent+label')
fig.show()

In [102]:
# Scatter of budget (y) against revenue (x) for every movie.
fig = px.scatter(movies1, y='budget',x='revenue',width=950, height=500) 
fig.update_layout(
    # The title now describes what is plotted; it previously read
    # "No. of movies produced over the years".
    title="Budget vs. Revenue",
    xaxis_title="Revenue",
    yaxis_title="Budget",
    font=dict(
        size=14
    )
)
fig.layout.template = 'plotly'
fig.show()

**movies recommendation system**

In [103]:
# Back to the full merged frame (movies1 was only a copy used for the EDA).
movies.head()

In [104]:
# selecting only useful columns
# .copy() gives movies2 its own data, so the column assignments made on it
# later do not act on a slice of `movies`.
movies2 = movies[['id', 'title', 'overview', 'genres', 'cast', 'keywords', 'crew']].copy()
# Display the new frame (not the original `movies`) to check the selection.
movies2.head()



In [105]:
# Number of missing values in each selected column.
movies2.isnull().sum()

In [106]:
# dropping rows with null values
# The index is rebuilt afterwards so row labels equal row positions: the
# similarity matrix built later is positional, and recommend_me looks rows up
# through the frame's index.
movies2 = movies2.dropna().reset_index(drop=True)

Several columns (genres, keywords, cast, crew) are stored as strings that encode lists of dictionaries, so we need to convert them into a safe and usable structure. Let’s apply the `ast.literal_eval()` function to these features.

In [107]:
# NOTE: same behaviour as the `func` defined in the EDA section above;
# it is defined again here for the recommender section.
def func(obj):
    """Parse ``obj`` with ``ast.literal_eval`` and collect each entry's ``'name'``."""
    parsed_entries = ast.literal_eval(obj)
    names = [item['name'] for item in parsed_entries]
    return names

In [108]:
# Inspect movies2 after the null rows have been dropped.
movies2.head()

In [109]:
def func1(obj, limit=3):
    """Return the ``'name'`` of the first ``limit`` entries of a stringified list.

    Parameters
    ----------
    obj : str
        Text that ``ast.literal_eval`` turns into an iterable of dicts, each
        with a ``'name'`` key.
    limit : int, default 3
        Maximum number of names to return. The default keeps the original
        behaviour of returning at most three names.

    Returns
    -------
    list
        Up to ``limit`` names, in their original order.
    """
    names = []
    for entry in ast.literal_eval(obj):
        # Stop before touching the entry once enough names are collected.
        if len(names) >= limit:
            break
        names.append(entry['name'])
    return names

In [110]:
# Replace each raw cast string with the list of names returned by func1
# (at most the first three entries).
movies2['cast'] = movies2['cast'].apply(func1)

In [111]:
def func2(obj):
    """Return ``[name]`` for the first entry whose ``'job'`` is ``'Director'``.

    ``obj`` is parsed with ``ast.literal_eval``. An empty list is returned
    when no entry has that job.
    """
    for member in ast.literal_eval(obj):
        if member['job'] != 'Director':
            continue
        # Only the first director is kept.
        return [member['name']]
    return []

In [112]:
# Reduce each raw crew string to a list holding the first director's name
# (an empty list when no entry has the job 'Director').
movies2['crew'] = movies2['crew'].apply(func2)


In [113]:
# tokenising the overview text into a list of whitespace-separated words
movies2['overview'] = movies2['overview'].map(str.split)

In [114]:
# Inspect movies2 after cast, crew and overview have been converted to lists.
movies2.head()


In [115]:
# Remove the spaces inside each multi-word name (e.g. "Science Fiction" ->
# "ScienceFiction") so that a name ends up as one token in the tags.
def _compact_names(value):
    """Return the entry names of ``value`` with their inner spaces removed.

    ``genres`` and ``keywords`` are still raw strings at this point (only
    ``cast`` and ``crew`` were parsed above), so a string is parsed with
    ``func`` first. Without that step the comprehension would walk the raw
    string one character at a time. Values that are already lists are used
    as they are, which also keeps this cell safe to re-run.
    """
    names = func(value) if isinstance(value, str) else value
    return [name.replace(" ", "") for name in names]


for column in ('genres', 'keywords', 'cast', 'crew'):
    movies2[column] = movies2[column].apply(_compact_names)

In [116]:
# Build the `tags` column by concatenating the five list-valued columns, in
# this order: overview words, genres, keywords, cast, crew.
movies2['tags'] = movies2['overview'] + movies2['genres'] + movies2['keywords'] + movies2['cast'] + movies2['crew']


In [117]:
# Keep only what the recommender needs. .copy() makes movies4 an independent
# frame, so assigning to its `tags` column later does not act on a slice of
# movies2.
movies4 = movies2[['id', 'title', 'tags']].copy()

In [118]:
# Inspect the reduced frame: id, title and the list-valued tags.
movies4.head()

In [119]:
# Silence pandas' chained-assignment warning, then collapse each list of tag
# tokens into one space-separated string.
pd.set_option('mode.chained_assignment', None)
movies4['tags'] = movies4['tags'].apply(" ".join)

In [120]:
# lower-casing every character in the tags column
movies4['tags'] = movies4['tags'].map(str.lower)


In [121]:
# text vectorization
from sklearn.feature_extraction.text import CountVectorizer
# Count-based vectorizer configured with a 5000-feature cap and scikit-learn's
# built-in English stop-word list.
cv = CountVectorizer(max_features = 5000, stop_words='english')


In [122]:
# Fit the vectorizer on the tags and convert the result to a dense array.
# NOTE(review): this cell runs before the stemming cell further down, so these
# vectors are built from the unstemmed tags.
vectors = cv.fit_transform(movies4['tags']).toarray()

In [123]:
# stemming process
from nltk.stem.porter import PorterStemmer
# One shared stemmer instance; the `stem` helper below calls ps.stem on each word.
ps = PorterStemmer()

In [124]:
# defining stemming function
def stem(text):
    """Stem each whitespace-separated token of ``text`` with ``ps``.

    The stemmed tokens are joined back together with single spaces.
    """
    return " ".join(ps.stem(token) for token in text.split())

In [125]:
# Replace every tag string with its stemmed form.
movies4['tags'] = movies4['tags'].apply(stem)

In [126]:
# measuring similarity between movies using cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# The `vectors` array built earlier was fitted before the tags were stemmed.
# Re-fit the vectorizer on the current (stemmed) tags, otherwise the stemming
# step has no effect on the similarity scores.
vectors = cv.fit_transform(movies4['tags']).toarray()

# similarity[i][j] is the cosine similarity between the tag vectors of the
# movies at row positions i and j.
similarity = cosine_similarity(vectors)

In [127]:
def recommend_me(movie, top_n=5):
    """Print the titles of the ``top_n`` movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``movies4['title']``. If several rows share
        the title, the first one is used.
    top_n : int, default 5
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If no row of ``movies4`` has the given title.
    """
    # Rows of `similarity` are positional, so locate the movie by row position
    # instead of by index label (labels have gaps once rows were dropped).
    matches = [pos for pos, title in enumerate(movies4['title']) if title == movie]
    if not matches:
        raise IndexError(f"movie {movie!r} not found in movies4['title']")
    movie_pos = matches[0]

    distances = similarity[movie_pos]
    ranked = sorted(enumerate(distances), reverse=True, key=lambda x: x[1])
    # Exclude the query movie explicitly rather than assuming it sorts first.
    neighbours = [pos for pos, _ in ranked if pos != movie_pos][:top_n]

    for pos in neighbours:
        print(movies4['title'].iloc[pos])

In [128]:
# Print the recommendations for 'Batman'.
recommend_me('Batman')

In [129]:
# Print the recommendations for 'Thor'.
recommend_me('Thor') 
