### Import all Dependencies

In [1]:
import pandas as pd
import numpy as np
import ast

### Add CSV Files
#### -'tmdb_5000_movies.csv'
#### -'tmdb_5000_credits.csv'

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [4]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


# Concatinate both CSV Files into One . . .

In [5]:
movies = movies.merge(credits, on='title')

In [6]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [7]:
movies.shape

(4809, 23)

In [8]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

### Keep all those columns which are useful for analysis!

In [9]:
#Which Column do we need 🤔?
#- Genres
#- Id
#- Keywords
#- Title
#- Overview
#- Cast
#- Crew

In [10]:
movies = movies[['genres', 'id', 'keywords', 'overview', 'title', 'cast', 'crew']]

In [11]:
movies.head(1)

Unnamed: 0,genres,id,keywords,overview,title,cast,crew
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...",Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [137]:
movies.shape

(4806, 8)

In [12]:
movies.isnull().sum()

genres      0
id          0
keywords    0
overview    3
title       0
cast        0
crew        0
dtype: int64

In [13]:
movies.dropna(inplace=True)

In [14]:
movies.duplicated().sum()

0

### Converting All Columns From Dictionary to List
### 1. Genres

In [15]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [16]:
# Convert genres from dictionary to list
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L 

In [17]:
movies['genres'] = movies['genres'].apply(convert)

In [18]:
movies.head(1)

Unnamed: 0,genres,id,keywords,overview,title,cast,crew
0,"[Action, Adventure, Fantasy, Science Fiction]",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...",Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


### 2. Keywords

In [19]:
movies.iloc[2].keywords

'[{"id": 470, "name": "spy"}, {"id": 818, "name": "based on novel"}, {"id": 4289, "name": "secret agent"}, {"id": 9663, "name": "sequel"}, {"id": 14555, "name": "mi6"}, {"id": 156095, "name": "british secret service"}, {"id": 158431, "name": "united kingdom"}]'

In [20]:
movies['keywords'] = movies['keywords'].apply(convert)

In [21]:
movies.head(1)

Unnamed: 0,genres,id,keywords,overview,title,cast,crew
0,"[Action, Adventure, Fantasy, Science Fiction]",19995,"[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...",Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


### 3. Cast

In [22]:
def convert_cast(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter !=3:
            L.append(i['name'])
            counter += 1
        else:
            break
    return L 

In [23]:
movies['cast'] = movies['cast'].apply(convert_cast) 

In [24]:
movies.head(1)

Unnamed: 0,genres,id,keywords,overview,title,cast,crew
0,"[Action, Adventure, Fantasy, Science Fiction]",19995,"[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...",Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [25]:
def fetch_dicrector(val):
    L = []
    for i in ast.literal_eval(val):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

In [26]:
# Fecting Director Name from The Crew
movies['crew'] = movies['crew'].apply(fetch_dicrector)

In [27]:
movies.head(1)

Unnamed: 0,genres,id,keywords,overview,title,cast,crew
0,"[Action, Adventure, Fantasy, Science Fiction]",19995,"[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...",Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


### 4. Overview

In [28]:
# Converting Into List from String :
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [29]:
movies.head(1)

Unnamed: 0,genres,id,keywords,overview,title,cast,crew
0,"[Action, Adventure, Fantasy, Science Fiction]",19995,"[culture clash, future, space war, space colon...","[In, the, 22nd, century,, a, paraplegic, Marin...",Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


### Removing All Spaces from these columns
- By Removing The Spaces, it would be more helpful when the system is finding movie but not getting the exact output...
- In That Situation Removing Space is very helpful to find the movie's tags!

In [30]:
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ", "") for i in x])

In [31]:
movies.head(1)

Unnamed: 0,genres,id,keywords,overview,title,cast,crew
0,"[Action, Adventure, Fantasy, ScienceFiction]",19995,"[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...",Avatar,"[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]


### Creating New DataFrame & Removing Remaining Columns

In [32]:
movies['tag'] = movies['overview'] + movies['genres'] + movies['cast'] + movies['crew']

In [33]:
movies.head(1)

Unnamed: 0,genres,id,keywords,overview,title,cast,crew,tag
0,"[Action, Adventure, Fantasy, ScienceFiction]",19995,"[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...",Avatar,"[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."


In [35]:
new_df = movies[['id', 'title', 'tag']]

In [40]:
# Change Tag from List to String!
new_df['tag'] = new_df['tag'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tag'] = new_df['tag'].apply(lambda x:" ".join(x))


In [41]:
new_df.head()

Unnamed: 0,id,title,tag
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [44]:
# Make Tags into Lowercase
new_df['tag'] = new_df['tag'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tag'] = new_df['tag'].apply(lambda x:x.lower())


### The New DataFram [New_DF]

In [45]:
new_df.head()

Unnamed: 0,id,title,tag
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [46]:
new_df['tag'][0]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction samworthington zoesaldana sigourneyweaver jamescameron'

In [47]:
new_df['tag'][1]

'captain barbossa, long believed to be dead, has come back to life and is headed to the edge of the earth with will turner and elizabeth swann. but nothing is quite as it seems. adventure fantasy action johnnydepp orlandobloom keiraknightley goreverbinski'

### Removing Duplicate Words and Plural Words from the 'Tags'

In [60]:
import nltk

In [61]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [69]:
def stemmer(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    
    return " ".join(y)

In [70]:
new_df['tag'] = new_df['tag'].apply(stemmer)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tag'] = new_df['tag'].apply(stemmer)


In [71]:
new_df['tag'][0]

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict samworthington zoesaldana sigourneyweav jamescameron'

### Removing Stop Words from the Tags

In [73]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

In [74]:
vectors = cv.fit_transform(new_df['tag']).toarray()

In [75]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [76]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [78]:
cv.get_feature_names()

['000',
 '007',
 '10',
 '100',
 '11',
 '12',
 '12th',
 '13',
 '14',
 '15',
 '150',
 '16',
 '17',
 '17th',
 '18',
 '1890',
 '18th',
 '19',
 '1920',
 '1930',
 '1930s',
 '1937',
 '1940',
 '1941',
 '1944',
 '1945',
 '1950',
 '1950s',
 '1955',
 '1959',
 '1960',
 '1960s',
 '1962',
 '1964',
 '1965',
 '1967',
 '1970',
 '1970s',
 '1971',
 '1972',
 '1973',
 '1974',
 '1976',
 '1977',
 '1979',
 '1980',
 '1980s',
 '1984',
 '1985',
 '1990',
 '1994',
 '1997',
 '1999',
 '19th',
 '20',
 '200',
 '2000',
 '2003',
 '2009',
 '20th',
 '21st',
 '23',
 '24',
 '25',
 '30',
 '300',
 '3d',
 '40',
 '50',
 '500',
 '60',
 '60s',
 '70',
 '80',
 'aaron',
 'aaroneckhart',
 'aarontaylor',
 'abandon',
 'abbi',
 'abduct',
 'abigailbreslin',
 'abil',
 'abl',
 'aboard',
 'abov',
 'abroad',
 'absolut',
 'abus',
 'academ',
 'academi',
 'academy',
 'accept',
 'access',
 'accid',
 'accident',
 'acclaim',
 'accompani',
 'accomplish',
 'account',
 'accus',
 'ace',
 'achiev',
 'acquaint',
 'acquir',
 'act',
 'action',
 'activ',
 

In [79]:
vectors.shape

(4806, 5000)

### Finding Similarities using Cosine_Similarity

In [80]:
# Now Measuring Cosine Distance
# Using Cosine Similarities
from sklearn.metrics.pairwise import cosine_similarity

In [83]:
cosine_similarity(vectors).shape

(4806, 4806)

In [84]:
similarity = cosine_similarity(vectors)

In [88]:
similarity[0]

array([1.        , 0.13957263, 0.08206099, ..., 0.        , 0.        ,
       0.        ])

In [102]:
sorted(list(enumerate(similarity[6])), reverse=True, key=lambda x:x[1])

[(6, 1.0),
 (2315, 0.19090088708030312),
 (1990, 0.15152288168283162),
 (255, 0.14920941939059815),
 (2645, 0.14754222271266348),
 (2786, 0.14754222271266348),
 (39, 0.14699878726500756),
 (67, 0.14699878726500756),
 (506, 0.14285714285714285),
 (1438, 0.14285714285714285),
 (55, 0.14175398238766682),
 (2402, 0.1349873117890097),
 (393, 0.13263952726932277),
 (330, 0.13109470506889243),
 (182, 0.13071306028508972),
 (251, 0.1292191476761844),
 (2925, 0.12777531299998798),
 (2970, 0.12777531299998798),
 (734, 0.1263788195613404),
 (4264, 0.12626906806902632),
 (54, 0.12371791482634839),
 (3764, 0.12073632210407378),
 (615, 0.11915109374689986),
 (3907, 0.11587244366483038),
 (160, 0.11470786693528087),
 (34, 0.11454053224818188),
 (194, 0.11454053224818188),
 (256, 0.11454053224818188),
 (381, 0.11454053224818188),
 (2278, 0.11454053224818188),
 (86, 0.11437725271791938),
 (1626, 0.11428571428571428),
 (106, 0.1129384878631564),
 (1600, 0.1129384878631564),
 (2686, 0.1129384878631564),


In [95]:
sorted(list(enumerate(similarity[2])), reverse=True, key=lambda x:x[1])[1:6]

[(11, 0.3442651863295482),
 (1346, 0.3285091522175248),
 (4077, 0.30242156957551825),
 (29, 0.29643243059996244),
 (3166, 0.2501595914621521)]

In [114]:
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:6]
    
    for i in movies_list:
        print(new_df.iloc[i[0]].title)

In [143]:
recommend('Iron Man')

Iron Man 3
Iron Man 2
Avengers: Age of Ultron
The Helix... Loaded
Teenage Mutant Ninja Turtles II: The Secret of the Ooze


In [136]:
for i in new_df['title'] == 'The Wolverine':
    if i == 1:
        print(new_df.index[i])

[[   0    1    2 ... 4806 4807 4808]]


### Dumping The Data in pkl format!

In [138]:
import pickle

In [140]:
pickle.dump(new_df.to_dict(), open('movies_dict.pkl', 'wb'))

In [144]:
pickle.dump(similarity, open('similarity.pkl', 'wb'))