In [1]:
import pandas as pd
import numpy as np
import ast as a         # ast literal_eval() is used
from sklearn.feature_extraction.text import CountVectorizer         # For the vectorisation of Tags
from sklearn.metrics.pairwise import cosine_similarity
#import nltk                                                                              (Natural language Processing library)
from nltk.stem.porter import PorterStemmer                    # For the deletion of same values

In [2]:
movie=pd.read_csv("tmdb_5000_movies.csv")
credits=pd.read_csv("tmdb_5000_credits.csv")

In [3]:
movie.head(3)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466


In [4]:
credits.head(3)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."


# Merge the Both Databases

In [5]:
df=movie.merge(credits,on="title",how="left")
df.shape

(4809, 23)

In [6]:
df.head(1).columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

# Choose only selective Columns

In [7]:
df=df[["id","title","genres","keywords","original_language","overview","cast","crew"]]
df.head(1)

Unnamed: 0,id,title,genres,keywords,original_language,overview,cast,crew
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,"In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [8]:
df.isnull().sum()

id                   0
title                0
genres               0
keywords             0
original_language    0
overview             3
cast                 0
crew                 0
dtype: int64

In [9]:
df.dropna(inplace=True)

In [10]:
df.isnull().sum()

id                   0
title                0
genres               0
keywords             0
original_language    0
overview             0
cast                 0
crew                 0
dtype: int64

In [11]:
s=df.iloc[0,2]
s                                 # this is a string

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

# Convert sting into list

In [12]:
s=a.literal_eval(s)           # literal_eval() function is use to gave the string value in to the list
s

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

In [13]:
def convert(obj):
    l=[]
    obj=a.literal_eval(obj)
    for i in obj:
        l.append(i["name"])
    return l

In [14]:
convert('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')

['Action', 'Adventure', 'Fantasy', 'Science Fiction']

# Use the Function on the column

In [15]:
df["genres"].apply(convert)                           # apply() will apply the Function created to all the Columns

0       [Action, Adventure, Fantasy, Science Fiction]
1                        [Adventure, Fantasy, Action]
2                          [Action, Adventure, Crime]
3                    [Action, Crime, Drama, Thriller]
4                [Action, Adventure, Science Fiction]
                            ...                      
4804                        [Action, Crime, Thriller]
4805                                [Comedy, Romance]
4806               [Comedy, Drama, Romance, TV Movie]
4807                                               []
4808                                    [Documentary]
Name: genres, Length: 4806, dtype: object

In [16]:
df["genres"]=df["genres"].apply(convert)
df.head(1)

Unnamed: 0,id,title,genres,keywords,original_language,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,"In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [17]:
df["keywords"]=df["keywords"].apply(convert)
df.head(1)

Unnamed: 0,id,title,genres,keywords,original_language,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",en,"In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [18]:
s=df.iloc[0,6]

In [19]:
def cast_name(obj):
    l=[]
    c=1
    obj=a.literal_eval(obj)
    for i in obj:
        if c<=3:
            l.append(i["name"])
            c+=1
        else:
            break
    return l        

In [20]:
cast_name(s)

['Sam Worthington', 'Zoe Saldana', 'Sigourney Weaver']

In [21]:
df["cast"]=df["cast"].apply(cast_name)
df.head(1)

Unnamed: 0,id,title,genres,keywords,original_language,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",en,"In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [22]:
s=df["crew"][0]

In [23]:
def director(obj):
    l=[]
    obj=a.literal_eval(obj)
    for i in obj:
        if i["job"]=="Director":
            l.append(i["name"])
    return l

In [24]:
director(s)

['James Cameron']

In [25]:
df["crew"]=df["crew"].apply(director)
df.head(1)

Unnamed: 0,id,title,genres,keywords,original_language,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",en,"In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


# Make the overview into a list

In [26]:
s=df["overview"][0]
z=lambda x:x.split()
z(s)

['In',
 'the',
 '22nd',
 'century,',
 'a',
 'paraplegic',
 'Marine',
 'is',
 'dispatched',
 'to',
 'the',
 'moon',
 'Pandora',
 'on',
 'a',
 'unique',
 'mission,',
 'but',
 'becomes',
 'torn',
 'between',
 'following',
 'orders',
 'and',
 'protecting',
 'an',
 'alien',
 'civilization.']

In [27]:
df["overview"]=df["overview"].apply(lambda x: x.split())
df.head(1)

Unnamed: 0,id,title,genres,keywords,original_language,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",en,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


# Now Delete all the spaces in all for better Search

In [28]:
df["genres"]=df["genres"].apply(lambda x: [i.replace(" ","")for i in x])
df["keywords"]=df["keywords"].apply(lambda x: [i.replace(" ","")for i in x])
df["cast"]=df["cast"].apply(lambda x: [i.replace(" ","")for i in x])
df["crew"]=df["crew"].apply(lambda x: [i.replace(" ","")for i in x])
df.head(1)

Unnamed: 0,id,title,genres,keywords,original_language,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...",en,"[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]


In [29]:
df["original_language"]=df["original_language"].apply(lambda x: x.split())
df.head(1)

Unnamed: 0,id,title,genres,keywords,original_language,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...",[en],"[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]


In [30]:
df["title1"]=df["title"].apply(lambda x: x.split())
df.head(1)

Unnamed: 0,id,title,genres,keywords,original_language,overview,cast,crew,title1
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...",[en],"[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],[Avatar]


In [31]:
df["tags"]= df["overview"]+ df["genres"]+ df["keywords"]+ df["cast"]+ df["crew"]+ df["original_language"]+ df["title1"]
df.head(1)

Unnamed: 0,id,title,genres,keywords,original_language,overview,cast,crew,title1,tags
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...",[en],"[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],[Avatar],"[In, the, 22nd, century,, a, paraplegic, Marin..."


In [32]:
df=df[["id","title","tags"]]
df.head(1)

Unnamed: 0,id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."


# Make that list string again

In [33]:
df["tags"]=df["tags"].apply(lambda x: " ".join(x))
df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


# Make tags lower case (Recommended)

In [34]:
df["tags"]=df["tags"].apply(lambda x: x.lower())
df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


# Matchine Learning

# Vectors
## Text Vectorisation

# Make the Rule for the tags

In [35]:
cv=CountVectorizer(max_features=6000,stop_words="english")   
cv

# Steming

In [36]:
ps=PorterStemmer()                   # PorterStreammer() is a function in Further in stem use

In [37]:
def con_to_stem(obj):
    l=[]
    obj=obj.split()
    for i in obj:
        l.append(ps.stem(i))                    # "stem()" Function is PorterStreammer Part use to remove conwaying values
        
    return " ".join(l)

In [38]:
df["tags"]=df["tags"].apply(con_to_stem)
df.head(1)

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."


# Change the tags into combine string and gave Vectors

In [39]:
vector=cv.fit_transform(df["tags"]).toarray() #"fit_transform()" is CountVectorizer takes column and change a string into vector

In [40]:
vector.shape

(4806, 6000)

# Check for the Occuring tags

In [41]:
cv.get_feature_names_out().tolist()    # "get_feature_names_out()" is a CountVectorizer function to see the most occuring values

['000',
 '007',
 '10',
 '100',
 '10th',
 '11',
 '12',
 '13',
 '13th',
 '14',
 '15',
 '150',
 '16',
 '17',
 '17th',
 '18',
 '1890',
 '18th',
 '18thcenturi',
 '19',
 '1910',
 '1920',
 '1930',
 '1940',
 '1941',
 '1944',
 '1950',
 '1950s',
 '1955',
 '1959',
 '1960',
 '1960s',
 '1962',
 '1964',
 '1965',
 '1967',
 '1970',
 '1970s',
 '1971',
 '1972',
 '1973',
 '1974',
 '1976',
 '1979',
 '1980',
 '1980s',
 '1984',
 '1985',
 '1990',
 '1995',
 '1997',
 '1999',
 '19th',
 '19thcenturi',
 '20',
 '200',
 '2000',
 '2001',
 '2003',
 '2009',
 '20th',
 '21',
 '21st',
 '23',
 '24',
 '25',
 '27',
 '28',
 '30',
 '300',
 '3d',
 '40',
 '47',
 '50',
 '500',
 '51',
 '60',
 '60s',
 '70',
 '80',
 'aaron',
 'aaroneckhart',
 'aarontaylor',
 'abandon',
 'abbi',
 'abduct',
 'abigailbreslin',
 'abil',
 'abl',
 'aboard',
 'aborigin',
 'abov',
 'abroad',
 'absolut',
 'absurd',
 'abus',
 'academ',
 'academi',
 'academy',
 'accept',
 'access',
 'accid',
 'accident',
 'acclaim',
 'accompani',
 'accomplish',
 'account',
 '

# Check for Similarities

In [42]:
similar=cosine_similarity(vector)

In [43]:
similar

array([[1.        , 0.09756098, 0.10713432, ..., 0.06205716, 0.02068572,
        0.02145212],
       [0.09756098, 1.        , 0.08035074, ..., 0.06205716, 0.04137144,
        0.04290423],
       [0.10713432, 0.08035074, 1.        , ..., 0.04543109, 0.02271554,
        0.02355714],
       ...,
       [0.06205716, 0.06205716, 0.04543109, ..., 1.        , 0.05263158,
        0.0727754 ],
       [0.02068572, 0.04137144, 0.02271554, ..., 0.05263158, 1.        ,
        0.09096926],
       [0.02145212, 0.04290423, 0.02355714, ..., 0.0727754 , 0.09096926,
        1.        ]])

In [44]:
sorted(list(enumerate(similar[0])),reverse=True,key=lambda x: x[1])[0:11]          # enumerate() Function is to assign the indexin tuple to vector

[(0, 1.0000000000000004),
 (2405, 0.3187883565316691),
 (1214, 0.28994568501612616),
 (3728, 0.26637086328481074),
 (539, 0.2602896031476768),
 (1192, 0.2566196768304904),
 (507, 0.24990759563535758),
 (1202, 0.24055570926238246),
 (61, 0.24044523591131534),
 (582, 0.23800905112740575),
 (778, 0.23597327607707924)]

In [45]:
def movie_reco(movie):
    movie_index=df[df["title"]==movie].index[0]
    distance=similar[movie_index]
    movie_list=sorted(list(enumerate(distance)),reverse=True,key=lambda x: x[1])[1:11]
    
    for i in movie_list:
        print(df.iloc[i[0]]["title"])

In [46]:
movie_reco("Batman")

Batman
Batman & Robin
Batman Returns
Batman Begins
The Dark Knight
The Dark Knight Rises
Batman Forever
Batman: The Dark Knight Returns, Part 2
The R.M.
Punisher: War Zone


In [54]:
movie_index=df[df["title"]=="Avatar"].index[0]
movie_index

0

In [48]:
df

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."
...,...,...,...
4804,9367,El Mariachi,el mariachi just want to play hi guitar and ca...
4805,72766,Newlyweds,a newlyw couple' honeymoon is upend by the arr...
4806,231617,"Signed, Sealed, Delivered","""signed, sealed, delivered"" introduc a dedic q..."
4807,126186,Shanghai Calling,when ambiti new york attorney sam is sent to s...


# To gave the list of movie to the App

In [49]:
import pickle as p
f=open("Movie_pkl","wb")
p.dump(df,f)
f.close()

# To gave similar data to the App

In [50]:
similar

array([[1.        , 0.09756098, 0.10713432, ..., 0.06205716, 0.02068572,
        0.02145212],
       [0.09756098, 1.        , 0.08035074, ..., 0.06205716, 0.04137144,
        0.04290423],
       [0.10713432, 0.08035074, 1.        , ..., 0.04543109, 0.02271554,
        0.02355714],
       ...,
       [0.06205716, 0.06205716, 0.04543109, ..., 1.        , 0.05263158,
        0.0727754 ],
       [0.02068572, 0.04137144, 0.02271554, ..., 0.05263158, 1.        ,
        0.09096926],
       [0.02145212, 0.04290423, 0.02355714, ..., 0.0727754 , 0.09096926,
        1.        ]])

In [51]:
"""f=open("Similarity_vector","wb")
p.dump(similar,f)
f.close()"""

'f=open("Similarity_vector","wb")\np.dump(similar,f)\nf.close()'

In [55]:
similar[movie_index]

array([1.        , 0.09756098, 0.10713432, ..., 0.06205716, 0.02068572,
       0.02145212])

In [59]:
arr=np.array([[1,2],[3,4],[5,6]])
a=list(enumerate(arr))
a

[(0, array([1, 2])), (1, array([3, 4])), (2, array([5, 6]))]

In [70]:
x=lambda i,j: i==j
np.fromfunction(x,(3,3))

array([[ True, False, False],
       [False,  True, False],
       [False, False,  True]])