In [1]:
import pandas as pd
import numpy as np
import json
import re
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the merged movie table from a CSV path relative to the working directory.
movie_df = pd.read_csv('data/movie_df_merged_all.csv')

In [3]:
# Preview the first two rows to check which columns were loaded.
movie_df.head(2)

Unnamed: 0.1,Unnamed: 0,id,title,runtime,release_date,overview,genres,keywords,popularity,vote_average,vote_count,production_countries,cast,crew,poster_path
0,0,19995,Avatar,162.0,2009-12-10,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",150.437577,7.2,11800,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",/kyeqWdyUXW608qlYkRqosgbbJyK.jpg
1,1,285,Pirates of the Caribbean: At World's End,169.0,2007-05-19,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",139.082615,6.9,4500,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",/jGWpG4YhpQwVmjyHEGkxEkeRf0S.jpg


In [4]:
# New columns -----> release_year
# Keep the text before the first '-' of release_date (the value is not converted to a number).
movie_df['release_year'] = movie_df['release_date'].str.split('-').str[0]

In [5]:
# release_year has been derived from it, so release_date is dropped in place.
movie_df.drop(columns=['release_date'], inplace=True)

In [6]:
# overview ------------> Handle missing value
# Rows without an overview are removed in place. The index is not reset, so
# index labels and row positions can differ from here on.
movie_df.dropna(subset=['overview'], inplace = True)

In [7]:
# overview ------------> convert to List
# Split each overview on whitespace; punctuation stays attached to the words.
movie_df.loc[:,'overview'] = movie_df['overview'].apply(lambda x: x.split())

In [8]:
# genres ---------------> Handle missing value
# Rows with a missing genres value are removed in place (the index is not reset).
movie_df.dropna(subset=['genres'], inplace = True)

In [9]:
# Count the remaining missing genres values.
movie_df['genres'].isnull().sum()

0

In [10]:
# ----->   genres function
def convert(obj):
    """Parse a JSON array of objects and return the list of their 'name' values.

    ``obj`` is a JSON string such as '[{"id": 28, "name": "Action"}]'.
    The order of the names follows the order of the JSON array.
    """
    return [entry['name'] for entry in json.loads(obj)]

In [11]:
# genres ------------> convert to List
# Replace each JSON string with the list of genre names returned by convert().
movie_df.loc[:,'genres'] = movie_df['genres'].apply(convert)

In [12]:
# keywords ---------------> Check for missing values
# This only counts nulls; nothing is dropped or filled in this cell.
movie_df['keywords'].isnull().sum()

0

In [13]:
# keywords ------------> convert to List
# Replace each JSON string with the list of keyword names returned by convert().
movie_df.loc[:,'keywords'] = movie_df['keywords'].apply(convert)

In [14]:
# popularity --------------> eliminate fraction values
# Vectorized rounding to the nearest whole number. Unlike a per-element Python
# round(), this leaves missing values as NaN instead of raising ValueError.
movie_df.loc[:, 'popularity'] = movie_df['popularity'].round()

In [15]:
# cast -----------------> convert to List ----> First three(3)
def convert_3(obj):
    """Return the 'name' of at most the first three objects in a JSON array string.

    Entries after the third are never inspected.
    """
    # zip() stops as soon as range(3) is exhausted, capping the result at three names.
    return [member['name'] for _, member in zip(range(3), json.loads(obj))]

In [16]:
# cast ------------> convert to List ----> First Three (3)
# Replace each JSON string with the (up to three) names returned by convert_3().
movie_df.loc[:,'cast']= movie_df['cast'].apply(convert_3)

In [17]:
# New column ---> star1
def star1(obj):
    """Return the first element of ``obj``, or None when ``obj`` is empty.

    ``obj`` is a sequence of names (the notebook applies this to the 'cast' lists).
    """
    if len(obj) > 0:
        return obj[0]
    # Explicit return: the previous `else: None` was a no-op expression statement.
    return None

# First listed cast member per row (None when the cast list is empty).
movie_df['star1'] = movie_df['cast'].apply(star1)

In [18]:
# New column ---> star2
def star2(obj):
    """Return the second element of ``obj``, or None when it has fewer than two.

    ``obj`` is a sequence of names (the notebook applies this to the 'cast' lists).
    """
    if len(obj) > 1:
        return obj[1]
    # Explicit return: the previous `else: None` was a no-op expression statement.
    return None

# Second listed cast member per row (None when fewer than two are listed).
movie_df['star2'] = movie_df['cast'].apply(star2)

In [19]:
# New column ---> star3
def star3(obj):
    """Return the third element of ``obj``, or None when it has fewer than three.

    ``obj`` is a sequence of names (the notebook applies this to the 'cast' lists).
    """
    if len(obj) > 2:
        return obj[2]
    # Explicit return: the previous `else: None` was a no-op expression statement.
    return None

# Third listed cast member per row (None when fewer than three are listed).
movie_df['star3'] = movie_df['cast'].apply(star3)

In [20]:
# crew column -----Missing Values
# This only counts nulls; nothing is dropped or filled in this cell.
movie_df['crew'].isnull().sum()

0

In [21]:
def director_fetch(obj):
    """Return a one-element list with the first 'Director' name in a crew JSON string.

    ``obj`` is a JSON array of objects carrying 'job' and 'name' keys.
    An empty list is returned when no entry has job == 'Director'.
    """
    for member in json.loads(obj):
        if member['job'] == 'Director':
            # Only the first director is kept; later entries are never examined.
            return [member['name']]
    return []

# Replace each raw crew JSON string with the list returned by director_fetch()
# (at most one name).
movie_df.loc[:,'crew'] = movie_df['crew'].apply(director_fetch)

In [22]:
# New column ------> director
# 'crew' holds a list per row at this point; an empty list becomes None.
movie_df['director'] = movie_df['crew'].apply(lambda x:x[0] if x else None)

In [23]:
# Remove spaces in values
# Every whitespace run inside a name is deleted so that a multi-word value
# (e.g. "Science Fiction") becomes a single token ("ScienceFiction").
for column in ('genres', 'keywords', 'cast', 'crew'):
    movie_df.loc[:, column] = movie_df[column].apply(
        lambda names: [re.sub(r"\s+", "", name) for name in names]
    )

In [24]:
# Inspect the first rows after the list conversions and space removal.
movie_df.head()

Unnamed: 0.1,Unnamed: 0,id,title,runtime,overview,genres,keywords,popularity,vote_average,vote_count,production_countries,cast,crew,poster_path,release_year,star1,star2,star3,director
0,0,19995,Avatar,162.0,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...",150.0,7.2,11800,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,2009,Sam Worthington,Zoe Saldana,Sigourney Weaver,James Cameron
1,1,285,Pirates of the Caribbean: At World's End,169.0,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...",139.0,6.9,4500,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],/jGWpG4YhpQwVmjyHEGkxEkeRf0S.jpg,2007,Johnny Depp,Orlando Bloom,Keira Knightley,Gore Verbinski
2,2,206647,Spectre,148.0,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...",107.0,6.3,4466,"[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],/672kUEMtTHcaVYSVY4eiHEliHFa.jpg,2015,Daniel Craig,Christoph Waltz,Léa Seydoux,Sam Mendes
3,3,49026,The Dark Knight Rises,165.0,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...",112.0,7.6,9106,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan],/hr0L2aueqlP2BYUblTTjmtn0hw4.jpg,2012,Christian Bale,Michael Caine,Gary Oldman,Christopher Nolan
4,4,49529,John Carter,132.0,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...",44.0,6.1,2124,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton],/lCxz1Yus07QCQQCb6I0Dr3Lmqpx.jpg,2012,Taylor Kitsch,Lynn Collins,Samantha Morton,Andrew Stanton


In [25]:
# Combine all tags
# Each operand column holds a Python list per row, so + concatenates the lists row by row.
movie_df['tags'] = movie_df['overview'] + movie_df['genres'] + movie_df['keywords'] + movie_df['cast'] + movie_df['crew']

In [26]:
# Drop unnecessary columns (their content is now part of 'tags')
movie_df.drop(columns=['genres','keywords','cast','crew'], inplace=True)

In [27]:
# Join the overview word list back into a single space-separated string.
movie_df['overview'] = movie_df['overview'].apply(lambda x:" ".join(x))

In [28]:
# Join the combined tag list into a single space-separated string.
movie_df.loc[:,'tags']= movie_df['tags'].apply(lambda x:" ".join(x))

In [29]:
# Check the frame after the list columns were merged into 'tags'.
movie_df.head(3)

Unnamed: 0.1,Unnamed: 0,id,title,runtime,overview,popularity,vote_average,vote_count,production_countries,poster_path,release_year,star1,star2,star3,director,tags
0,0,19995,Avatar,162.0,"In the 22nd century, a paraplegic Marine is di...",150.0,7.2,11800,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,2009,Sam Worthington,Zoe Saldana,Sigourney Weaver,James Cameron,"In the 22nd century, a paraplegic Marine is di..."
1,1,285,Pirates of the Caribbean: At World's End,169.0,"Captain Barbossa, long believed to be dead, ha...",139.0,6.9,4500,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",/jGWpG4YhpQwVmjyHEGkxEkeRf0S.jpg,2007,Johnny Depp,Orlando Bloom,Keira Knightley,Gore Verbinski,"Captain Barbossa, long believed to be dead, ha..."
2,2,206647,Spectre,148.0,A cryptic message from Bond’s past sends him o...,107.0,6.3,4466,"[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",/672kUEMtTHcaVYSVY4eiHEliHFa.jpg,2015,Daniel Craig,Christoph Waltz,Léa Seydoux,Sam Mendes,A cryptic message from Bond’s past sends him o...


### Note 1: joining a list into a string and splitting it back
```
   ['partho', 'abhik'] -------> 'partho abhik' ---------> ['partho','abhik']

a= ['partho','abhik']  
b= " ".join(a) --> output: 'partho abhik'  
b.split() -------> output: ['partho','abhik']
```

In [31]:
# Convert to ----------------> lowercase
# Lower-case every tag string so matching is case-insensitive.
movie_df.loc[:, 'tags'] = movie_df['tags'].apply(str.lower)

In [32]:
# stemmer -------------------->
# A single PorterStemmer instance, used by word_stem().

ps = PorterStemmer()

In [33]:
# -----> stem every word of a text
def word_stem(text):
    """Stem each whitespace-separated token of ``text`` with the module-level ``ps``.

    The stemmed tokens are returned joined by single spaces.
    """
    return " ".join(ps.stem(token) for token in text.split())

In [34]:
# -------> Stemmer
# Replace every tag string with its stemmed version from word_stem().
movie_df.loc[:,'tags'] = movie_df['tags'].apply(word_stem)

In [35]:
# Inspect the processed tags of the row whose index label is 0
# (an integer key on this Series is a label lookup, not a position).
movie_df['tags'][0]

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav jamescameron'

In [36]:
# Vectorization ---------------->
# Bag-of-words counts, capped at 5000 features, using scikit-learn's built-in
# English stop-word list.

cv = CountVectorizer(max_features=5000, stop_words='english')

In [37]:
# Fit the vocabulary on the tags and convert the sparse count matrix to a dense array.
# Row i of `vectors` corresponds to the i-th row of movie_df by position.
vectors = cv.fit_transform(movie_df['tags']).toarray()

In [38]:
# Cosine_similarity
# Pairwise cosine similarity between all rows of `vectors`; entry [i, j] compares
# the movies at positions i and j.
similarity_vector = cosine_similarity(vectors)

In [39]:
## Function for extract -----------> movie list
def recommended_movie(movie, top_n=9, df=None, similarity=None):
    """Return the titles of the movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in the ``title`` column.
    top_n : int, default 9
        Number of titles to return (the former slice ``[1:10]`` returned 9).
    df : pandas.DataFrame, optional
        Frame with a ``title`` column. Defaults to the notebook-level ``movie_df``.
    similarity : array-like, optional
        Square similarity matrix whose row order matches the row order of ``df``.
        Defaults to the notebook-level ``similarity_vector``.

    Returns
    -------
    list of str
        Titles ordered from most to least similar, excluding the queried row.

    Raises
    ------
    IndexError
        If no row has the requested title.
    """
    frame = movie_df if df is None else df
    matrix = similarity_vector if similarity is None else similarity

    # The similarity matrix is positional, while the frame's index labels have
    # gaps after dropna(); work with row positions throughout, never labels.
    hits = np.flatnonzero((frame['title'] == movie).to_numpy())
    if hits.size == 0:
        raise IndexError(f"movie title not found: {movie!r}")
    query_pos = hits[0]

    # Descending order of similarity; the stable sort keeps ties in row order.
    order = np.argsort(-np.asarray(matrix)[query_pos], kind='stable')
    # Remove the queried row explicitly instead of assuming it sorts first.
    order = order[order != query_pos][:top_n]
    return frame['title'].iloc[order].tolist()

In [40]:
# Example call: list the recommendations for one title.
recommended_movie('Swordfish')

['Hackers',
 'I Am Wrath',
 'Pocketful of Miracles',
 'Mission: Impossible II',
 'Mean Machine',
 'Ghost Dog: The Way of the Samurai',
 'Taken 2',
 'The Sentinel',
 'Akira']

In [41]:
# Remove the 'Unnamed: 0' column before exporting.
movie_df.drop(columns=['Unnamed: 0'], inplace=True)

In [42]:
# Export files
# to_csv writes the DataFrame index as well by default, so the file gets an
# extra unnamed first column.
movie_df.to_csv('data/movie_df_processed.csv')

In [43]:
# save similarity_vector
# Stored in NumPy .npy format; rows and columns follow movie_df's row order.
np.save('data/similarity_vector.npy', similarity_vector)

In [50]:
# All movie List
# Distinct titles in ascending sorted order.

movie_list = sorted(movie_df['title'].unique())

In [45]:
# Inspect the processed frame.
movie_df.head()

Unnamed: 0,id,title,runtime,overview,popularity,vote_average,vote_count,production_countries,poster_path,release_year,star1,star2,star3,director,tags
0,19995,Avatar,162.0,"In the 22nd century, a paraplegic Marine is di...",150.0,7.2,11800,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,2009,Sam Worthington,Zoe Saldana,Sigourney Weaver,James Cameron,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,169.0,"Captain Barbossa, long believed to be dead, ha...",139.0,6.9,4500,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",/jGWpG4YhpQwVmjyHEGkxEkeRf0S.jpg,2007,Johnny Depp,Orlando Bloom,Keira Knightley,Gore Verbinski,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,148.0,A cryptic message from Bond’s past sends him o...,107.0,6.3,4466,"[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",/672kUEMtTHcaVYSVY4eiHEliHFa.jpg,2015,Daniel Craig,Christoph Waltz,Léa Seydoux,Sam Mendes,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,165.0,Following the death of District Attorney Harve...,112.0,7.6,9106,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",/hr0L2aueqlP2BYUblTTjmtn0hw4.jpg,2012,Christian Bale,Michael Caine,Gary Oldman,Christopher Nolan,follow the death of district attorney harvey d...
4,49529,John Carter,132.0,"John Carter is a war-weary, former military ca...",44.0,6.1,2124,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",/lCxz1Yus07QCQQCb6I0Dr3Lmqpx.jpg,2012,Taylor Kitsch,Lynn Collins,Samantha Morton,Andrew Stanton,"john carter is a war-weary, former militari ca..."


In [46]:
## Function for extract ----------> DataFrame
def recommended_movie_df(movie, top_n=9, df=None, similarity=None):
    """Return the rows of the movies most similar to ``movie`` as a DataFrame.

    Parameters
    ----------
    movie : str
        Exact title to look up in the ``title`` column.
    top_n : int, default 9
        Number of rows to return (the former slice ``[1:10]`` returned 9).
    df : pandas.DataFrame, optional
        Frame with a ``title`` column. Defaults to the notebook-level ``movie_df``.
    similarity : array-like, optional
        Square similarity matrix whose row order matches the row order of ``df``.
        Defaults to the notebook-level ``similarity_vector``.

    Returns
    -------
    pandas.DataFrame
        Matching rows ordered from most to least similar, excluding the queried
        row, with a fresh 0..n-1 index.

    Raises
    ------
    IndexError
        If no row has the requested title.
    """
    frame = movie_df if df is None else df
    matrix = similarity_vector if similarity is None else similarity

    # The similarity matrix is positional, while the frame's index labels have
    # gaps after dropna(); work with row positions throughout, never labels.
    hits = np.flatnonzero((frame['title'] == movie).to_numpy())
    if hits.size == 0:
        raise IndexError(f"movie title not found: {movie!r}")
    query_pos = hits[0]

    # Descending order of similarity; the stable sort keeps ties in row order.
    order = np.argsort(-np.asarray(matrix)[query_pos], kind='stable')
    # Remove the queried row explicitly instead of assuming it sorts first.
    order = order[order != query_pos][:top_n]

    # One positional selection replaces the per-row concat loop.
    return frame.iloc[order].reset_index(drop=True)

In [47]:
# Example call: recommendation rows for one title. head(20) shows up to 20 rows,
# which is every row the function returns here.
recom_df = recommended_movie_df('Avatar')
recom_df.head(20)

Unnamed: 0,id,title,runtime,overview,popularity,vote_average,vote_count,production_countries,poster_path,release_year,star1,star2,star3,director,tags
0,440,Aliens vs Predator: Requiem,94.0,"A sequel to 2004's Alien vs. Predator, the ico...",39.0,4.9,740,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",/jCyJN1vj8jqJJ0vNw4hDH2KlySO.jpg,2007,Steven Pasquale,Reiko Aylesworth,John Ortiz,Colin Strause,"a sequel to 2004' alien vs. predator, the icon..."
1,679,Aliens,137.0,When Ripley's lifepod is found by a salvage cr...,68.0,7.7,3220,"[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",/r1x5JGpyqZU8PYhbs4UcrO1Xb6x.jpg,1986,Sigourney Weaver,Michael Biehn,James Remar,James Cameron,when ripley' lifepod is found by a salvag crew...
2,17663,Anne of Green Gables,199.0,At the turn of the century on Prince Edward Is...,9.0,8.2,68,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",/z1owlDzKO3DQpFKNFBnuQlPF891.jpg,1985,Megan Follows,Colleen Dewhurst,Richard Farnsworth,Kevin Sullivan,at the turn of the centuri on princ edward isl...
3,602,Independence Day,145.0,"On July 2, a giant alien mothership enters orb...",60.0,6.7,3260,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",/p0BPQGSPoSa8Ml0DAf2mB2kCU0R.jpg,1996,Will Smith,Bill Pullman,Jeff Goldblum,Roland Emmerich,"on juli 2, a giant alien mothership enter orbi..."
4,7450,Titan A.E.,94.0,A young man finds out that he holds the key to...,14.0,6.3,313,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",/el2iHk3LTJWfEnwrvcRkvWY501G.jpg,2000,Matt Damon,Bill Pullman,Drew Barrymore,Gary Goldman,a young man find out that he hold the key to r...
5,44943,Battle: Los Angeles,116.0,The Earth is attacked by unknown forces. As pe...,49.0,5.5,1448,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",/jloyGeVYZSxM9zsLFvVOWuj2ey4.jpg,2011,Aaron Eckhart,Ramón Rodríguez,Will Rothhaar,Jonathan Liebesman,the earth is attack by unknown forces. as peop...
6,34851,Predators,107.0,A mercenary reluctantly leads a motley crew of...,48.0,6.0,1206,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",/54LZIRznS0Zjn7ZDXrsGVe3HXei.jpg,2010,Topher Grace,Alice Braga,Laurence Fishburne,Nimród Antal,a mercenari reluctantli lead a motley crew of ...
7,11551,Small Soldiers,110.0,When missile technology is used to enhance toy...,23.0,6.2,511,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",/2nuUjSzHsoYlRvTPmLo7m7gCQry.jpg,1998,Gregory Smith,Kirsten Dunst,Denis Leary,Joe Dante,when missil technolog is use to enhanc toy act...
8,76757,Jupiter Ascending,124.0,In a universe where human genetic material is ...,85.0,5.2,2768,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",/xzQ25m9vrdyvpLX74T3B3KB40Ou.jpg,2015,Mila Kunis,Channing Tatum,Sean Bean,Lilly Wachowski,in a univers where human genet materi is the m...


In [48]:
# Poster path of the first recommendation (recom_df has a 0-based index).
recom_df['poster_path'][0]

'/jCyJN1vj8jqJJ0vNw4hDH2KlySO.jpg'

In [52]:
# Poster_path
# Prefix the stored relative poster path with the image base URL to get a full URL.
BASE_URL = "https://image.tmdb.org/t/p/w500"
BASE_URL+recom_df['poster_path'][0]

'https://image.tmdb.org/t/p/w500/jCyJN1vj8jqJJ0vNw4hDH2KlySO.jpg'

In [54]:
# Title used for the lookup example in the next cell.
movie= 'Avatar'

In [68]:
# Select the row(s) whose title equals `movie`.
df = movie_df[movie_df['title'] == movie]
# Take the first match by position. `df['title'][0]` is a label lookup and only
# works when the matching row happens to carry index label 0.
df['title'].iloc[0]

'Avatar'