In [1]:
import pandas as pd
import numpy as np
import sklearn
import ast
import nltk
import streamlit as st
import pickle
import streamlit_jupyter

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer

import warnings
warnings.filterwarnings('ignore')

In [12]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [13]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [14]:
credits.iloc[0]

movie_id                                                19995
title                                                  Avatar
cast        [{"cast_id": 242, "character": "Jake Sully", "...
crew        [{"credit_id": "52fe48009251416c750aca23", "de...
Name: 0, dtype: object

In [15]:
movies = movies.merge(credits, on='title')
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [16]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [17]:
movies.shape

(4809, 23)

In [19]:
movies.iloc[0]

budget                                                          237000000
genres                  [{"id": 28, "name": "Action"}, {"id": 12, "nam...
homepage                                      http://www.avatarmovie.com/
id                                                                  19995
keywords                [{"id": 1463, "name": "culture clash"}, {"id":...
original_language                                                      en
original_title                                                     Avatar
overview                In the 22nd century, a paraplegic Marine is di...
popularity                                                     150.437577
production_companies    [{"name": "Ingenious Film Partners", "id": 289...
production_countries    [{"iso_3166_1": "US", "name": "United States o...
release_date                                                   2009-12-10
revenue                                                        2787965087
runtime                               

In [11]:
movies = movies[['id', 'title', 'genres', 'keywords', 'overview', 'cast', 'crew' ]]
movies.head(1)

Unnamed: 0,id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [None]:
movies.isnull().sum()

In [None]:
movies = movies.dropna()

In [19]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [20]:
movies.iloc[0].keywords

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

In [21]:
def group(name):
    List = []
    for i in ast.literal_eval(name):
        List.append(i['name'])
    return List

In [22]:
movies['genres'] = movies['genres'].apply(group)
movies.iloc[0].genres

['Action', 'Adventure', 'Fantasy', 'Science Fiction']

In [23]:
movies['keywords'] = movies['keywords'].apply(group)
movies.iloc[0].keywords

['culture clash',
 'future',
 'space war',
 'space colony',
 'society',
 'space travel',
 'futuristic',
 'romance',
 'space',
 'alien',
 'tribe',
 'alien planet',
 'cgi',
 'marine',
 'soldier',
 'battle',
 'love affair',
 'anti war',
 'power relations',
 'mind and soul',
 '3d']

In [24]:
def group(name):
    List = []
    count = 0
    for i in ast.literal_eval(name):
        List.append(i['name'])
        count+=1
        if count == 3:
            break
        else:
            pass
    return List

In [25]:
movies['cast'] = movies['cast'].apply(group)
movies.iloc[0].cast

['Sam Worthington', 'Zoe Saldana', 'Sigourney Weaver']

In [26]:
movies.iloc[0].crew

'[{"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "539c47ecc3a36810e3001f87", "department": "Art", "gender": 2, "id": 496, "job": "Production Design", "name": "Rick Carter"}, {"credit_id": "54491c89c3a3680fb4001cf7", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Designer", "name": "Christopher Boyes"}, {"credit_id": "54491cb70e0a267480001bd0", "department": "Sound", "gender": 0, "id": 900, "job": "Supervising Sound Editor", "name": "Christopher Boyes"}, {"credit_id": "539c4a4cc3a36810c9002101", "department": "Production", "gender": 1, "id": 1262, "job": "Casting", "name": "Mali Finn"}, {"credit_id": "5544ee3b925141499f0008fc", "department": "Sound", "gender": 2, "id": 1729, "job": "Original Music Composer", "name": "James Horner"}, {"credit_id": "52fe48009251416c750ac9c3", "department": "Directing", "gender": 2, "id": 2710, "job": "Director", "name": "James Cameron"},

In [27]:
def group(name):
    List = []
    for i in ast.literal_eval(name):
        if i['job'] == 'Director':
            List.append(i['name'])
            break
    return List

In [28]:
movies['crew'] = movies['crew'].apply(group)
movies.iloc[0].crew

['James Cameron']

In [29]:
movies.head()

Unnamed: 0,id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",A cryptic message from Bond’s past sends him o...,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",Following the death of District Attorney Harve...,"[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","John Carter is a war-weary, former military ca...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [30]:
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies.iloc[0].genres

['Action', 'Adventure', 'Fantasy', 'ScienceFiction']

In [31]:
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])

In [32]:
movies.head(1)

Unnamed: 0,id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","In the 22nd century, a paraplegic Marine is di...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]


In [33]:
movies['overview'] = movies['overview'].apply(lambda x: x.split())
movies.iloc[0].overview

['In',
 'the',
 '22nd',
 'century,',
 'a',
 'paraplegic',
 'Marine',
 'is',
 'dispatched',
 'to',
 'the',
 'moon',
 'Pandora',
 'on',
 'a',
 'unique',
 'mission,',
 'but',
 'becomes',
 'torn',
 'between',
 'following',
 'orders',
 'and',
 'protecting',
 'an',
 'alien',
 'civilization.']

In [34]:
movies['overview'] = movies['overview'].apply(lambda x: [i.lower() for i in x])
movies.iloc[0].overview

['in',
 'the',
 '22nd',
 'century,',
 'a',
 'paraplegic',
 'marine',
 'is',
 'dispatched',
 'to',
 'the',
 'moon',
 'pandora',
 'on',
 'a',
 'unique',
 'mission,',
 'but',
 'becomes',
 'torn',
 'between',
 'following',
 'orders',
 'and',
 'protecting',
 'an',
 'alien',
 'civilization.']

In [35]:
movies['movie_tag'] = movies['overview'] + movies['keywords'] + movies['cast'] + movies['crew']
movies.head(1)

Unnamed: 0,id,title,genres,keywords,overview,cast,crew,movie_tag
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[in, the, 22nd, century,, a, paraplegic, marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[in, the, 22nd, century,, a, paraplegic, marin..."


In [36]:
movies_list = movies[['id', 'title', 'movie_tag']]
movies_list.head()

Unnamed: 0,id,title,movie_tag
0,19995,Avatar,"[in, the, 22nd, century,, a, paraplegic, marin..."
1,285,Pirates of the Caribbean: At World's End,"[captain, barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[a, cryptic, message, from, bond’s, past, send..."
3,49026,The Dark Knight Rises,"[following, the, death, of, district, attorney..."
4,49529,John Carter,"[john, carter, is, a, war-weary,, former, mili..."


In [37]:
movies_list['movie_tag'] = movies_list['movie_tag'].apply(lambda x: " ".join(x))
movies_list.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_list['movie_tag'] = movies_list['movie_tag'].apply(lambda x: " ".join(x))


Unnamed: 0,id,title,movie_tag
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."


In [38]:
cv = CountVectorizer(max_features=5000, stop_words='english')
tag_vector = cv.fit_transform(movies_list['movie_tag']).toarray()

In [39]:
np.set_printoptions(threshold=np.inf)

In [40]:
cv.get_feature_names_out()

array(['000', '007', '10', '100', '11', '12', '13', '14', '15', '16',
       '17', '18', '18th', '19', '1930s', '1940s', '1944', '1950',
       '1950s', '1960s', '1970s', '1971', '1974', '1976', '1980', '1980s',
       '1985', '1990s', '19th', '19thcentury', '20', '200', '2009',
       '20th', '21st', '23', '24', '25', '30', '300', '3d', '40', '50',
       '500', '60', '60s', '70', '70s', 'aaron', 'aaroneckhart',
       'abandoned', 'abducted', 'abigailbreslin', 'abilities', 'ability',
       'able', 'aboard', 'abuse', 'abusive', 'academic', 'academy',
       'accept', 'accepted', 'accepts', 'access', 'accident',
       'accidental', 'accidentally', 'accompanied', 'accomplish',
       'account', 'accountant', 'accused', 'ace', 'achieve', 'act',
       'acting', 'action', 'actionhero', 'actions', 'activist',
       'activities', 'activity', 'actor', 'actors', 'actress', 'acts',
       'actual', 'actually', 'adam', 'adams', 'adamsandler',
       'adamshankman', 'adaptation', 'adapted', '

In [41]:
stemmer = PorterStemmer()

In [42]:
def stem(obj):
    List = []
    for i in obj.split():
        List.append(stemmer.stem(i))
    return " ".join(List)

In [43]:
movies_list['movie_tag'] = movies_list['movie_tag'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_list['movie_tag'] = movies_list['movie_tag'].apply(stem)


In [44]:
cv = CountVectorizer(max_features=5000, stop_words='english')
tag_vector = cv.fit_transform(movies_list['movie_tag']).toarray()

In [45]:
cv.get_feature_names_out()

array(['000', '007', '10', '100', '11', '12', '13', '14', '15', '16',
       '17', '17th', '18', '18th', '18thcenturi', '19', '1910', '1920',
       '1930', '1940', '1944', '1950', '1950s', '1960', '1960s', '1970',
       '1970s', '1971', '1974', '1976', '1980', '1985', '1990', '1999',
       '19th', '19thcenturi', '20', '200', '2003', '2009', '20th', '21st',
       '23', '24', '25', '30', '300', '3d', '40', '50', '500', '60', '70',
       '80', 'aaron', 'aaroneckhart', 'abandon', 'abduct',
       'abigailbreslin', 'abil', 'abl', 'aboard', 'abov', 'abus',
       'academ', 'academi', 'accept', 'access', 'accid', 'accident',
       'acclaim', 'accompani', 'accomplish', 'account', 'accus', 'ace',
       'achiev', 'acquaint', 'act', 'action', 'actionhero', 'activ',
       'activist', 'activities', 'actor', 'actress', 'actual', 'ad',
       'adam', 'adamsandl', 'adamshankman', 'adapt', 'add', 'addict',
       'adjust', 'admir', 'admit', 'adolesc', 'adopt', 'ador',
       'adrienbrodi', 'adu

In [46]:
similar = cosine_similarity(tag_vector)
similar[0]

array([1.        , 0.        , 0.03184649, 0.06063391, 0.12927192,
       0.04711428, 0.02195814, 0.0549235 , 0.        , 0.        ,
       0.        , 0.02678358, 0.        , 0.        , 0.0285831 ,
       0.        , 0.        , 0.06587443, 0.061494  , 0.02251887,
       0.01552675, 0.05970814, 0.        , 0.        , 0.        ,
       0.03656362, 0.08458258, 0.13629326, 0.03300492, 0.02355714,
       0.        , 0.07151985, 0.02355714, 0.        , 0.        ,
       0.        , 0.10713432, 0.02941176, 0.02195814, 0.        ,
       0.03031695, 0.10461316, 0.        , 0.04950738, 0.        ,
       0.04503773, 0.03131121, 0.14890247, 0.04583492, 0.03184649,
       0.        , 0.02160679, 0.08035074, 0.        , 0.        ,
       0.0192951 , 0.03934447, 0.02615329, 0.02528609, 0.01954408,
       0.01967224, 0.11504475, 0.        , 0.04428074, 0.        ,
       0.        , 0.        , 0.09095086, 0.03363364, 0.01414496,
       0.        , 0.        , 0.03131121, 0.        , 0.16977

In [47]:
movies_sort = sorted(list(enumerate(similar[0])), reverse=True, key=lambda x: x[1])
movies_sort

[(0, 1.0),
 (1216, 0.25142264225703365),
 (2409, 0.24404676504598818),
 (582, 0.21613144789263333),
 (507, 0.20812378052829614),
 (3730, 0.20797258270192573),
 (778, 0.2045239970259654),
 (3608, 0.19926334924652142),
 (539, 0.1980295085953348),
 (1920, 0.19474520942613002),
 (1204, 0.18981415059132414),
 (2786, 0.17503501050350123),
 (74, 0.16977493752543305),
 (1089, 0.16516802780306677),
 (3675, 0.1647705109143269),
 (1321, 0.16169041669088866),
 (3538, 0.15877683720748892),
 (529, 0.1569197342897824),
 (151, 0.15339299776947407),
 (2333, 0.150093837982278),
 (373, 0.14969623771302393),
 (1201, 0.14969623771302393),
 (4192, 0.14969623771302393),
 (47, 0.14890247043403096),
 (4048, 0.14777011582226215),
 (2971, 0.14666320798326962),
 (843, 0.1449427589131121),
 (2515, 0.1430397079704303),
 (1071, 0.14291548761875733),
 (305, 0.14269544824634822),
 (3327, 0.14269544824634822),
 (184, 0.14097096860865022),
 (1774, 0.140028008402801),
 (2731, 0.140028008402801),
 (3162, 0.140028008402800

In [48]:
def recommended_movies(movie_name):
    movie_index = movies_list[movies_list['title'] == movie_name].index[0]
    distance = similar[movie_index]
    movies_sort = sorted(list(enumerate(distance)), reverse=True, key=lambda x: x[1])[1:6]
    
    for i in movies_sort:
        print(movies_list.iloc[i[0]].title)

In [49]:
recommended_movies('Batman')

Batman
Batman & Robin
The R.M.
Batman Begins
Batman Returns


In [50]:
pickle.dump(movies_list.to_dict(), open('movie_dict.pkl', 'wb'))

In [51]:
pickle.dump(similar, open('similarity.pkl', 'wb'))

In [56]:
for i in movies_sort:
        print(movies.iloc[i[0]].id)

19995
440
679
44943
602
270938
11260
50357
7450
11954
34851
59678
137113
20856
11935
1091
33155
9567
2310
106
2067
68179
205126
54138
146631
601
80035
9035
7453
9016
2577
228161
5126
97370
348
81796
811
4965
44833
10590
299687
2900
50620
67911
70981
49529
926
395
8814
8077
227735
346
9982
4858
18
608
607
76757
4257
228326
17654
325173
9348
11692
3683
563
9802
25
17431
72545
91314
6964
168
8741
782
95
72710
16016
44912
581
87421
407887
80274
24206
1439
10153
698
10200
11802
9276
12100
32275
189711
180
72710
12412
10208
849
8859
2157
7451
12088
6795
238603
78394
47933
9396
15512
9361
13001
10016
324322
16290
192577
10416
14582
27586
17339
11561
2114
86834
6415
4256
11551
42807
77016
184374
39800
13820
49017
271110
16866
60304
16911
335778
2639
947
75861
138103
10802
71469
13460
2084
9355
26268
8875
43933
1858
50546
12589
394047
13092
2977
667
38356
87101
49047
262500
342502
157336
38050
10366
9549
3509
11818
79698
14275
19959
8975
8321
79777
88036
57876
50942
12120
77951
39513
77877
9947

11287
259693
10184
4234
6114
1213
100042
323675
1677
187
7461
16538
9889
820
6073
16996
193610
19912
296098
32823
4380
10336
11362
50348
48138
227159
708
1586
9913
71864
10761
209451
11975
4970
9096
11011
11172
82687
41446
10537
225886
10385
10154
10647
11431
188161
8850
64685
38357
10060
1833
9306
11370
1921
1683
203
13536
15556
10718
11062
1887
6071
10461
10315
16643
2687
11025
8849
78698
30943
9544
24418
7288
14655
24575
15074
56715
72358
20542
8271
10428
5353
11934
14392
19495
26486
9495
1257
62214
8842
64807
11529
193
44865
55787
257932
10400
1957
256961
5852
12312
622
11306
12508
25793
10534
10871
13503
13600
68722
312221
23168
76494
4944
10488
96721
334
23742
259694
62837
8966
8470
11001
138832
23048
2155
8409
222936
10219
48171
9776
18785
365222
268
268
2661
2661
45054
943
22881
10054
51540
350
109431
1598
271718
2118
10806
377
8843
9313
39486
140823
1883
89492
22949
2959
9957
9366
609
13051
34806
49022
11469
23479
11667
423
2288
88794
13515
8090
3604
9541
94348
8197
336004
350

111794
360188
9372
85860
244534
33468
294086
139998
16388
159770
47607
193603
322745
20981
242095
250902
158895
222250
18292
125537
326576
228550
13963
290391
44770
69382
220490
2292
42497
33693
14585
185465
366967
255266
226458
24055
287625
692
39851
157185
36095
182291
286939
67238
9367
72766
126186
25975
