In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
df = pd.read_csv("movies1.csv")
df.head(3)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bondâ€™s past sends him...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes


In [5]:
df.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

In [6]:
df.shape

(4803, 24)

In [7]:
features = ["genres","keywords","original_language","title","cast","director"]
for feature in features:
    df[feature]=df[feature].fillna("")

In [8]:
def combined_features(row):
    return row["title"]+","+row["genres"]+","+row["keywords"]+","+row["original_language"]+","+row["cast"]+","+row["director"]
df["combined_features"]=df.apply(combined_features,axis=1)
df["combined_features"]

0       Avatar,Action Adventure Fantasy Science Fictio...
1       Pirates of the Caribbean: At World's End,Adven...
2       Spectre,Action Adventure Crime,spy based on no...
3       The Dark Knight Rises,Action Crime Drama Thril...
4       John Carter,Action Adventure Science Fiction,b...
                              ...                        
4798    El Mariachi,Action Crime Thriller,united state...
4799    Newlyweds,Comedy Romance,,en,Edward Burns Kerr...
4800    Signed, Sealed, Delivered,Comedy Drama Romance...
4801    Shanghai Calling,,,en,Daniel Henney Eliza Coup...
4802    My Date with Drew,Documentary,obsession camcor...
Name: combined_features, Length: 4803, dtype: object

In [9]:
tfid = TfidfVectorizer()
tfidv = tfid.fit_transform(df["combined_features"])
tfidv.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
tfidv.shape

(4803, 17513)

In [11]:
cosine_sim = cosine_similarity(tfidv)
cosine_sim

array([[1.00000000e+00, 2.48272960e-02, 4.22228416e-02, ...,
        1.12200797e-03, 1.34747833e-03, 1.11917020e-03],
       [2.48272960e-02, 1.00000000e+00, 1.42630716e-02, ...,
        4.07129084e-02, 1.11032424e-03, 9.22197982e-04],
       [4.22228416e-02, 1.42630716e-02, 1.00000000e+00, ...,
        1.15436886e-03, 5.90455012e-02, 1.15144924e-03],
       ...,
       [1.12200797e-03, 4.07129084e-02, 1.15436886e-03, ...,
        1.00000000e+00, 1.17574481e-03, 5.83911241e-02],
       [1.34747833e-03, 1.11032424e-03, 5.90455012e-02, ...,
        1.17574481e-03, 1.00000000e+00, 1.17277112e-03],
       [1.11917020e-03, 9.22197982e-04, 1.15144924e-03, ...,
        5.83911241e-02, 1.17277112e-03, 1.00000000e+00]])

In [16]:
movie = input("Enter any movie: ")
def get_index(mn):
    return df[df.title==mn].index[0]
mi = get_index(movie)
mi

Enter any movie:  Titanic


25

In [17]:
sm = list(enumerate(cosine_sim[mi]))
print(sm)

[(0, 0.04646807786430683), (1, 0.0010429131977650898), (2, 0.0013021733168709717), (3, 0.011040236000339773), (4, 0.0013040474457989284), (5, 0.014778863688448865), (6, 0.0013184582115768936), (7, 0.0011696638396653492), (8, 0.0012047085320278325), (9, 0.001155021536602613), (10, 0.03954306864757933), (11, 0.0068796632907738246), (12, 0.001129678250271549), (13, 0.012715984123598454), (14, 0.0012733247788721173), (15, 0.0010855285988552801), (16, 0.0012263258991562283), (17, 0.0011549458754945507), (18, 0.0011910867965528053), (19, 0.001146352502556208), (20, 0.0012003612664817016), (21, 0.001141347464627862), (22, 0.0011463705597044458), (23, 0.0011797037710374575), (24, 0.004349551261441821), (25, 1.0000000000000002), (26, 0.0011720940670307868), (27, 0.0067675108696578085), (28, 0.007204427504309721), (29, 0.007645670738779607), (30, 0.014748830247124284), (31, 0.0012930267740693725), (32, 0.0010869742652809358), (33, 0.006959325145174253), (34, 0.030141582370788563), (35, 0.0012122

In [18]:
sorted_sm = sorted(sm,key=lambda x:x[1],reverse=True)
sorted_sm

[(25, 1.0000000000000002),
 (1081, 0.18163108502781283),
 (1269, 0.17400263731693877),
 (454, 0.15725920759983),
 (3103, 0.13989401353571979),
 (2826, 0.13592824923176716),
 (2011, 0.13182724000329746),
 (250, 0.12748853328980422),
 (765, 0.1268722873210275),
 (316, 0.12492766457252494),
 (1629, 0.12481164620213096),
 (656, 0.12320402426839125),
 (2098, 0.1217727918312922),
 (142, 0.12046767971806385),
 (395, 0.11279831289979614),
 (351, 0.11086334121013705),
 (2955, 0.10733879248627892),
 (2008, 0.10557537813675008),
 (4133, 0.10315612022576615),
 (872, 0.100940165998464),
 (4375, 0.10043681488005729),
 (49, 0.10010493540602186),
 (439, 0.09959449524423225),
 (1409, 0.09913970980223767),
 (1985, 0.09840081075526325),
 (1362, 0.0982953118594129),
 (297, 0.0980668859018584),
 (622, 0.09777738121438276),
 (972, 0.09687270294155359),
 (2870, 0.09614603219981632),
 (1089, 0.09606602169348885),
 (1690, 0.09595180151333497),
 (1515, 0.09561848358353106),
 (298, 0.09539170170298292),
 (4231, 

In [20]:
def get_info(index):
    return df[df.index==index]["title"].values[0]+": "+df[df.index==index]["cast"].values[0]
i = 0
for movie in sorted_sm:
    print(get_info(movie[0]))
    i = i+1
    if i > 10:
        break

Titanic: Kate Winslet Leonardo DiCaprio Frances Fisher Billy Zane Kathy Bates
Revolutionary Road: Leonardo DiCaprio Kate Winslet Michael Shannon Kathryn Hahn David Harbour
Raise the Titanic: M. Emmet Walsh Richard Jordan David Selby Anne Archer Alec Guinness
The Day the Earth Stood Still: Keanu Reeves Jennifer Connelly Kathy Bates Jaden Smith John Cleese
The Chambermaid on the Titanic: Olivier Martinez Aitana S\u00e1nchez-Gij\u00f3n Romane Bohringer Didier Bezace
Time Bandits: Sean Connery John Cleese Shelley Duvall Michael Palin Ian Holm
Cheri: Michelle Pfeiffer Kathy Bates Rupert Friend Felicity Jones Iben Hjejle
The Aviator: Leonardo DiCaprio Cate Blanchett Kate Beckinsale John C. Reilly Alec Baldwin
Almost Famous: Kate Hudson Billy Crudup Frances McDormand Jason Lee Patrick Fugit
Gangs of New York: Leonardo DiCaprio Daniel Day-Lewis Cameron Diaz Liam Neeson Brendan Gleeson
Little Black Book: Brittany Murphy Holly Hunter Kathy Bates Ron Livingston Kevin Sussman
