In [2]:
import pandas as pd
from sklearn.feature_extraction.text  import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
def combined_features(row):
    """Build one text blob for a movie from its descriptive fields.

    The values of `genres`, `keywords`, `tagline`, `cast`, `director` and
    `overview` are read from `row` (in that order) and joined with single
    spaces. Every value must already be a string; empty strings are kept,
    so consecutive spaces can appear in the result.
    """
    ordered_columns = ('genres', 'keywords', 'tagline', 'cast', 'director', 'overview')
    return " ".join(row[column] for column in ordered_columns)

In [4]:
# Load the movie table from a CSV file given by a relative path; the rest of
# the notebook reads and extends this DataFrame under the name `df`.
df = pd.read_csv("movie_dataset.csv")

In [5]:
# Text columns that feed the combined feature string. Missing values are
# replaced by empty strings so that plain string concatenation works later.
features = ['keywords','cast','tagline','overview','genres','director']
df[features] = df[features].fillna('')

In [6]:
# Apply combined_features to every row (axis=1) and store the resulting text
# in a new 'combined_features' column.
df['combined_features'] = df.apply(combined_features,axis =1)

In [7]:
# Inspect the combined text column (pandas abbreviates the display).
df['combined_features']

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  A newlywed couple's honeymoon ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      A New Yorker in Shanghai Daniel Henney Eliza...
4802    Documentary obsession camcorder crush dream gi...
Name: combined_features, Length: 4803, dtype: object

In [8]:
# Token-count vectorizer created with scikit-learn's default settings.
cv = CountVectorizer()

In [9]:
# Fit the vectorizer on the combined text and transform it in one step:
# one row of token counts per movie.
count_matrix = cv.fit_transform(df["combined_features"])

In [10]:
# Show the counts as a dense array.
# NOTE(review): toarray() materialises the full dense matrix just for display;
# showing count_matrix.shape (or the sparse repr) would be cheaper.
count_matrix.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [11]:
# Pairwise cosine similarity between all rows of count_matrix; entry [i, j]
# is the similarity between movie i and movie j.
cosine_sim = cosine_similarity(count_matrix)

In [12]:
def get_index_from_title(title):
    """Return the index label of the first row of `df` whose title equals `title`.

    Parameters
    ----------
    title : str
        Title to look up; compared with exact equality against the
        ``title`` column of the module-level DataFrame ``df``.

    Returns
    -------
    The index label of the first matching row.

    Raises
    ------
    IndexError
        If no row has that title. The message names the missing title
        instead of the generic "index 0 is out of bounds" error.
    """
    matches = df.index[df["title"] == title]
    if len(matches) == 0:
        raise IndexError(f"no movie titled {title!r} found in the dataset")
    # Several rows may share a title; the first one in index order wins.
    return matches[0]

In [13]:
# NOTE(review): this repeats the identical cosine_similarity(count_matrix)
# computation from the In [11] cell; the result is the same, so this cell is
# redundant work and can be dropped.
cosine_sim = cosine_similarity(count_matrix)

In [14]:
# Inspect the similarity matrix (numpy abbreviates the display).
cosine_sim

array([[1.        , 0.35896368, 0.24467726, ..., 0.29233355, 0.21320072,
        0.11875422],
       [0.35896368, 1.        , 0.33243277, ..., 0.39097542, 0.2896669 ,
        0.16501336],
       [0.24467726, 0.33243277, 1.        , ..., 0.24353367, 0.21252557,
        0.15496777],
       ...,
       [0.29233355, 0.39097542, 0.24353367, ..., 1.        , 0.27749786,
        0.1636604 ],
       [0.21320072, 0.2896669 , 0.21252557, ..., 0.27749786, 1.        ,
        0.20254787],
       [0.11875422, 0.16501336, 0.15496777, ..., 0.1636604 , 0.20254787,
        1.        ]])

In [15]:
# Title of the movie to find neighbours for; it is looked up by exact match
# against the dataset's title column.
movie_user_likes= "Avatar"

In [17]:
# Resolve the chosen title to its row index in df.
movie_index = get_index_from_title(movie_user_likes)

In [19]:
# Pair every entry of the chosen movie's similarity row with its position:
# a list of (position, similarity score) tuples.
similar_movies = list(enumerate(cosine_sim[movie_index]))

In [20]:
# Inspect the (position, score) pairs.
# NOTE(review): this renders the entire list; a slice such as
# similar_movies[:10] would keep the output short.
similar_movies

[(0, 0.9999999999999993),
 (1, 0.3589636762825999),
 (2, 0.24467726152216654),
 (3, 0.2995211489365769),
 (4, 0.3721719838017242),
 (5, 0.26804385337361925),
 (6, 0.24765274416239186),
 (7, 0.38259321581983374),
 (8, 0.15386436372416593),
 (9, 0.21222047798472082),
 (10, 0.3228000831375015),
 (11, 0.2724343325001703),
 (12, 0.15666989036012804),
 (13, 0.3026154965396684),
 (14, 0.32054539114212877),
 (15, 0.3544127759602582),
 (16, 0.32328707534629597),
 (17, 0.26694755490067457),
 (18, 0.3102871190979353),
 (19, 0.36715653551125493),
 (20, 0.24283339755490588),
 (21, 0.265157211148662),
 (22, 0.2652073306484289),
 (23, 0.26702293491727636),
 (24, 0.2551772719413704),
 (25, 0.30910890806751007),
 (26, 0.3446561747421316),
 (27, 0.4050520238294316),
 (28, 0.2032242548312348),
 (29, 0.32448650760296144),
 (30, 0.2909435241329683),
 (31, 0.33371190623595726),
 (32, 0.33647740081812577),
 (33, 0.24244056893821345),
 (34, 0.14937887931959076),
 (35, 0.3007603811586847),
 (36, 0.376009516795

In [21]:
# Rank the (position, score) pairs from highest to lowest score. The sort is
# stable, so pairs with equal scores keep their original relative order.
sorted_similar_movie = list(similar_movies)
sorted_similar_movie.sort(key=lambda pair: pair[1], reverse=True)

In [22]:
# Inspect the ranked pairs.
# NOTE(review): this renders the entire list; a slice such as
# sorted_similar_movie[:10] would keep the output short.
sorted_similar_movie

[(0, 0.9999999999999993),
 (300, 0.43018214003639427),
 (1214, 0.4299400944881564),
 (342, 0.4246376136398213),
 (1532, 0.42065663167186607),
 (150, 0.42052043365896974),
 (420, 0.4178554470186724),
 (3159, 0.4175266607016781),
 (549, 0.4170827377577314),
 (85, 0.4151029962857377),
 (111, 0.414578098794425),
 (3185, 0.4130401011373831),
 (2697, 0.4129590035587613),
 (1985, 0.41097215501427986),
 (3669, 0.40934557021332185),
 (59, 0.4085140598367578),
 (1960, 0.40745600192971154),
 (461, 0.4073565930967534),
 (1760, 0.40705229290217204),
 (48, 0.40694652112156193),
 (27, 0.4050520238294316),
 (1295, 0.4027745886146973),
 (311, 0.4024717022032483),
 (3900, 0.40225434964529816),
 (1276, 0.40201512610368484),
 (329, 0.4012576068210588),
 (1472, 0.40111179825932497),
 (3193, 0.40072926222805266),
 (847, 0.4006264686689075),
 (1915, 0.3990346477910672),
 (3144, 0.39898070888278053),
 (1634, 0.39894181395195566),
 (3433, 0.3978994603492228),
 (3428, 0.3968806225809197),
 (156, 0.3965232271711

In [23]:
def get_title_from_index(index):
    """Return the title stored in `df` for the row whose index label is `index`.

    Rows are selected by comparing the DataFrame index with `index`; the
    title of the first matching row is returned. An IndexError is raised
    when no row matches.
    """
    matching_titles = df.loc[df.index == index, "title"]
    return matching_titles.values[0]

In [24]:
# Start the printed-title counter at zero.
i=0

In [26]:
# Print the titles of the 16 highest-scoring entries. In the recorded run the
# top entry is the liked movie itself (self-similarity ~1.0), so the output is
# that movie followed by its 15 nearest neighbours.
# Slicing + enumerate keeps this cell idempotent: it does not rely on `i`
# having been reset to 0 in a separate cell, so re-running it prints the same
# list. `i` still ends up holding the number of titles printed.
for i, movie in enumerate(sorted_similar_movie[:16], start=1):
    print(get_title_from_index(movie[0]))

Avatar
Starship Troopers
Aliens vs Predator: Requiem
Men in Black
Moonraker
Men in Black II
Hellboy II: The Golden Army
Alien
Sphere
Captain America: The Winter Soldier
Transformers
The Ice Pirates
Jason X
The Thief and the Cobbler
Capricorn One
2012
