In [1]:
import warnings
# NOTE(review): this blanket filter silences every warning category for the rest of
# the session, so pandas chained-assignment warnings and library deprecation notices
# raised by later cells are hidden as well. Consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the movie table from a CSV file located relative to the working directory.
data = pd.read_csv('movies_cleaned_data.csv')

In [4]:
data.head()

Unnamed: 0,movie_id,genres,keywords,overview,cast,crew,title
0,19995,"['Action', 'Adventure', 'Fantasy', 'ScienceFic...","['cultureclash', 'future', 'spacewar', 'spacec...","['In', 'the', '22nd', 'century,', 'a', 'parapl...","['SamWorthington', 'ZoeSaldana', 'SigourneyWea...",['JamesCameron'],Avatar
1,285,"['Adventure', 'Fantasy', 'Action']","['ocean', 'drugabuse', 'exoticisland', 'eastin...","['Captain', 'Barbossa,', 'long', 'believed', '...","['JohnnyDepp', 'OrlandoBloom', 'KeiraKnightley']",['GoreVerbinski'],Pirates of the Caribbean: At World's End
2,206647,"['Action', 'Adventure', 'Crime']","['spy', 'basedonnovel', 'secretagent', 'sequel...","['A', 'cryptic', 'message', 'from', 'Bond’s', ...","['DanielCraig', 'ChristophWaltz', 'LéaSeydoux']",['SamMendes'],Spectre
3,49026,"['Action', 'Crime', 'Drama', 'Thriller']","['dccomics', 'crimefighter', 'terrorist', 'sec...","['Following', 'the', 'death', 'of', 'District'...","['ChristianBale', 'MichaelCaine', 'GaryOldman']",['ChristopherNolan'],The Dark Knight Rises
4,49529,"['Action', 'Adventure', 'ScienceFiction']","['basedonnovel', 'mars', 'medallion', 'spacetr...","['John', 'Carter', 'is', 'a', 'war-weary,', 'f...","['TaylorKitsch', 'LynnCollins', 'SamanthaMorton']",['AndrewStanton'],John Carter


In [5]:
# For this genre-only recommender the tag text is just the genres column.
data['tags'] = data['genres']

In [6]:
# Take an explicit copy of the three columns: movie_genre['tags'] is overwritten in a
# later cell, and assigning into a column subset of `data` without .copy() is the
# chained-assignment pattern that pandas reports as SettingWithCopyWarning.
movie_genre = data[['movie_id', 'title', 'tags']].copy()

In [7]:
movie_genre.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"['Action', 'Adventure', 'Fantasy', 'ScienceFic..."
1,285,Pirates of the Caribbean: At World's End,"['Adventure', 'Fantasy', 'Action']"
2,206647,Spectre,"['Action', 'Adventure', 'Crime']"
3,49026,The Dark Knight Rises,"['Action', 'Crime', 'Drama', 'Thriller']"
4,49529,John Carter,"['Action', 'Adventure', 'ScienceFiction']"


# Stemming

In [8]:
import nltk
from nltk.stem.porter import PorterStemmer

In [9]:
# Single shared stemmer instance; stem() below reads it as a global.
ps = PorterStemmer()

In [10]:
def stem(text):
    """Stem every whitespace-separated token of ``text`` and re-join with single spaces.

    Tokens come from ``str.split()`` with no arguments, so runs of whitespace
    collapse and any punctuation attached to a word is passed to the stemmer as
    part of that token. Stemming is delegated to the global ``ps`` object.

    Parameters
    ----------
    text : str
        The text to stem.

    Returns
    -------
    str
        The stemmed tokens separated by single spaces ("" for empty input).
    """
    return " ".join(ps.stem(token) for token in text.split())

In [11]:
# Replace each tag string with its stemmed form. The column is overwritten in place,
# so re-running this cell feeds already-stemmed text through stem() again.
movie_genre['tags'] = movie_genre['tags'].map(stem)

In [12]:
movie_genre.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"['action', 'adventure', 'fantasy', 'sciencefic..."
1,285,Pirates of the Caribbean: At World's End,"['adventure', 'fantasy', 'action']"
2,206647,Spectre,"['action', 'adventure', 'crime']"
3,49026,The Dark Knight Rises,"['action', 'crime', 'drama', 'thriller']"
4,49529,John Carter,"['action', 'adventure', 'sciencefiction']"


In [13]:
movie_genre[movie_genre['title']== 'King Kong'].tags

24    ['adventure', 'drama', 'action']
Name: tags, dtype: object

In [14]:
movie_genre[movie_genre['title']== 'X-Men: Apocalypse'].movie_id

64    246655
Name: movie_id, dtype: int64

In [15]:
movie_genre.iloc[373].title

'Mission to Mars'

# Vectorizing

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer: vocabulary capped at 5000 terms, English stop words removed.
cv = TfidfVectorizer(stop_words='english', max_features=5000)

In [17]:
# Learn the vocabulary from the tag strings, then densify the resulting matrix
# into a regular NumPy array (one row per movie, one column per term).
vectors = (
    cv.fit_transform(movie_genre['tags'])
    .toarray()
)

In [18]:
vectors

array([[0.40623644, 0.46953905, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.48056392, 0.55544876, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.51323771, 0.59321401, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.88742277, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [19]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement when the installed version provides it, and fall back to
# the old method on older releases so the cell runs either way.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = list(cv.get_feature_names_out())
else:
    feature_names = cv.get_feature_names()
feature_names

['action',
 'adventure',
 'animation',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'family',
 'fantasy',
 'foreign',
 'history',
 'horror',
 'music',
 'mystery',
 'romance',
 'sciencefiction',
 'thriller',
 'tvmovie',
 'war',
 'western']

# Calculating Similarity

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
# Square matrix of pairwise cosine similarities between the rows of `vectors`;
# entry [i, j] compares row i with row j.
similarity_genres = cosine_similarity(vectors)

In [22]:
similarity_genres

array([[1.        , 0.8453328 , 0.487033  , ..., 0.        , 0.        ,
        0.        ],
       [0.8453328 , 1.        , 0.57614351, ..., 0.        , 0.        ,
        0.        ],
       [0.487033  , 0.57614351, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [23]:
def recommend_genres(movie, top_n=200):
    """Print the titles of the movies whose genre vectors are most similar to ``movie``.

    Reads the globals ``movie_genre`` (DataFrame with a 'title' column) and
    ``similarity_genres`` (square similarity matrix whose rows follow the row
    order of ``movie_genre``).

    Parameters
    ----------
    movie : str
        Exact title to look up in ``movie_genre['title']``. If the title occurs
        more than once, the first occurrence is used.
    top_n : int, default 200
        Maximum number of recommendations to print.

    Returns
    -------
    None
        Titles are printed one per line, most similar first.

    Raises
    ------
    IndexError
        If no row of ``movie_genre`` has the requested title.
    """
    # Work with row *positions*: the similarity matrix is positional, so an index
    # label is only correct when the frame happens to have a default RangeIndex.
    positions = np.flatnonzero((movie_genre['title'] == movie).to_numpy())
    if positions.size == 0:
        raise IndexError(f"movie {movie!r} not found in movie_genre['title']")
    movies_index = int(positions[0])

    distances = similarity_genres[movies_index]
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)

    # Exclude the query movie by position instead of dropping the first entry:
    # sorted() is stable, so when an earlier row ties with it (identical genres give
    # similarity 1.0) that row comes first and the query movie would recommend itself.
    movies_list = [pair for pair in ranked if pair[0] != movies_index][:top_n]

    for position, _score in movies_list:
        print(movie_genre['title'].iloc[position])

    return

In [24]:
recommend_genres('Avatar')

Superman Returns
Man of Steel
X-Men: Days of Future Past
Jupiter Ascending
The Wolverine
Superman
Superman II
Beastmaster 2: Through the Portal of Time
Teenage Mutant Ninja Turtles
Mystery Men
Small Soldiers
Superman III
Sheena
The Fifth Element
The Shadow
Underworld: Rise of the Lycans
Dragonball Evolution
Jumper
Hellboy II: The Golden Army
The Hunger Games
Suicide Squad
Hellboy
Mortal Kombat: Annihilation
Highlander: The Final Dimension
Highlander: Endgame
Pirates of the Caribbean: At World's End
Spider-Man 3
Batman v Superman: Dawn of Justice
Pirates of the Caribbean: Dead Man's Chest
Pirates of the Caribbean: On Stranger Tides
The Hobbit: The Battle of the Five Armies
The Amazing Spider-Man
Spider-Man 2
The Amazing Spider-Man 2
The Mummy: Tomb of the Dragon Emperor
The Hobbit: An Unexpected Journey
Warcraft
Thor: The Dark World
Thor
Pirates of the Caribbean: The Curse of the Black Pearl
Clash of the Titans
The 13th Warrior
The Lord of the Rings: The Fellowship of the Ring
The Mummy

In [25]:
# Rank every row by similarity to row 64 (the position the earlier title lookup
# returned for 'X-Men: Apocalypse') and keep the 200 best (position, score) pairs.
# The query row is excluded by position instead of slicing off the first entry:
# sorted() is stable, so a lower-numbered row that ties at similarity 1.0 would
# otherwise come first and be dropped in its place.
query_position = 64
ranked_pairs = sorted(
    enumerate(similarity_genres[query_position]),
    key=lambda pair: pair[1],
    reverse=True,
)
movie_id_file = [pair for pair in ranked_pairs if pair[0] != query_position][:200]

In [26]:
movie_id_file

[(373, 1.0),
 (422, 1.0),
 (549, 1.0),
 (1421, 1.0),
 (1560, 1.0),
 (2904, 1.0),
 (2938, 1.0),
 (3028, 1.0),
 (3218, 1.0),
 (3962, 1.0),
 (4259, 1.0),
 (4280, 1.0),
 (4744, 1.0),
 (301, 0.8781046691286767),
 (1018, 0.8781046691286767),
 (1870, 0.8781046691286767),
 (2093, 0.8781046691286767),
 (2644, 0.8781046691286767),
 (2757, 0.8781046691286767),
 (3159, 0.8781046691286767),
 (3400, 0.8781046691286767),
 (3628, 0.8781046691286767),
 (4383, 0.8781046691286767),
 (4402, 0.8781046691286767),
 (4654, 0.8781046691286767),
 (4691, 0.8781046691286767),
 (4703, 0.8781046691286767),
 (375, 0.8440657436462732),
 (2021, 0.8440657436462732),
 (2488, 0.8440657436462732),
 (2611, 0.8440657436462732),
 (3538, 0.8440657436462732),
 (3993, 0.8440657436462732),
 (4161, 0.8440657436462732),
 (502, 0.8078051553921789),
 (723, 0.8078051553921789),
 (1278, 0.8078051553921789),
 (1908, 0.8078051553921789),
 (2660, 0.8078051553921789),
 (2748, 0.8078051553921789),
 (2822, 0.8078051553921789),
 (2870, 0.807

In [27]:
import numpy as np

# Pack the (row position, similarity) pairs into a 2-D float32 array. The integer
# positions are stored as floats too (e.g. 373 becomes 373.0).
my_array = np.array(movie_id_file, dtype=np.float32)

In [28]:
my_array[0][0]

373.0

In [29]:
# Collect the first element of every (position, score) pair. Despite the variable
# name these are row positions produced by enumerate(), not values from the
# movie_id column. Iterating the pairs directly avoids the IndexError that a fixed
# range(0, 200) raises whenever movie_id_file holds fewer than 200 entries.
recommendmovie_id = [position for position, _score in movie_id_file]

In [30]:
recommendmovie_id

[373,
 422,
 549,
 1421,
 1560,
 2904,
 2938,
 3028,
 3218,
 3962,
 4259,
 4280,
 4744,
 301,
 1018,
 1870,
 2093,
 2644,
 2757,
 3159,
 3400,
 3628,
 4383,
 4402,
 4654,
 4691,
 4703,
 375,
 2021,
 2488,
 2611,
 3538,
 3993,
 4161,
 502,
 723,
 1278,
 1908,
 2660,
 2748,
 2822,
 2870,
 3431,
 4009,
 4761,
 4790,
 74,
 223,
 224,
 266,
 415,
 582,
 634,
 854,
 1156,
 1273,
 1614,
 1656,
 3093,
 3464,
 4018,
 4181,
 4192,
 4198,
 2867,
 2796,
 256,
 675,
 1329,
 1965,
 3409,
 239,
 454,
 545,
 720,
 1114,
 1479,
 1731,
 2642,
 2731,
 2829,
 3560,
 4038,
 4545,
 4696,
 4767,
 4799,
 165,
 257,
 1289,
 18,
 311,
 319,
 362,
 1434,
 2161,
 2786,
 3188,
 4405,
 4453,
 4626,
 2003,
 2158,
 2636,
 4431,
 2393,
 2701,
 2846,
 3151,
 3742,
 3891,
 4137,
 4288,
 4469,
 4519,
 4748,
 4604,
 95,
 270,
 363,
 2381,
 3572,
 4336,
 4079,
 43,
 93,
 279,
 366,
 449,
 476,
 487,
 601,
 658,
 718,
 997,
 1005,
 1275,
 1322,
 1405,
 1418,
 1497,
 1574,
 1639,
 1832,
 2001,
 2053,
 2204,
 2259,
 2326,
 24