In [None]:
# ML project: movie recommendation system built on the movies-5000 dataset (movie name, director, actor, budget, genre)

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import warnings

In [2]:
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

In [3]:
# Load the pre-cleaned movie table; the preview below shows the columns
# "Unnamed: 0", "title" and "tags".
# NOTE(review): "Unnamed: 0" looks like an index column saved into the CSV —
# confirm whether it is needed.
movies=pd.read_csv("Cleaned_Movies.csv")

In [4]:
movies

Unnamed: 0,title,tags
0,avatar,"in the 22nd century, a parapleg marin is dispa..."
1,pirates of the caribbean: at world's end,"captain barbossa, long believ to be dead, ha c..."
2,spectre,a cryptic messag from bond’ past send him on a...
3,the dark knight rises,follow the death of district attorney harvey d...
4,john carter,"john carter is a war-weary, former militari ca..."
...,...,...
4801,el mariachi,el mariachi just want to play hi guitar and ca...
4802,newlyweds,a newlyw couple' honeymoon is upend by the arr...
4803,"signed, sealed, delivered","""signed, sealed, delivered"" introduc a dedic q..."
4804,shanghai calling,when ambiti new york attorney sam is sent to s...


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# Bag-of-words vectorizer: vocabulary capped at 5000 terms, English stop words removed.
cv=CountVectorizer(max_features=5000,stop_words="english")

In [7]:
# Fit the vocabulary on the tags column and build the movie-by-term count matrix.
# .toarray() turns the result into a dense array; with the shape (4806, 5000) and
# int64 dtype shown in the outputs below, that is roughly 190 MB.
vectors=cv.fit_transform(movies.tags).toarray()

In [8]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [9]:
vectors.shape

(4806, 5000)

In [10]:
vectors[0]       # count vector of the first movie (one entry per vocabulary term)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [11]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [12]:
cv.vocabulary_

{'century': 736,
 'marin': 2803,
 'dispatch': 1286,
 'moon': 3013,
 'pandora': 3278,
 'uniqu': 4680,
 'mission': 2980,
 'becom': 441,
 'torn': 4552,
 'follow': 1729,
 'order': 3231,
 'protect': 3530,
 'alien': 157,
 'action': 79,
 'adventur': 106,
 'fantasi': 1629,
 'sciencefict': 3924,
 'cultureclash': 1084,
 'futur': 1807,
 'societi': 4147,
 'spacetravel': 4193,
 'futurist': 1809,
 'romanc': 3805,
 'space': 4187,
 'tribe': 4603,
 'alienplanet': 160,
 'soldier': 4154,
 'battl': 424,
 '3d': 47,
 'zoesaldana': 4993,
 'sigourneyweav': 4074,
 'jamescameron': 2327,
 'captain': 676,
 'long': 2696,
 'believ': 453,
 'dead': 1150,
 'ha': 1962,
 'come': 913,
 'life': 2647,
 'head': 2020,
 'edg': 1407,
 'earth': 1395,
 'turner': 4626,
 'elizabeth': 1435,
 'noth': 3168,
 'quit': 3568,
 'ocean': 3194,
 'drugabus': 1357,
 'exoticisland': 1571,
 'loveofone': 2729,
 'slif': 4126,
 'traitor': 4582,
 'shipwreck': 4050,
 'ship': 4049,
 'allianc': 167,
 'afterlif': 119,
 'fighter': 1680,
 'pirat': 3390,


In [63]:
# Stemming step. NOTE: the tags previewed above already look stemmed (e.g. "parapleg marin"), so this step is probably not needed.

In [13]:
import nltk
from nltk.stem.porter import PorterStemmer

In [14]:
# Single Porter stemmer instance, used by stemming() below.
ps=PorterStemmer()

In [18]:
# Helper function that applies stemming to every word of a movie's tags

In [19]:
def stemming(txt):
    """Return ``txt`` with each whitespace-separated token replaced by its stem.

    Tokens are stemmed with the module-level ``ps`` stemmer and joined back
    with single spaces, so any run of whitespace in the input collapses to
    one space and leading/trailing whitespace is dropped.
    """
    return " ".join(ps.stem(token) for token in txt.split())

In [21]:
# Stem every word of the tags column, overwriting it in place.
# NOTE(review): `vectors` was already built from this column in an earlier cell,
# so the count vectors do not reflect this pass; re-running this cell also stems
# the already-stemmed text again.
movies["tags"] = movies["tags"].apply(stemming)

In [22]:
movies.tags[0]

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav jamescameron'

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
# The full matrix is computed here only to inspect its shape; the next cell computes it again.
cosine_similarity(vectors).shape  # similarity of every movie with every other movie

(4806, 4806)

In [25]:
similarity=cosine_similarity(vectors)   # movie-by-movie cosine similarity matrix, kept for the recommender

In [26]:
import pickle
# Persist the similarity matrix. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way through.
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

In [29]:
def recommend_multiple(movies_list, top_n=3):
    """Print combined recommendations for one or more movies.

    For each requested title, the ``top_n`` most similar movies are read from
    the module-level ``similarity`` matrix, whose row ``i`` is matched to the
    ``i``-th row (by position) of the module-level ``movies`` frame. The
    per-title results are merged, de-duplicated in first-seen order and
    printed as a list of titles.

    Args:
        movies_list: Iterable of movie titles, or a single title string.
            Titles are lowercased before being compared with
            ``movies["title"]``.
        top_n: Number of recommendations to collect per requested title.

    Raises:
        IndexError: If a requested title is not present in ``movies["title"]``.
        ValueError: If ``top_n`` is negative.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(movies_list, str):
        movies_list = [movies_list]
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    titles = movies["title"].to_numpy()

    # Resolve each title to a row *position*, so the lookup into `similarity`
    # stays correct even when the frame does not carry a default 0..n-1 index.
    query_rows = []
    for movie in movies_list:
        wanted = movie.lower()
        matches = np.flatnonzero((movies["title"] == wanted).to_numpy())
        if matches.size == 0:
            raise IndexError(f"movie not found in the dataset: {movie!r}")
        query_rows.append(int(matches[0]))
    queried = set(query_rows)

    recommended_movies = []
    for row in query_rows:
        # Stable argsort of the negated scores: descending similarity, ties
        # kept in row order.
        ranked = np.argsort(-np.asarray(similarity[row]), kind="stable")
        # Drop the requested movies by position instead of assuming the query
        # movie is ranked first (another movie can tie with it at 1.0).
        best = [j for j in ranked.tolist() if j not in queried][:top_n]
        recommended_movies.extend(titles[j] for j in best)

    # dict.fromkeys de-duplicates while keeping a deterministic order.
    print(list(dict.fromkeys(recommended_movies)))
# Example: recommendations for two titles at once. Capitalized input works
# because recommend_multiple lowercases the titles before looking them up.
movies_to_recommend = ["Avatar","Titanic"]
recommend_multiple(movies_to_recommend)

['the notebook', 'aliens vs predator: requiem', 'ghost ship', 'aliens', 'falcon rising', 'under the same moon']
