In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
movies = pd.read_csv('imdb.csv')
movies.head()

Unnamed: 0,imdbID,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0000009,Miss Jerry,Miss Jerry,1894,\N,45,Romance,5.3,208
1,tt0000147,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,1897,\N,100,"Documentary,News,Sport",5.3,493
2,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,\N,70,"Action,Adventure,Biography",6.0,863
3,tt0000591,The Prodigal Son,L'enfant prodigue,1907,\N,90,Drama,5.0,21
4,tt0000630,Hamlet,Amleto,1908,\N,\N,Drama,2.9,27


In [4]:
movies = movies[['imdbID', 'primaryTitle', 'genres', 'averageRating', 'numVotes']]

In [5]:
movies['genres'] = movies['genres'].apply(lambda x: "".join(x))

In [6]:
movies['genres'] = movies['genres'].apply(lambda x: [genre.strip() for genre in x.rstrip(',').split(',') if genre])

In [7]:
movies['genres'].iloc[1]

['Documentary', 'News', 'Sport']

In [8]:
movies.head()

Unnamed: 0,imdbID,primaryTitle,genres,averageRating,numVotes
0,tt0000009,Miss Jerry,[Romance],5.3,208
1,tt0000147,The Corbett-Fitzsimmons Fight,"[Documentary, News, Sport]",5.3,493
2,tt0000574,The Story of the Kelly Gang,"[Action, Adventure, Biography]",6.0,863
3,tt0000591,The Prodigal Son,[Drama],5.0,21
4,tt0000630,Hamlet,[Drama],2.9,27


In [9]:
movies = movies.copy() 
movies['genres'] = movies['genres'].apply(lambda x:" ".join(x))
movies['genres'] = movies['genres'].apply(lambda x:x.lower())

In [10]:
movies.head()

Unnamed: 0,imdbID,primaryTitle,genres,averageRating,numVotes
0,tt0000009,Miss Jerry,romance,5.3,208
1,tt0000147,The Corbett-Fitzsimmons Fight,documentary news sport,5.3,493
2,tt0000574,The Story of the Kelly Gang,action adventure biography,6.0,863
3,tt0000591,The Prodigal Son,drama,5.0,21
4,tt0000630,Hamlet,drama,2.9,27


In [14]:
movies['tags'] = movies['genres'] + " " + movies['averageRating'].astype(str)

In [15]:
movies['tags'][1]

'documentary news sport 5.3'

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
count_vector = CountVectorizer(max_features = 5000, stop_words = 'english')

In [24]:
movies_sample = movies.sample(n=5000, random_state=42)
vectors = count_vector.fit_transform(movies_sample['tags'])
similarity_matrix = cosine_similarity(vectors)

In [25]:
feature_names = count_vector.get_feature_names_out()

In [26]:
feature_names

array(['10', 'action', 'adventure', 'animation', 'biography', 'comedy',
       'crime', 'documentary', 'drama', 'family', 'fantasy', 'fi', 'film',
       'history', 'horror', 'music', 'musical', 'mystery', 'news', 'noir',
       'reality', 'romance', 'sci', 'sport', 'talk', 'thriller', 'tv',
       'war', 'western'], dtype=object)

In [27]:
import nltk

In [28]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [29]:
def stem(txt):
    l = []
    for i in txt.split():
        l.append(ps.stem(i))
    return " ".join(l)

In [30]:
movies['tags'] = movies['tags'].apply(stem)

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
cosine_similarity(vectors)

array([[1.        , 0.        , 1.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.57735027],
       [1.        , 0.        , 1.        , ..., 1.        , 0.        ,
        0.        ],
       ...,
       [1.        , 0.        , 1.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.57735027, 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [33]:
similarity = cosine_similarity(vectors)

In [43]:
def recommend(movie):
    movie_idx = movies[movies['primaryTitle'] == movie].index[0]
    distance = similarity[movie_idx]
    movie_list = sorted(list(enumerate(distance)), reverse = True, key = lambda x: x[1])[1:6]
    for i in movie_list:
        print(movies.iloc[i[0]].primaryTitle)

In [44]:
movies.head()

Unnamed: 0,imdbID,primaryTitle,genres,averageRating,numVotes,tags
0,tt0000009,Miss Jerry,romance,5.3,208,romanc 5.3
1,tt0000147,The Corbett-Fitzsimmons Fight,documentary news sport,5.3,493,documentari news sport 5.3
2,tt0000574,The Story of the Kelly Gang,action adventure biography,6.0,863,action adventur biographi 6.0
3,tt0000591,The Prodigal Son,drama,5.0,21,drama 5.0
4,tt0000630,Hamlet,drama,2.9,27,drama 2.9


In [45]:
recommend('Miss Jerry')

The Story of the Kelly Gang
A Dog's Tale
The Four Devils
Les Misérables, Part 1: Jean Valjean
Sapho


In [46]:
import pickle

In [48]:
pickle.dump(movies.to_dict(), open('movies_dict1.pkl','wb'))

In [49]:
pickle.dump(similarity,open('similarity1.pkl','wb'))

In [50]:
pickle.dump(movies['numVotes'],open('votes.pkl','wb'))