#### Import data

In [78]:
import numpy as np
import pandas as pd

In [79]:
df = pd.read_csv('../Data/movies.csv')
df.head()

Unnamed: 0,id,title,description,release_date,rating,vote_count,genres,actors,director
0,27205,Inception,"Cobb, a skilled thief who commits corporate es...",2010-07-15,8.368,35811,"Action, Science Fiction, Adventure","Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W...",Christopher Nolan
1,157336,Interstellar,The adventures of a group of explorers who mak...,2014-11-05,8.434,34465,"Adventure, Drama, Science Fiction","Matthew McConaughey, Anne Hathaway, Michael Ca...",Christopher Nolan
2,155,The Dark Knight,Batman raises the stakes in his war on crime. ...,2008-07-16,8.515,32012,"Drama, Action, Crime, Thriller","Christian Bale, Heath Ledger, Michael Caine, G...",Christopher Nolan
3,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",2009-12-15,7.581,30907,"Action, Adventure, Fantasy, Science Fiction","Sam Worthington, Zoe Saldaña, Sigourney Weaver...",James Cameron
4,24428,The Avengers,When an unexpected enemy emerges and threatens...,2012-04-25,7.714,30090,"Science Fiction, Action, Adventure","Robert Downey Jr., Chris Evans, Mark Ruffalo, ...",Joss Whedon


In [80]:
df.shape

(10000, 9)

In [81]:
df.columns

Index(['id', 'title', 'description', 'release_date', 'rating', 'vote_count',
       'genres', 'actors', 'director'],
      dtype='object')

### Preprocessing

In [82]:
# Text preprocessing ('This is the <b> Horror </b> movie...' => 'horror movi')

import re
from nltk.stem import PorterStemmer
from nltk.downloader import download
from nltk.corpus import stopwords

download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_desc(text):
    text = str(text)

    #Lowercase
    text = text.lower()

    #Remove html tags
    text = re.sub(r'<.*?>', ' ', text)

    #Remove other Punctuation
    text = re.sub(r'[^\w]', ' ', text)

    #Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])

    #Stemming
    stemmer = PorterStemmer()
    text = stemmer.stem(text)

    return text
df['description'] = df['description'].apply(preprocess_desc)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sudarshan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [83]:
# Genre preprocessing (Science Fiction => sciencefiction)
def preprocess_genre(genre):
    genre = str(genre)
    return ' '.join(g.replace(' ', '').lower() for g in genre.split(','))
df['genres'] = df['genres'].apply(preprocess_genre)

In [84]:
df.head()

Unnamed: 0,id,title,description,release_date,rating,vote_count,genres,actors,director
0,27205,Inception,cobb skilled thief commits corporate espionage...,2010-07-15,8.368,35811,action sciencefiction adventure,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W...",Christopher Nolan
1,157336,Interstellar,adventures group explorers make use newly disc...,2014-11-05,8.434,34465,adventure drama sciencefiction,"Matthew McConaughey, Anne Hathaway, Michael Ca...",Christopher Nolan
2,155,The Dark Knight,batman raises stakes war crime help lt jim gor...,2008-07-16,8.515,32012,drama action crime thriller,"Christian Bale, Heath Ledger, Michael Caine, G...",Christopher Nolan
3,19995,Avatar,22nd century paraplegic marine dispatched moon...,2009-12-15,7.581,30907,action adventure fantasy sciencefiction,"Sam Worthington, Zoe Saldaña, Sigourney Weaver...",James Cameron
4,24428,The Avengers,unexpected enemy emerges threatens global safe...,2012-04-25,7.714,30090,sciencefiction action adventure,"Robert Downey Jr., Chris Evans, Mark Ruffalo, ...",Joss Whedon


In [85]:
df['actors'][0]

'Leonardo DiCaprio, Joseph Gordon-Levitt, Ken Watanabe, Tom Hardy, Elliot Page'

In [86]:
# Format names of actors and directors
def format_person_name(name):
    name = str(name)
    return ' '.join(n.replace(' ', '').lower() for n in name.split(','))

# Take only top 3 actors of movie
def pick_top_3(actors):
    return ' '.join(actors.split()[:3])

In [87]:
df['actors'] = df['actors'].apply(format_person_name)
df['actors'] = df['actors'].apply(pick_top_3)
df['director'] = df['director'].apply(format_person_name)

In [88]:
df['actors'][0]

'leonardodicaprio josephgordon-levitt kenwatanabe'

In [89]:
df.head()

Unnamed: 0,id,title,description,release_date,rating,vote_count,genres,actors,director
0,27205,Inception,cobb skilled thief commits corporate espionage...,2010-07-15,8.368,35811,action sciencefiction adventure,leonardodicaprio josephgordon-levitt kenwatanabe,christophernolan
1,157336,Interstellar,adventures group explorers make use newly disc...,2014-11-05,8.434,34465,adventure drama sciencefiction,matthewmcconaughey annehathaway michaelcaine,christophernolan
2,155,The Dark Knight,batman raises stakes war crime help lt jim gor...,2008-07-16,8.515,32012,drama action crime thriller,christianbale heathledger michaelcaine,christophernolan
3,19995,Avatar,22nd century paraplegic marine dispatched moon...,2009-12-15,7.581,30907,action adventure fantasy sciencefiction,samworthington zoesaldaña sigourneyweaver,jamescameron
4,24428,The Avengers,unexpected enemy emerges threatens global safe...,2012-04-25,7.714,30090,sciencefiction action adventure,robertdowneyjr. chrisevans markruffalo,josswhedon


### Vectorization

In [90]:
from sklearn.feature_extraction.text import CountVectorizer

Vectorize description, genres, top 3 actors and directors seprately with count vectorizer

In [91]:
Desc_to_vec = CountVectorizer(max_features=5000, stop_words='english')
desc_vec = Desc_to_vec.fit_transform(df['description'])
desc_vec = pd.DataFrame.sparse.from_spmatrix(desc_vec)

In [92]:
Genre_to_vec = CountVectorizer()
genre_vec = Genre_to_vec.fit_transform(df['genres'])
genre_vec = pd.DataFrame.sparse.from_spmatrix(genre_vec)

In [93]:
Genre_to_vec.get_feature_names_out()

array(['action', 'adventure', 'animation', 'comedy', 'crime',
       'documentary', 'drama', 'family', 'fantasy', 'history', 'horror',
       'music', 'mystery', 'nan', 'romance', 'sciencefiction', 'thriller',
       'tvmovie', 'war', 'western'], dtype=object)

In [94]:
Crew_to_vec = CountVectorizer(max_features=500)
crew_vector = Crew_to_vec.fit_transform(df['actors'], df['director'])
crew_vector = pd.DataFrame.sparse.from_spmatrix(crew_vector)

In [95]:
print(desc_vec.shape)
print(genre_vec.shape)
print(crew_vector.shape)

(10000, 5000)
(10000, 20)
(10000, 500)


In [96]:
train_data = pd.concat([desc_vec, genre_vec, crew_vector], axis=1)
train_data.shape

(10000, 5520)

### Cosine Similarity

In [97]:
from sklearn.metrics.pairwise import cosine_similarity

In [98]:
similarity = cosine_similarity(X=train_data)

In [99]:
similarity[0].shape

(10000,)

In [100]:
similarity[0]

array([1.        , 0.0978232 , 0.0325128 , ..., 0.        , 0.06428243,
       0.05504819])

In [101]:
def id_to_index(id):
    return df[df['id'] == id].index[0]

def index_to_id(index):
    return df['id'][index]

In [102]:
def get_top_index_per_row(arr, k=20):
    # Get the number of rows and columns
    num_rows, num_cols = arr.shape
    
    # Initialize an empty array to store the top k values for each row
    top_k_indices_per_row = np.empty((num_rows, k), dtype=int)    

    for i in range(num_rows):
       
        top_k_indices = np.argpartition(similarity[i], -k)[-k:]
        
        # Sort the top k indices by the corresponding values in descending order
        sorted_top_k_indices = top_k_indices[np.argsort(similarity[i, top_k_indices])[::-1]]
        
        # Store the sorted top k indices in the result array
        top_k_indices_per_row[i] = sorted_top_k_indices
    
    return top_k_indices_per_row

similarity_indexed = get_top_index_per_row(similarity, 20)

In [103]:
similarity_indexed

array([[   0, 3763,  416, ..., 1440, 6962, 7594],
       [   1, 4183, 2658, ..., 7464, 1646, 4021],
       [   2,   25, 6956, ...,   18, 2609, 9722],
       ...,
       [9997, 8926, 5812, ..., 6451, 3282, 6476],
       [9998, 7718, 1095, ..., 4793, 2846, 4161],
       [9999, 9781, 7595, ..., 9179, 8409, 8702]])

In [104]:
def recommand(id, start_rank = 1, end_rank = 10):
    movies = []

    # Find index of given id
    try:
        index = id_to_index(id)
    except:
        return 'Invalid id'
    movies_rec = similarity_indexed[index][start_rank:end_rank+1]
    for m in movies_rec:
        movies.append(index_to_id(m))
    return movies

In [105]:
recommand(155)

[49026, 736074, 9546, 736073, 272, 40662, 14919, 414, 268, 80389]

In [106]:
m = 299536
print('movie => ', df['title'][id_to_index(m)])
print('recommandations =>')
for id in recommand(m, start_rank=1, end_rank=10):
    print(df['title'][id_to_index(id)])

movie =>  Avengers: Infinity War
recommandations =>
Avengers: Endgame
Avengers: Age of Ultron
Teenage Mutant Ninja Turtles
Spy Kids 3-D: Game Over
Captain America: Civil War
Green Lantern
Marvel One-Shot: The Consultant
Ant-Man
Thor: Ragnarok
Angel's Egg


#### Save Similarity indexed

In [109]:
import pickle

pickle.dump(similarity_indexed, open('similarity.pkl', 'wb'))