In [1]:
import numpy as np
import pandas as pd

## Loding Dataset

In [2]:
movies_data = pd.read_csv(r'C:\Users\pranj\Desktop\Datasets\Recommender System Datasets\movies.csv')
movies_data

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [3]:
movies_data.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [4]:
movies_data.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

### Text Cleaning using Regex

In [5]:
import re

def clean_title(title):
    title = re.sub("[^a-zA-Z0-9 ]", "", title)
    return title

In [6]:
movies_data['cleaned_titles'] = movies_data['title'].apply(clean_title)
movies_data['cleaned_titles']

0                               Toy Story 1995
1                                 Jumanji 1995
2                        Grumpier Old Men 1995
3                       Waiting to Exhale 1995
4             Father of the Bride Part II 1995
                         ...                  
9737    Black Butler Book of the Atlantic 2017
9738                 No Game No Life Zero 2017
9739                                Flint 2017
9740          Bungo Stray Dogs Dead Apple 2018
9741          Andrew Dice Clay Dice Rules 1991
Name: cleaned_titles, Length: 9742, dtype: object

In [7]:
movies_data

Unnamed: 0,movieId,title,genres,cleaned_titles
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,Black Butler Book of the Atlantic 2017
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,No Game No Life Zero 2017
9739,193585,Flint (2017),Drama,Flint 2017
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,Bungo Stray Dogs Dead Apple 2018


## Loading Tf-idf Transformer

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf = vectorizer.fit_transform(movies_data['cleaned_titles'])
tfidf

<9742x33421 sparse matrix of type '<class 'numpy.float64'>'
	with 70422 stored elements in Compressed Sparse Row format>

## Content Based Movie Recommendation

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search_movie(title):
    title = clean_title(title)
    query_vector = vectorizer.transform([title])
    similarity = cosine_similarity(query_vector, tfidf).flatten()
    indices = np.argpartition(similarity, -3)[-5:]
    results = movies_data.title.iloc[indices].iloc[::-1]
    
    return results

In [10]:
search_movie("avengers")

7693          Avengers, The (2012)
1611          Avengers, The (1998)
6148      Ultimate Avengers (2006)
9488    Ultimate Avengers 2 (2006)
9153        Masked Avengers (1981)
Name: title, dtype: object

In [11]:
ratings_data = pd.read_csv('Downloads/ratings.csv')
ratings_data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [12]:
ratings_df = ratings_data.drop(['timestamp'], axis = 'columns')
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [13]:
ratings_df.dtypes

userId       int64
movieId      int64
rating     float64
dtype: object

In [14]:
new_df = pd.merge(movies_data,ratings_df, on= 'movieId')
new_df

Unnamed: 0,movieId,title,genres,cleaned_titles,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995,1,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995,5,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995,7,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995,15,2.5
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995,17,4.5
...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,Black Butler Book of the Atlantic 2017,184,4.0
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,No Game No Life Zero 2017,184,3.5
100833,193585,Flint (2017),Drama,Flint 2017,184,3.5
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,Bungo Stray Dogs Dead Apple 2018,184,3.5


## Importing Surprise library

In [15]:
from surprise import Dataset, Reader
from surprise import accuracy, KNNBasic, similarities
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import predictions

In [16]:
reader = Reader(line_format='user item rating ',sep=',', skip_lines=1, rating_scale=(1,5))
data = Dataset.load_from_file('Downloads/ratings.csv', reader=reader)
data

<surprise.dataset.DatasetAutoFolds at 0x7f0d47390700>

In [17]:
trainset = data.build_full_trainset()
trainset

<surprise.trainset.Trainset at 0x7f0d474fec40>

## Using KNNBasic algorithm from surprise library to train my model

In [18]:
sim_options = {
    "name": "cosine",
    "user_based": False,
}
algo = KNNBasic(sim_options=sim_options)
similarity_matrix = algo.fit(trainset = trainset).compute_similarities()

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [19]:
# Cross Validation

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True);

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9721  0.9758  0.9768  0.9782  0.9781  0.9762  0.0022  
MAE (testset)     0.7599  0.7607  0.7603  0.7588  0.7611  0.7602  0.0008  
Fit time          6.49    3.83    5.17    3.97    3.82    4.66    1.05    
Test time         4.07    4.19    4.16    4.51    4.01    4.19    0.17    


In [20]:
import heapq
from collections import defaultdict
from operator import itemgetter

In [21]:
test_iid = '307' # using this user id to get recommend movies
iid = trainset.to_inner_uid(test_iid)
test_subject_ratings = trainset.ur[iid]
k_neighbors = heapq.nlargest(10, test_subject_ratings, key=lambda t: t[1])

##  Collaborative Filtering Movie Recommendation Systems

In [22]:
candidates = defaultdict(float)

for itemID, rating in k_neighbors:
    try:
        similaritities = similarity_matrix[itemID]
        for innerID, score in enumerate(similaritities):
            candidates[innerID] += score * (rating / 5.0)
    except:
        continue

In [23]:
# Getting movie name from the movie id 

import os
import csv
movieID_to_name = {}
Genres = {}
with open('Downloads/movies.csv', newline='', encoding='ISO-8859-1') as csvfile:
    movie_reader = csv.reader(csvfile)
    next(movie_reader)
    for row in movie_reader:
        movieID = int(row[0])
        titles = row[1]
        movieID_to_name[movieID] = titles

In [28]:
def getMovieName(movieID):
    if int(movieID) in movieID_to_name:
        return movieID_to_name[int(movieID)]
    else:
        return ""

In [30]:

watched = {}
for itemID, rating in trainset.ur[iid]:
    watched[itemID] = 1

    
recommendations = []
n = 0
for itemID, rating_sum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
    if not itemID in watched:
        recommendations.append(getMovieName(trainset.to_raw_iid(itemID)))
        n += 1
    if (n > 10): 
        break # I want only top 10

for movies in recommendations:
    print("Movie: ", movies)

Movie:  Hideaway (1995)
Movie:  Stealing Home (1988)
Movie:  General, The (1998)
Movie:  Any Which Way You Can (1980)
Movie:  Dragon Ball Z: Super Android 13! (Doragon bÃ´ru Z 7: Kyokugen batoru!! San dai sÃ»pÃ¢ saiyajin) (1992)
Movie:  Dragon Ball Z: Cooler's Revenge (Doragon bÃ´ru Z 5: Tobikkiri no saikyÃ´ tai saikyÃ´) (1991)
Movie:  Nightwatch (1997)
Movie:  Deadly Friend (1986)
Movie:  Whatever It Takes (2000)
Movie:  Ultimate Avengers (2006)
Movie:  Justice League: The New Frontier (2008) 


####                                                                                                                          Pranjal Tripathi