In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import fuzz

## Reading data

In [None]:
#Column names for the movies file; it is read with header=None, so the names are supplied here
name=['movie_id','movie_name','genres']
#A multi-character separator such as '::' is only supported by the python parsing engine.
#Requesting that engine explicitly avoids the ParserWarning pandas emits when it has to
#fall back from the default C engine.
movie=pd.read_csv('/content/drive/MyDrive/ML_PROJECTS/movie/Group12_401_406_421_461/movies.csv',header=None,names=name,sep='::',engine='python')

  


In [None]:
#Column names for the ratings file; it is read with header=None, so the names are supplied here
name=['user_id','movie_id','rating','timestamp']
#A multi-character separator such as '::' is only supported by the python parsing engine.
#Requesting that engine explicitly avoids the ParserWarning pandas emits when it has to
#fall back from the default C engine.
rating=pd.read_csv('/content/drive/MyDrive/ML_PROJECTS/movie/Group12_401_406_421_461/ratings.csv',header=None,names=name,sep='::',engine='python')

  


In [None]:
#Dimensions of the movie table as (rows, columns)
movie.shape

(3883, 3)

In [None]:
#Preview the first rows of the movie table as loaded
movie.head()

Unnamed: 0,movie_id,movie_name,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


The dataset contains 3,883 movies.

In [None]:
#Strip the release year (and anything else in parentheses) from a movie name
def title(x):
  """Return the part of ``x`` that precedes its first '(' with surrounding whitespace removed.

  For a name such as 'Toy Story (1995)' this yields 'Toy Story'. A name that
  contains no '(' is returned unchanged apart from the whitespace trimming.
  """
  head,_,_=x.partition('(')
  return head.strip()

In [None]:
##Derive a clean title column, remove '|' from the genres column and drop the unused timestamp.
#The cell is safe to re-run: the columns are only dropped when they are still present,
#so a second execution no longer fails with KeyError on 'movie_name' / 'timestamp'.
if 'movie_name' in movie.columns:
  movie['title']=movie['movie_name'].map(title)
  movie=movie.drop(columns='movie_name')
#regex=False: '|' must be treated as a literal character, not as regex alternation
movie['genres']=movie['genres'].str.replace("|","",regex=False)
rating=rating.drop(columns=['timestamp'],errors='ignore')

In [None]:
#Preview the movie table after the title/genres clean-up
movie.head()

Unnamed: 0,movie_id,genres,title
0,1,AnimationChildren'sComedy,Toy Story
1,2,AdventureChildren'sFantasy,Jumanji
2,3,ComedyRomance,Grumpier Old Men
3,4,ComedyDrama,Waiting to Exhale
4,5,Comedy,Father of the Bride Part II


In [None]:
#Dimensions of the ratings table as (rows, columns)
rating.shape

(1000209, 3)

In [None]:
#Preview the first rows of the ratings table
rating.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


The dataset contains 1,000,209 ratings in total, given by all the users.

In [None]:
#Count the distinct users and the distinct movies that appear in the ratings table
num_users=rating['user_id'].unique().size
num_movie=rating['movie_id'].unique().size
print('There are {} diffrent users and {} unique movies'.format(num_users,num_movie))

There are 6040 diffrent users and 3706 unique movies


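Users who have rated fewer than 50 movies provide little information: most of their entries in the user–movie matrix would be missing (NaN), which only increases its sparsity. The same reasoning applies to movies that have been rated fewer than 50 times, so both are filtered out below.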

In [None]:
#Number of ratings each movie received, as a one-column frame ('count') indexed by movie_id
movie_cnt=rating.groupby('movie_id').size().to_frame(name='count')
movie_cnt.head()

Unnamed: 0_level_0,count
movie_id,Unnamed: 1_level_1
1,2077
2,701
3,478
4,170
5,296


In [None]:
#Dropping the movies that were rated fewer than `thres` times
thres=50
#ids of the movies whose rating count reaches the threshold
top=list(set(movie_cnt.index[movie_cnt['count']>=thres]))
ratings_drop_movies=rating.loc[rating['movie_id'].isin(top)]
print('shape of original ratings data: ',rating.shape)
print('shape of ratings data after dropping unpopular movies: ',ratings_drop_movies.shape)

shape of original ratings data:  (1000209, 3)
shape of ratings data after dropping unpopular movies:  (977839, 3)


In [None]:
#Number of ratings each user gave (after the movie filter), as a one-column frame indexed by user_id
users_cnt=ratings_drop_movies.groupby('user_id').size().to_frame(name='count')
users_cnt.head()

Unnamed: 0_level_0,count
user_id,Unnamed: 1_level_1
1,53
2,128
3,51
4,21
5,192


In [None]:
#Dropping the users who rated fewer than `thres` of the remaining movies
thres=50
#ids of the users whose rating count reaches the threshold
top_users=list(set(users_cnt.index[users_cnt['count']>=thres]))
ratings_drop_users=ratings_drop_movies.loc[ratings_drop_movies['user_id'].isin(top_users)]
print('shape of original ratings data: ',rating.shape)
print('shape of ratings data after dropping both unpopular movies and inactive users: ',ratings_drop_users.shape)

shape of original ratings data:  (1000209, 3)
shape of ratings data after dropping both unpopular movies and inactive users:  (920334, 3)


In [None]:
#Pivot the filtered ratings into a table with one row per movie_id and one column per user_id;
#combinations without a rating are filled with 0
user_movie_mat=(
  ratings_drop_users
  .pivot(index='movie_id',columns='user_id',values='rating')
  .fillna(0)
)
#Map each movie title to the position of its row in the matrix.
#NOTE: titles that repeat (e.g. once the year has been stripped) collapse onto a single key
#and keep only the last row position - verify whether the data contains such duplicates.
mapping={
  name:pos
  for pos,name in enumerate(
    movie.set_index('movie_id').loc[user_movie_mat.index].title.tolist()
  )
}
#Sparse copy of the matrix values, used to fit and query the neighbour model
user_movie_mat_sparse=csr_matrix(user_movie_mat.values)

In [None]:
#Nearest-neighbour model with cosine distance, used for recommending movies;
#each row of the fitted matrix is one movie
model=NearestNeighbors(
  n_neighbors=20,
  algorithm='brute',
  metric='cosine',
  n_jobs=-1,
)
model.fit(user_movie_mat_sparse)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

In [None]:
#returns the mapper value of the best possible match for a movie_name
#Uses fuzz ratio to calculate how similar the two names are
#Fuzz ratio uses Levenshtein distance
def similar_name(mapper,movie_name,verbose=True,threshold=60):
  """Find the title in ``mapper`` that is most similar to ``movie_name``.

  Parameters
  ----------
  mapper : dict
    Maps a title (str) to the value that is returned for that title.
  movie_name : str
    The name to look up; the comparison is case-insensitive.
  verbose : bool, default True
    When True, print the candidate titles (or 'No match found').
    When False, nothing is printed.
  threshold : int, default 60
    Minimum ``fuzz.ratio`` score a title needs to count as a candidate.

  Returns
  -------
  The ``mapper`` value of the highest-scoring candidate, or None when no
  title reaches ``threshold``.
  """
  query=movie_name.lower()
  match=[]
  for name,idx in mapper.items():
    ratio=fuzz.ratio(name.lower(),query)
    if ratio >= threshold:
      match.append((name,idx,ratio))
  #Ascending stable sort followed by a reversal: the highest ratio comes first and,
  #among equal ratios, the title that appears later in mapper comes first
  match=sorted(match,key=lambda x:x[2])[::-1]
  if not match:
    if verbose:
      print('No match found')
    return None
  if verbose:
    print("Possible matches :{0}\n".format([x[0] for x in match]))
  return match[0][1]

In [None]:
#function to print the movies nearest to a given input movie, sorted by distance in descending order
def recommend(model,data,mapper,movie_name,no_of_reccomendations):
  """Print the ``no_of_reccomendations`` nearest neighbours of the movie matching ``movie_name``.

  Parameters
  ----------
  model : object with ``fit(data)`` and ``kneighbors(row, n_neighbors=...)``
    The neighbour model; it is (re)fitted on ``data`` on every call.
  data : matrix indexable by row position
    ``data[i]`` is the row handed to ``model.kneighbors`` for movie ``i``.
  mapper : dict
    Maps a title to its row position in ``data``.
  movie_name : str
    Name to look up; the closest title is chosen by ``similar_name``.
  no_of_reccomendations : int
    How many neighbours to print.

  Returns
  -------
  None. The result is printed, farthest neighbour first and closest last.
  Nothing is recommended when no title matches ``movie_name``.
  """
  model.fit(data)
  query_idx=similar_name(mapper,movie_name,verbose=True)
  if query_idx is None:
    #similar_name found no candidate; there is no row to query
    return
  print('Possible Recommendations are:\n')
  #one extra neighbour is requested because the query movie is normally its own nearest neighbour
  distance,index=model.kneighbors(data[query_idx],n_neighbors=no_of_reccomendations+1)
  #ravel() keeps a 1-element result iterable (squeeze() would collapse it to a scalar).
  #The query movie is removed by index, not by position, so ties at distance 0 cannot
  #cause a different movie to be dropped instead.
  neighbours=[
    (idx,dist)
    for idx,dist in zip(index.ravel().tolist(),distance.ravel().tolist())
    if idx!=query_idx
  ]
  neighbours.sort(key=lambda pair:pair[1])
  #keep the closest ones, then reverse so they are listed in descending distance
  raw=neighbours[:no_of_reccomendations][::-1]
  rev_mapper={v: k for k,v in mapper.items()}
  for rank,(idx,dist) in enumerate(raw,start=1):
    print('{0}: {1}, with distance of {2}'.format(rank,rev_mapper[idx],dist))

In [None]:
#Example: print 10 recommendations for one movie
movie_name='Jumanji'
recommend(
  model=model,
  data=user_movie_mat_sparse,
  mapper=mapping,
  movie_name=movie_name,
  no_of_reccomendations=10,
)

Possible matches :['Jumanji', 'Marnie', 'Junior', 'Jules and Jim']

Possible Recommendations are:

1: Willow, with distance of 0.536251983706779
2: Star Wars: Episode I - The Phantom Menace, with distance of 0.5315780268251189
3: Willy Wonka and the Chocolate Factory, with distance of 0.5266764062455752
4: Mask, The, with distance of 0.522934852329396
5: Santa Clause, The, with distance of 0.5206212969385237
6: NeverEnding Story, The, with distance of 0.5161046438346868
7: Honey, I Shrunk the Kids, with distance of 0.5148619624645505
8: Indian in the Cupboard, The, with distance of 0.5104359575439594
9: Dragonheart, with distance of 0.4971789638378191
10: Hook, with distance of 0.42160679514015376
