In [1]:
import pandas as pd 
import numpy as np 
import os 

In [2]:
movies = pd.read_csv('movies.csv')
rating = pd.read_csv('ratings.csv') 

In [3]:
rating.head() 

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
movies.head() 

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
data = pd.merge(rating, movies, on = 'movieId')
data.head() 

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [6]:
data.shape 

(100836, 6)

In [8]:
unique_genres = set() 

for genres in data['genres']: 
    genre_list = genres.split('|')
    for genre in genre_list:
        unique_genres.add(genre) 

In [10]:
genres 

'Action|Crime|Drama|Thriller'

In [9]:
all_genres = sorted(list(unique_genres))
all_genres 

['(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [11]:
def genre_to_vector(genres, all_genres):
    vector = [0] * len(all_genres) 
    genre_list = genres.split('|') 
    for genre in genre_list: 
        if genre in all_genres:
            index = all_genres.index(genre) 
        vector[index] = 1 
    return vector                     

In [12]:
data['genres'][1] 

'Comedy|Romance'

In [17]:
genre_to_vector(data['genres'][1], all_genres) 


[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]

In [18]:
moviedict = {} 

for index, row in data.iterrows(): 
    movieId = row['movieId'] 
    title = row['title'] 
    genre_vector = genre_to_vector(row['genres'], all_genres) 
    rating = row['rating'] 
    moviedict[movieId] = (title, genre_vector, rating) 

In [19]:
moviedict[1] 

('Toy Story (1995)',
 [0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 5.0)

In [20]:
from scipy.spatial.distance import cosine 

def compute_distance(movie_a, movie_b): 
    genre_distance = cosine(movie_a[1], movie_b[1]) 
    rating_distance = abs(movie_a[2] - movie_b[2]) 
    total_distance = genre_distance + rating_distance 
    return total_distance, genre_distance, rating_distance 

In [21]:
compute_distance(moviedict[1], moviedict[2]) 

(np.float64(3.2254033307585166), np.float64(0.2254033307585166), 3.0)

In [22]:
import operator 

def get_neighbors(movie_id, k):
    distances = []
    for movie in moviedict:
        if movie != movie_id:
            dist = compute_distance(moviedict[movie_id], moviedict[movie])
            distances.append((movie, dist)) 

    distances.sort(key=operator.itemgetter(1)) 

    neighbors = [] 
    for i in range(min(k, len(distances))):
        neighbors.append(distances[i][0])
    return neighbors                     

In [23]:
get_neighbors(1,5) 

[3114, 91355, 166461, 78499, 117851]

In [24]:
movie_id = int(input("Enter the movie ID:")) 

movie_name = moviedict[movie_id]
print("The movie name is :", movie_name[0])
neighbors = get_neighbors(movie_id, k=5)
print("Movies Recommended")
for neighbor in neighbors:
    
    print(f"{moviedict[neighbor][0]} {moviedict[neighbor][2]}")


The movie name is : Georgia (1995)
Movies Recommended
Doors, The (1991) 4.0
In the Name of the Father (1993) 4.0
Amadeus (1984) 4.0
Elizabeth (1998) 4.0
Nights of Cabiria (Notti di Cabiria, Le) (1957) 4.0
