In [3]:
import pandas as pd
import numpy as np

movies_df = pd.read_csv('movies.csv',usecols=['movieId','title'], dtype={'movieId': 'int32', 'title': 'str'})
rating_df=pd.read_csv('ratings.csv',usecols=['userId', 'movieId', 'rating'], dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

In [4]:
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [5]:
rating_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [6]:
df = pd.merge(rating_df,movies_df,on='movieId')
df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [7]:
combine_movie_rating = df.dropna(axis = 0, subset = ['title'])
movie_ratingCount = (combine_movie_rating.
     groupby(by = ['title'])['rating'].
     count().
     reset_index().
     rename(columns = {'rating': 'totalRatingCount'})
     [['title', 'totalRatingCount']]
    )
movie_ratingCount.head()

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [8]:
rating_with_totalRatingCount = combine_movie_rating.merge(movie_ratingCount, left_on = 'title', right_on = 'title', how = 'left')
# rating_with_totalRatingCount.head()

In [9]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(movie_ratingCount['totalRatingCount'].describe())

count   9719.000
mean      10.375
std       22.406
min        1.000
25%        1.000
50%        3.000
75%        9.000
max      329.000
Name: totalRatingCount, dtype: float64


In [20]:
popularity_threshold = 40
rating_popular_movie= rating_with_totalRatingCount.query('totalRatingCount >= @popularity_threshold')
# rating_popular_movie.head()

In [11]:
rating_popular_movie.shape

(49630, 5)

In [12]:
##Pivot matrix

movie_features_df=rating_popular_movie.pivot_table(index='title',columns='userId',values='rating').fillna(0)
movie_features_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians (1996),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians (One Hundred and One Dalmatians) (1961),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
from scipy.sparse import csr_matrix

####### to be saved
movie_features_df_matrix = csr_matrix(movie_features_df.values)

from sklearn.neighbors import NearestNeighbors


model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')

## save this model
model_knn.fit(movie_features_df_matrix)

In [14]:
import pickle as pk

filename = 'movie_features_df.pkl'
filename2 = 'model_knn.pkl'

# Save the DataFrame to a file using pickle
with open(filename, 'wb') as file:
    pk.dump(movie_features_df, file)

with open(filename2, 'wb') as file:
    pk.dump(model_knn, file)



In [28]:

print(movie_features_df.head(30))

userId                                               1     2     3     4    \
title                                                                        
(500) Days of Summer (2009)                        0.000 0.000 0.000 0.000   
10 Things I Hate About You (1999)                  0.000 0.000 0.000 0.000   
101 Dalmatians (1996)                              0.000 0.000 0.000 0.000   
101 Dalmatians (One Hundred and One Dalmatians)... 0.000 0.000 0.000 0.000   
12 Angry Men (1957)                                0.000 0.000 0.000 5.000   
2001: A Space Odyssey (1968)                       0.000 0.000 0.000 0.000   
28 Days Later (2002)                               0.000 0.000 0.000 0.000   
300 (2007)                                         0.000 0.000 0.000 0.000   
40-Year-Old Virgin, The (2005)                     0.000 0.000 0.000 0.000   
50 First Dates (2004)                              0.000 0.000 0.000 0.000   
8 Mile (2002)                                      0.000 0.000 0

In [16]:

def getMovieIndex( movie_title):
    for i in movie_features_df:
        if movie_features_df.index[i] == movie_title:
            return i  
    return None



In [17]:
def Recommend_movies_knn( movie_title):
    # Get the index of the specified movie title
    query_index = getMovieIndex(movie_title)
    
    # Use the KNN model to find nearest neighbors
    distances, indices = model_knn.kneighbors(movie_features_df.iloc[query_index, :].values.reshape(1, -1), n_neighbors=11)
    
    recommended_movies = []
    for i in range(len(distances.flatten())):
        if i == 0:
            print(f"Recommendations for {movie_features_df.index[query_index]}:\n")
        else:
            recommended_movie = movie_features_df.index[indices.flatten()[i]]
            recommended_distance = distances.flatten()[i]
            recommended_movies.append((recommended_movie, recommended_distance))
    
    return recommended_movies


In [18]:
Recommend_movies_knn("Toy Story (1995)")

Recommendations for Toy Story (1995):



[('Toy Story 2 (1999)', 0.42739868),
 ('Jurassic Park (1993)', 0.4343632),
 ('Independence Day (a.k.a. ID4) (1996)', 0.4357382),
 ('Star Wars: Episode IV - A New Hope (1977)', 0.44261175),
 ('Forrest Gump (1994)', 0.452904),
 ('Lion King, The (1994)', 0.4588548),
 ('Star Wars: Episode VI - Return of the Jedi (1983)', 0.4589107),
 ('Mission: Impossible (1996)', 0.46108717),
 ('Groundhog Day (1993)', 0.46583116),
 ('Back to the Future (1985)', 0.4696188)]

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import accuracy_score

# Split data into train and test sets
train_data, test_data = train_test_split(movie_features_df, test_size=0.2, random_state=42)

# Initialize and fit the KNN model
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(train_data)

# Make predictions on the test data
distances, indices = model_knn.kneighbors(test_data.values, n_neighbors=10)

# Calculate predicted ratings (e.g., mean ratings of neighbors for each item in the test set)
# Ensure that predicted_ratings has the same length as actual_ratings (test_data)
predicted_ratings = []
for i in range(len(test_data)):
    neighbors_indices = indices[i]
    mean_rating = movie_features_df.iloc[neighbors_indices, :].mean(axis=0)  # Calculate mean rating of neighbors
    predicted_ratings.append(mean_rating)

# Convert predicted_ratings to a numpy array
predicted_ratings = np.array(predicted_ratings)

# Flatten actual_ratings and predicted_ratings to align for R2 score calculation
actual_ratings = test_data.values.flatten()
predicted_ratings = predicted_ratings.flatten()

# Calculate R2 score
# r2 = r2_score(actual_ratings, predicted_ratings)
# print(f"R2 Score: {r2}")

import numpy as np

# Define a threshold (e.g., ±0.5) for considering predictions as accurate
threshold = 0.5

# Calculate the percentage of predictions within the threshold
accurate_predictions = np.abs(predicted_ratings - actual_ratings) <= threshold
accuracy = np.mean(accurate_predictions) * 100  # Convert to percentage

print(f"Accuracy : {accuracy:.2f}%")

Accuracy : 66.68%
