In [1]:
import pandas as pd
import numpy as np
import random

movies = pd.read_csv("movieData/movies.csv")

ratings = pd.read_csv("movieData/ratings.csv")

In [2]:
movies.set_index("movieId", inplace=True)

counts = ratings["movieId"].value_counts()

movies["ratingsCount"] = counts

In [3]:
averageRatings = ratings.groupby("movieId").mean()["rating"]

movies["averageRatings"] = averageRatings

In [4]:
movies.sort_values(["ratingsCount", "averageRatings"], ascending=False)

minimumRatingsSubset = movies.query(f"ratingsCount >= 100").sort_values("averageRatings", ascending=False)

SAMPLE_USER = 1

In [5]:
def findUserRatings(userId):
  
  userRatings = ratings.query(f"userId == {userId}")
  
  return userRatings[["movieId", "rating"]].set_index("movieId")

In [6]:
def calculateDistance(user_1, user_2):
  
  user1_ratings = findUserRatings(user_1)
  user2_ratings = findUserRatings(user_2)
  
  ratingsComparison = user1_ratings.join(user2_ratings, 
                                         lsuffix="_user1", 
                                         rsuffix="_user2").dropna()
  
  user1_compared = ratingsComparison["rating_user1"]
  user2_compared = ratingsComparison["rating_user2"]

  distance = np.linalg.norm(user1_compared - user2_compared)
  
  return [user_1, user_2, distance]

In [7]:
def findRelativeDistances(userId):
  
  users = ratings["userId"].unique()
  
  users = users[users != userId]
  
  distances = [calculateDistance(userId, everyId) for everyId in users]
  
  return pd.DataFrame(distances, columns=["comparedUserId", "userId", "distance"])

In [8]:
def findClosestUsers(userId):
  distancesToUser = findRelativeDistances(userId)
  
  distancesToUser = distancesToUser.sort_values("distance")
  
  return distancesToUser

In [9]:
def makeRecommendation(userId):
  
  userRatings = findUserRatings(userId)
  
  similarUsers = findClosestUsers(userId)
  
  mostSimilarId = similarUsers.iloc[0]
  
  mostSimilarUserRatings = findUserRatings(mostSimilarId.name)
  
  unwatchedMovies = mostSimilarUserRatings.drop(userRatings.index,
                                                errors="ignore")
  
  unwatchedMovies.sort_values("rating", ascending=False)
  
  recommendations = unwatchedMovies.join(movies)
  
  return recommendations

In [10]:
NumberOfNeighbors = 5

def findKNearestNeighbors(userId, k = NumberOfNeighbors):
  
  distancesToUser = findRelativeDistances(userId)
  
  distancesToUser = distancesToUser.sort_values("distance")
  
  distancesToUser = distancesToUser.set_index("userId")
  
  return distancesToUser.head(k)

In [11]:
def makeKNNRecommendation(userId):
  
  topNeighbors = findKNearestNeighbors(userId)
  
  ratingsByIndex = ratings.set_index("userId")
  
  topSimilarRatings = ratingsByIndex.loc[topNeighbors.index]
  
  topSimilarRatingAverage = topSimilarRatings.groupby("movieId").mean()[["rating"]]
  
  recommended_movie = topSimilarRatingAverage.sort_values("rating", ascending=False)
  
  return recommended_movie.join(movies)

In [12]:
NumberOfMovies = 15

minIndex = 1

maxIndex = movies.shape[0]

testUserWatchedMovies = []

for i in range(0, NumberOfMovies):
  
  randomMovieIndex = random.randint(minIndex, maxIndex)
  
  testUserWatchedMovies.append(randomMovieIndex)

minimumRating = 0

maximumRating = 5

testUserRatings = []

for index in range(0, NumberOfMovies):
  
  random_rating = random.randint(minimumRating, maximumRating)
  
  testUserRatings.append(random_rating)
  
user_data = [list(index) for index in zip(testUserWatchedMovies, testUserRatings)]

In [13]:
def addTestUserToDataframe(user_data):
  
  newId = ratings["userId"].max()+1
  
  newUserDataframe = pd.DataFrame(user_data, columns=["movieId", "rating"])
  
  newUserDataframe["userId"] = newId
  
  return pd.concat([ratings, newUserDataframe])

addTestUserToDataframe(user_data)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703.0
1,1,3,4.0,964981247.0
2,1,6,4.0,964982224.0
3,1,47,5.0,964983815.0
4,1,50,5.0,964982931.0
...,...,...,...,...
10,611,8197,3.0,
11,611,9475,5.0,
12,611,6068,3.0,
13,611,6500,0.0,


In [14]:
testUserId = 611

makeKNNRecommendation(testUserId)

Unnamed: 0_level_0,rating,title,genres,ratingsCount,averageRatings
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
63082,5.0,Slumdog Millionaire (2008),Crime|Drama|Romance,71.0,3.809859
2403,5.0,First Blood (Rambo: First Blood) (1982),Action|Adventure|Drama|Thriller,30.0,3.550000
2572,5.0,10 Things I Hate About You (1999),Comedy|Romance,54.0,3.527778
2571,5.0,"Matrix, The (1999)",Action|Sci-Fi|Thriller,278.0,4.192446
53996,5.0,Transformers (2007),Action|Sci-Fi|Thriller|IMAX,39.0,3.346154
...,...,...,...,...,...
7004,0.5,Kindergarten Cop (1990),Action|Comedy|Crime|Thriller,17.0,2.882353
1681,0.5,Mortal Kombat: Annihilation (1997),Action|Adventure|Fantasy,7.0,1.928571
4678,0.5,UHF (1989),Comedy,9.0,3.444444
4679,0.5,Uncle Buck (1989),Comedy,13.0,2.884615
