# `KNN Implementation Notebook`

### `Importing Libraries`

This section contains the Libraries we have used
1. pandas for dataframe processes
2. sklearn and Scipy for using built-in KNN, cosine similarity and Mean Absolute Error
3. Matplotlib for graphing

In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial import distance_matrix
from scipy.spatial.distance import euclidean, pdist, squareform
from scipy import sparse
import numpy as np
import matplotlib.pyplot as plt
import math
import sklearn.metrics
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')
np.random.seed(92)



In [2]:
data = pd.read_csv("ml-100k/u.data",sep="\t", header=None)
data.columns = ['user id', 'movie id', 'rating', 'timestamp']
data = data.drop(columns=['timestamp'])
data.head()

Unnamed: 0,user id,movie id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [3]:
movies = pd.read_csv("ml-100k/u.item",
                    sep="|", encoding='latin-1', header=None)
movies.columns = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL',
                 'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
                 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
                 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
num_movies = data['movie id'].nunique()
print(num_movies)
movies.head()

1682


Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
map_id_movie = {}
for id,row in movies.iterrows():
  map_id_movie[row['movie id']] = row['movie title']
data['title'] = data['movie id'].apply(lambda x: map_id_movie.get(x, 'Unknown'))


### `Describe the Data`   

### `KNN Implementation`

#### `1. Pivot data`
  - create a table that is user to movies are are filled inside with ratings
  - Normalize data
    - find average
    - subtract average from all ratings
    - fill null with zero

In [5]:
#pivoting the data
user_ratings_pivot1 = data.pivot_table(index='movie id',columns='user id',values='rating')
user_ratings_pivot1.head()

user id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
movie id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,4.0,,,4.0,4.0,,,,4.0,...,2.0,3.0,4.0,,4.0,,,5.0,,
2,3.0,,,,3.0,,,,,,...,4.0,,,,,,,,,5.0
3,4.0,,,,,,,,,,...,,,4.0,,,,,,,
4,3.0,,,,,,5.0,,,4.0,...,5.0,,,,,,2.0,,,
5,3.0,,,,,,,,,,...,,,,,,,,,,


In [6]:
#filling missing values by centralizing
avg_ratings = user_ratings_pivot1.mean(axis = 0)
user_ratings_pivot2 = user_ratings_pivot1.sub(avg_ratings)
user_ratings_pivot2 = user_ratings_pivot2.fillna(0)
user_ratings_pivot2.head()

user id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
movie id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.389706,0.290323,0.0,0.0,1.125714,0.364929,0.0,0.0,0.0,-0.206522,...,-1.701149,-0.923077,0.253521,0.0,0.731481,0.0,0.0,0.954545,0.0,0.0
2,-0.610294,0.0,0.0,0.0,0.125714,0.0,0.0,0.0,0.0,0.0,...,0.298851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.589286
3,0.389706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.253521,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.610294,0.0,0.0,0.0,0.0,0.0,1.034739,0.0,0.0,-0.206522,...,1.298851,0.0,0.0,0.0,0.0,0.0,-1.457944,0.0,0.0,0.0
5,-0.610294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
print(avg_ratings[1])
target_user = user_ratings_pivot2.iloc[[1]]
target_user.head()


3.610294117647059


user id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
movie id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,-0.610294,0.0,0.0,0.0,0.125714,0.0,0.0,0.0,0.0,0.0,...,0.298851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.589286


`3. make a dataframe with users who watched this particular movie `

In [8]:
users_watched_this_movie = user_ratings_pivot1[1]
mask = users_watched_this_movie.notnull()
mask[2] = False
df_filled_users_watched_thismovie = user_ratings_pivot2[mask]
users_watched_this_movie = users_watched_this_movie.drop(2)
users_watched_this_movie.dropna(inplace = True)


In [9]:
df_filled_users_watched_thismovie.head()

user id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
movie id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.389706,0.290323,0.0,0.0,1.125714,0.364929,0.0,0.0,0.0,-0.206522,...,-1.701149,-0.923077,0.253521,0.0,0.731481,0.0,0.0,0.954545,0.0,0.0
3,0.389706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.253521,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.610294,0.0,0.0,0.0,0.0,0.0,1.034739,0.0,0.0,-0.206522,...,1.298851,0.0,0.0,0.0,0.0,0.0,-1.457944,0.0,0.0,0.0
5,-0.610294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.389706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.727273,0.0,...,0.0,0.0,1.253521,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
users_watched_this_movie.head()

movie id
1    5.0
3    4.0
4    3.0
5    3.0
6    5.0
Name: 1, dtype: float64

#### `3. Use the KNeighborsRegressor`

`Using cosine similarity`

In [11]:
from sklearn.neighbors import KNeighborsRegressor
user_knn = KNeighborsRegressor(metric='cosine', n_neighbors=10)
user_knn.fit(df_filled_users_watched_thismovie, users_watched_this_movie)
user_user_pred = user_knn.predict(target_user)
print(user_user_pred)

[3.4]


`Using Euclidean Distance`

In [12]:
user_knn3 = KNeighborsRegressor(metric='euclidean', n_neighbors=3)
user_knn3.fit(df_filled_users_watched_thismovie, users_watched_this_movie)
user_user_pred3 = user_knn3.predict(target_user)
print(user_user_pred3)

[3.66666667]


`Using Manhattan Distance`

In [13]:
user_knn4 = KNeighborsRegressor(metric='manhattan', n_neighbors=3)
user_knn4.fit(df_filled_users_watched_thismovie, users_watched_this_movie)
user_user_pred4 = user_knn4.predict(target_user)
print(user_user_pred4)

[3.66666667]


### `Find datapoints that have values`

To be able to evaluate properly, we need to find the datapoints that are already filled with data. Those will be the positions of those in the matrix to be able to refer to them efficiently, without repeatedily trying to find the values with values.

In [14]:
def fitting_data(index):
  cnt = 0
  l = []
  original = []
  list_columns = user_ratings_pivot1.columns
  p = 0
  for i in user_ratings_pivot1.iloc[index]:
    if p == 50:
      break
    if not(np.isnan(i)):
      p += 1
      l.append(list_columns[cnt])
      original.append(i)
    cnt = cnt + 1
  return l,original

In [15]:
def similarity(metric ,movie, list_not_empty , k):
    l = []
    target_user = user_ratings_pivot2.iloc[[movie]]
    for i in list_not_empty:
        users_watched_this_movie = user_ratings_pivot1[i]
        mask = users_watched_this_movie.notnull()
        mask[movie+1] = False
        df_filled_users_watched_thismovie = user_ratings_pivot2[mask]
        users_watched_this_movie = users_watched_this_movie.drop(movie+1)
        users_watched_this_movie.dropna(inplace = True)
        user_knn = KNeighborsRegressor(metric= metric , n_neighbors=k)
        user_knn.fit(df_filled_users_watched_thismovie, users_watched_this_movie)
        user_user_pred = user_knn.predict(target_user)
        l.append(user_user_pred[0])
    return l


In [16]:
def hypertuning():
  best_k= None
  best_error = 2
  k_results =[]
  s = np.random.choice(num_movies,5,replace=False)
  for i in range(1,12):
      test_pred = []
      y_pred = []
      for movie in s:
        l,original = fitting_data(movie)
        test_pred.extend(similarity('cosine',movie, l, i))
        y_pred.extend(original)
      evaluation = sklearn.metrics.mean_squared_error(test_pred,y_pred)
      k_results.append(evaluation)
      print("The evaluation at k = "+ str(i) + " equals to "+ str(evaluation))
      if evaluation < best_error:
          best_k = i
          best_error = evaluation

  print("---------------------Final Best result at K => " + str(best_k)+ "-------------------------------")
  return best_k

In [17]:
def movie_recomendation(user,num_recommendations):
  best_k = hypertuning()
  movie_ratings_of_user = user_ratings_pivot1[user]
  mask = movie_ratings_of_user.notnull()
  X_train = user_ratings_pivot2[mask]
  movie_ratings_of_user.dropna(inplace=True)
  user_knn = KNeighborsRegressor(metric= 'cosine' , n_neighbors=best_k)
  user_knn.fit(X_train, movie_ratings_of_user)
  allmovies = np.arange(1,num_movies+1,1)
  unseen_movies = allmovies[mask==False]
  predicted_ratings = []
  for index in unseen_movies:
    target_movie = user_ratings_pivot2.iloc[[index-1]]
    predicted_ratings.append(user_knn.predict(target_movie)[0])
  predicted_ratings = np.array(predicted_ratings)
  indices = np.argsort(predicted_ratings)[::-1]
  dec_order = unseen_movies[indices][:num_recommendations]
  predicted_ratings = predicted_ratings[indices]
  cnt = 0
  for i in dec_order:
    print(map_id_movie[i],round(predicted_ratings[cnt],3))
    cnt += 1


In [18]:
movie_recomendation(4,10)

The evaluation at k = 1 equals to 0.7808988764044944
The evaluation at k = 2 equals to 0.5112359550561798
The evaluation at k = 3 equals to 0.4556803995006242
The evaluation at k = 4 equals to 0.42134831460674155
The evaluation at k = 5 equals to 0.43483146067415723
The evaluation at k = 6 equals to 0.43367665418227214
The evaluation at k = 7 equals to 0.42765420775051594
The evaluation at k = 8 equals to 0.4222261235955056
The evaluation at k = 9 equals to 0.41059786378138446
The evaluation at k = 10 equals to 0.4226966292134832
The evaluation at k = 11 equals to 0.43198068530039924
---------------------Final Best result at K => 9-------------------------------
Sliding Doors (1998) 5.0
Diva (1981) 5.0
Jack and Sarah (1995) 4.889
Boy's Life 2 (1997) 4.889
Wild Bill (1995) 4.889
Falling in Love Again (1980) 4.889
Killing Fields, The (1984) 4.889
Three Colors: Blue (1993) 4.889
Dunston Checks In (1996) 4.889
Big Sleep, The (1946) 4.889
