# Recommendation system training using k nearest neighbour

In [2]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process

movies = "../data/movie_lens_small/movies.csv"
ratings = "../data/movie_lens_small/ratings.csv"

df_movies = pd.read_csv(movies, usecols=["movieId", "title"], dtype={"movieId":"int32", "title":"str"})
df_ratings = pd.read_csv(ratings, usecols=["userId", "movieId", "rating"], dtype={"userId":"int32", "movieId":"int32", "rating":"float32"})
df_ratings.head()



Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [3]:
## Spare Matrix
        
#         Users
#        [4,4,5] A
#movies  [3,3,4] B == Cos(A,B) => 0.95
#        [3,2,1]

movies_users = df_ratings.pivot(index="movieId", columns="userId", values="rating").fillna(0)
mat_movies_users = csr_matrix(movies_users.values)


In [4]:
print(movies_users)

userId   1    2    3    4    5    6    7    8    9    10   ...  601  602  603  \
movieId                                                    ...                  
1        4.0  0.0  0.0  0.0  4.0  0.0  4.5  0.0  0.0  0.0  ...  4.0  0.0  4.0   
2        0.0  0.0  0.0  0.0  0.0  4.0  0.0  4.0  0.0  0.0  ...  0.0  4.0  0.0   
3        4.0  0.0  0.0  0.0  0.0  5.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
4        0.0  0.0  0.0  0.0  0.0  3.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
5        0.0  0.0  0.0  0.0  0.0  5.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
...      ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
193581   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
193583   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
193585   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
193587   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
193609   0.0  0.0  0.0  0.0 

In [60]:
mat_movies_users.shape, df_movies.shape

((9724, 610), (9742, 2))

In [32]:
movies_users.shape

(9724, 610)

In [5]:
# Euclidean Distance
# Manhattan Distance
# Minkowski Distance
# Cosine Similarity

model_knn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=20)

In [6]:
model_knn.fit(mat_movies_users)

In [7]:
df_movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [8]:
df_movies[df_movies["title"] == "Shawshank Redemption, The (1994)"]

Unnamed: 0,movieId,title
277,318,"Shawshank Redemption, The (1994)"


In [25]:
movie_name = "aaaa"
print(process.extractOne(movie_name, df_movies["title"]))

('Returner (Ritaanaa) (2002)', 68, 4619)


In [10]:
model_knn.fit(mat_movies_users) # This is how we train the mode

In [62]:
# Recommender(movie_name) => List of Movies recommended 

def recommender(movie_name, data, model, n_recommendations) :
    idx = process.extractOne(movie_name, df_movies["title"])[2] # This gives the index to the data frame by finding the title that matches
    print("Movie Selected:" ,df_movies["title"][idx], "Index: ",idx )
    print("Searching for recommendations.....")
    distances, indices=model.kneighbors(data[idx], n_neighbors=n_recommendations) # This does the predictions
    for i in indices:
        print(df_movies["title"][i].where(i!=idx))

recommender("batman",mat_movies_users,model_knn,20)


Movie Selected: Batman Forever (1995) Index:  126
Searching for recommendations.....
126                                                  NaN
509                                        Batman (1989)
337                                     True Lies (1994)
302                    Ace Ventura: Pet Detective (1994)
378                                   Cliffhanger (1993)
508                            Dances with Wolves (1990)
138                    Die Hard: With a Vengeance (1995)
275                                      Stargate (1994)
506                                       Aladdin (1992)
307                      Clear and Present Danger (1994)
217    Interview with the Vampire: The Vampire Chroni...
9                                       GoldenEye (1995)
123                                     Apollo 13 (1995)
418                                 Jurassic Park (1993)
287                        Star Trek: Generations (1994)
398                                 Fugitive, The (1993)
197

In [59]:
def recommender(movie_name, data, model, n_recommendations) :
    idx = process.extractOne(movie_name, df_movies["title"])[2] # This gives the index to the data frame by finding the title that matches
    print("Movie Selected:" ,df_movies["title"][idx], "Index: ",idx )
    print("Searching for recommendations.....")
    distances, indices=model.kneighbors(data[idx], n_neighbors=n_recommendations) # This does the predictions
    for i in indices:
        for j in i:
            if j != idx:
                print(df_movies["title"][j])
recommender("batman",mat_movies_users,model_knn,20)

Movie Selected: Batman Forever (1995) Index:  126
Searching for recommendations.....
Beauty of the Day (Belle de jour) (1967)
Bridge on the River Kwai, The (1957)
Town & Country (2001)
Devil's Backbone, The (Espinazo del diablo, El) (2001)
Scrooged (1988)
Waco: The Rules of Engagement (1997)
Hunchback of Notre Dame, The (1939)
To Be or Not to Be (1942)
Bonfire of the Vanities (1990)
Draughtsman's Contract, The (1982)
Giant (1956)
Waking Ned Devine (a.k.a. Waking Ned) (1998)
Only the Strong (1993)
Sweet Bird of Youth (1962)
Cimarron (1931)
Crime and Punishment in Suburbia (2000)
Dolce Vita, La (1960)
Ilsa, She Wolf of the SS (1974)
Norma Rae (1979)
Memories (Memorîzu) (1995)


In [23]:
idx = process.extractOne(movie_name, df_movies["title"])[2]

In [15]:
df_merged = pd.merge(df_movies, df_ratings, on="movieId")
df_merged

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5
...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),184,4.0
100832,193583,No Game No Life: Zero (2017),184,3.5
100833,193585,Flint (2017),184,3.5
100834,193587,Bungo Stray Dogs: Dead Apple (2018),184,3.5


In [14]:
print(df_movies[df_movies["title"] == "Shawshank Redemption, The (1994)"])
print(df_merged[df_merged["title"] == "Shawshank Redemption, The (1994)"]["rating"].sum())
movies_users[277].sum(), movies_users[318].sum()

     movieId                             title
277      318  Shawshank Redemption, The (1994)
1404.0


(100.0, 3301.5)

In [16]:
print(mat_movies_users[277].sum())

1404.0


In [19]:
print(movies_users[318])

movieId
1         0.0
2         3.5
3         0.0
4         0.0
5         0.0
         ... 
193581    0.0
193583    0.0
193585    0.0
193587    0.0
193609    0.0
Name: 318, Length: 9724, dtype: float32


In [40]:
def test(title = None, movieid = None):
    if title:
        match = df_movies[df_movies["title"] == "Toy Story (1995)"]
    if movieid:
        match = df_movies[df_movies["movieId"] == movieid]
        print("Match:", match["title"].values[0])

    idx = match.index[0]
    id = match["movieId"].values[0]
    print(f"{idx = }\n{id = }\n")

    dataframe = df_ratings[df_ratings["movieId"] == id]["rating"].sum() # all movieId's ratings summed
    matrix = mat_movies_users[idx].sum() # but using the sparse matrix, the vector is foind at idx

    print("Sum of all ratings:")
    print(f"{dataframe = }")
    print(f"{matrix = }")

    print(f"\nSame vector: {dataframe == matrix}")


test(movieid = 1)

Match: Toy Story (1995)
idx = 0
id = 1

Sum of all ratings:
dataframe = 843.0
matrix = 843.0

Same vector: True
