In [45]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

import warnings
warnings.filterwarnings('ignore')

In [46]:
dog_df = pd.read_csv("dog_df.csv")
dog_df

Unnamed: 0,dog_id,breed,size,activity,img
0,1,Afghan,3,3,
1,2,Airedale,3,2,
2,3,American Spaniel,2,2,
3,4,Basset,2,2,
4,5,Beagle,2,3,
5,6,Bearded Collie,2,3,
6,7,Bermaise,3,2,
7,8,Bichon Frise,1,2,
8,9,Blenheim,1,2,
9,10,Bloodhound,3,2,


In [47]:
dog_df.drop_duplicates(subset=['dog_id'], keep='first')

Unnamed: 0,dog_id,breed,size,activity,img
0,1,Afghan,3,3,
1,2,Airedale,3,2,
2,3,American Spaniel,2,2,
3,4,Basset,2,2,
4,5,Beagle,2,3,
5,6,Bearded Collie,2,3,
6,7,Bermaise,3,2,
7,8,Bichon Frise,1,2,
8,9,Blenheim,1,2,
9,10,Bloodhound,3,2,


In [48]:
user_df = pd.read_csv("user_df.csv")
user_df

Unnamed: 0,user_id,gender,age,occupation,place
0,1,m,21,1,1
1,2,m,22,1,1
2,3,m,24,2,1
3,4,m,25,2,1
4,5,m,26,2,1
...,...,...,...,...,...
295,296,f,59,5,2
296,297,f,65,5,2
297,298,f,68,5,2
298,299,f,69,5,2


In [79]:
user_df_duplicated = user_df[user_df.duplicated('user_id')]
user_df_duplicated.count()

user_id       0
gender        0
age           0
occupation    0
place         0
dtype: int64

In [50]:
user_df.isna().sum()

user_id       0
gender        0
age           0
occupation    0
place         0
dtype: int64

In [51]:
user_dog_df = pd.read_csv('user_dog_df.csv')
user_dog_df

Unnamed: 0,user_id,dog_id,rating
0,1,1,5
1,1,12,5
2,1,24,5
3,1,5,4
4,1,6,5
...,...,...,...
1550,300,8,5
1551,300,17,4
1552,300,35,5
1553,300,34,5


In [52]:
duplicate =user_dog_df[user_dog_df.duplicated(['user_id', 'dog_id'])]
duplicate.count()

user_id    27
dog_id     27
rating     27
dtype: int64

In [80]:
user_dog_df = user_dog_df.drop_duplicates(subset=['user_id', 'dog_id'], keep='first')

In [84]:
user_dog_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1528 entries, 0 to 1551
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   user_id  1528 non-null   int64
 1   dog_id   1528 non-null   int64
 2   rating   1528 non-null   int64
dtypes: int64(3)
memory usage: 47.8 KB


In [81]:
user_dog_df.isna().sum()

user_id    0
dog_id     0
rating     0
dtype: int64

In [85]:
merged_df = pd.merge(user_dog_df, dog_df, how='inner', on='dog_id')
merged_df

Unnamed: 0,user_id,dog_id,rating,breed,size,activity,img
0,1,1,5,Afghan,3,3,
1,2,1,5,Afghan,3,3,
2,4,1,5,Afghan,3,3,
3,5,1,5,Afghan,3,3,
4,6,1,5,Afghan,3,3,
...,...,...,...,...,...,...,...
1523,295,33,4,Maltese,1,2,
1524,296,33,5,Maltese,1,2,
1525,298,33,4,Maltese,1,2,
1526,299,33,5,Maltese,1,2,


### Training KNN model

Reshaping **user_dog_df** We want the data to be [breed, users] array. 

In [65]:
# pivot and create breed-user matrix
user_breed_df = merged_df.pivot(
    index='breed',
    columns='user_id',
      values='rating').fillna(0)

user_breed_df

user_id,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
breed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghan,5.0,5.0,0.0,5.0,5.0,5.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Airedale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
American Spaniel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Basset,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Beagle,4.0,5.0,5.0,5.0,4.0,5.0,5.0,0.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bearded Collie,5.0,0.0,5.0,0.0,5.0,0.0,4.0,0.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bermaise,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bichon Frise,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,5.0,0.0,5.0,5.0,0.0,5.0,5.0,4.0,5.0
Blenheim,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bloodhound,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
user_breed_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32 entries, Afghan to Yorkie
Columns: 287 entries, 1 to 300
dtypes: float64(287)
memory usage: 72.0+ KB


In [69]:
# transform matrix to scipy sparse matrix
user_breed_sparse_df = csr_matrix(user_breed_df.values)
user_breed_sparse_df

<32x287 sparse matrix of type '<class 'numpy.float64'>'
	with 1528 stored elements in Compressed Sparse Row format>

#### Fitting K-Nearest Neighbours model to the scipy sparse matrix:

In [70]:
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(user_breed_sparse_df)

NearestNeighbors(algorithm='brute', metric='cosine')

In [78]:
# function to find top n similar users of the given input user 
# input to this function is the user and number of top similar users you want.

def get_similar_users(user, n = 5):
    knn_input = user_breed_df.iloc[0,:].values.reshape(1,-1)
    distances, indices = knn_model.kneighbors(knn_input, n_neighbors=n+1)
  
    print("Top",n,"users who are very much similar to the User-",user, "are: ")
    print(" ")
    for i in range(1,len(distances[0])):
        print(i,". User:", indices[0][i]+1, "separated by distance of",distances[0][i])
    return indices.flatten()[1:] + 1, distances.flatten()[1:]

#### Specify User id and Number of similar users we want to consider here

In [76]:
from pprint import pprint
user_id = 7
print(" Few of breeds seen by the User:")
pprint(list(merged_df[merged_df['user_id'] == user_id]['breed'])[:10])
similar_user_list, distance_list = get_similar_users(user_id,5)

 Few of breeds seen by the User:
['Borzoi',
 'Newfoundland',
 'Beagle',
 'Bearded Collie',
 'Border Collie',
 'Shar_Pei',
 'Pug',
 'Yorkie']
Top 5 users who are very much similar to the User- 7 are: 
 
1 . User: 2 separated by distance of 0.16857648563384753
2 . User: 10 separated by distance of 0.2385362777459925
3 . User: 13 separated by distance of 0.6301179604177494
4 . User: 21 separated by distance of 0.6388568592711092
5 . User: 12 separated by distance of 0.7250002401953355
