# Importing the Libraries

In [2]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import fuzz

# Loading the Dataset

In [4]:
df = pd.read_json('Magazine_Subscriptions.json', lines=True)
df.head()

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,style,image
0,5,9.0,False,"11 8, 2001",AH2IFH762VY5U,B00005N7P0,ted sedlmayr,"for computer enthusiast, MaxPC is a welcome si...","AVID READER SINCE ""boot"" WAS THE NAME",1005177600,,
1,5,9.0,False,"10 31, 2001",AOSFI0JEYU4XM,B00005N7P0,Amazon Customer,Thank god this is not a Ziff Davis publication...,The straight scoop,1004486400,,
2,3,14.0,False,"03 24, 2007",A3JPFWKS83R49V,B00005N7OJ,Bryan Carey,Antiques Magazine is a publication made for an...,"Antiques Magazine is Good, but not for Everyone",1174694400,{'Format:': ' Print Magazine'},
3,5,13.0,False,"11 10, 2006",A19FKU6JZQ2ECJ,B00005N7OJ,Patricia L. Porada,This beautiful magazine is in itself a work of...,THE DISCERNING READER,1163116800,{'Format:': ' Print Magazine'},
4,5,,True,"07 14, 2014",A25MDGOMZ2GALN,B00005N7P0,Alvey,A great read every issue.,Five Stars,1405296000,,


In [5]:
df = df.loc[:, ['reviewerID', 'asin', 'overall']]

In [6]:
df = df.dropna(how='any', axis=0)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 89689 entries, 0 to 89688
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   reviewerID  89689 non-null  object
 1   asin        89689 non-null  object
 2   overall     89689 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 2.7+ MB


In [8]:
num_users = len(df.reviewerID.unique())
num_items = len(df.asin.unique())
print('There are {} unique reviewers and {} unique magazines in this dataset'.format(num_users, num_items))

There are 72098 unique reviewers and 2428 unique magazines in this data set


# Building Item-Based Collaborative Filtering Recommender System With Cosine Similarity

## Data Preprocessing

In [9]:
df.drop_duplicates(ignore_index=True, inplace=True)

In [10]:
le = LabelEncoder()
df['productid'] = le.fit_transform(df['asin'])

In [11]:
asin_user_mat = df.pivot_table(index='productid', columns='reviewerID', values='overall').fillna(0)

In [12]:
asin_user_mat_sparse = csr_matrix(asin_user_mat.values)

In [13]:
asin_to_idx = {
    asin: i for i, asin in 
    enumerate(list(df.set_index('productid').loc[asin_user_mat.index].asin))
}

## Model Training

In [14]:
def fuzzy_matching(mapper, fav_magazine, verbose=True):
    match_tuple = []

    for asin, idx in mapper.items():
        ratio = fuzz.ratio(asin.lower(), fav_magazine.lower())
        if ratio >= 60:
            match_tuple.append((asin, idx, ratio))

    match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
    if not match_tuple:
        print('No match is found')
        return
    if verbose:
        print('Found possible matches in our database: {0}\n'.format([x[0] for x in match_tuple]))
    return match_tuple[0][1]

In [15]:
model_knn = NearestNeighbors( n_neighbors=20,  algorithm='brute', metric='cosine', n_jobs=-1)
model_knn.fit(asin_user_mat_sparse)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

In [16]:
favourite_magazine = 'B00005N7P0' 
model_knn.fit(asin_user_mat_sparse)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

## Making Recommendations

In [17]:
idx = fuzzy_matching(asin_to_idx, favourite_magazine, verbose=False)
distances, indices = model_knn.kneighbors(asin_user_mat_sparse[idx], n_neighbors=6)
raw_recommends = sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
reverse_mapper = {v: k for k, v in asin_to_idx.items()}

In [18]:
print('\nInput Magazine:', favourite_magazine)
print('\nRecommendations for users who also browsed {}:\n'.format(favourite_magazine))
for i, (idx, dist) in enumerate(raw_recommends):
  print('{0}: {1}, with cosine distance of {2}'.format(i+1, df[df['productid'] == idx]['asin'].values[0], dist))


Input Magazine: B00005N7P0

Recommendations for users who also browsed B00005N7P0:

1: B00005N7UC, with cosine distance of 0.971007245181458
2: B00005Q7E7, with cosine distance of 0.9629139658390499
3: B00005UQ65, with cosine distance of 0.9619186392981828
4: B0032KHQXO, with cosine distance of 0.9341047244023177
5: B000F2BVK6, with cosine distance of 0.898123275037932
