In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics.pairwise import cosine_similarity

In [4]:
testset = pd.read_csv('Recommendation_dataset.csv',header= None)
testset.columns = ['user_id', 'product_id', 'ratings', 'timestamp'] 
testset = testset.drop('timestamp', axis=1)
testset_copy = testset.copy(deep=True)

In [5]:
testset.head()

Unnamed: 0,user_id,product_id,ratings
0,AKM1MP6P0OYPR,132793040,5.0
1,A2CX7LUOHB2NDG,321732944,5.0
2,A2NWSAGRHCP8N5,439886341,1.0
3,A2WNBOD3WNDNKT,439886341,3.0
4,A1GI0U4ZRJA8WN,439886341,1.0


In [7]:
rows, columns = testset.shape[0], testset.shape[1]
print("No of rows in testset: ", rows) 
print("No of columns in testset: ", columns)

No of rows in testset:  7824482
No of columns in testset:  3


In [8]:
testset.dtypes

user_id        object
product_id     object
ratings       float64
dtype: object

In [9]:
testset.describe()

Unnamed: 0,ratings
count,7824482.0
mean,4.012337
std,1.38091
min,1.0
25%,3.0
50%,5.0
75%,5.0
max,5.0


In [10]:
testset.isna().sum()

user_id       0
product_id    0
ratings       0
dtype: int64

In [11]:
print('Number of unique users in dataset = ', testset['user_id'].nunique())
print('Number of unique items in dataset = ', testset['product_id'].nunique())

Number of unique users in dataset =  4201696
Number of unique items in dataset =  476002


In [12]:
most_rated = testset.groupby('user_id').size().sort_values(ascending=False)[:10]
most_rated

user_id
A5JLAU2ARJ0BO     520
ADLVFFE4VBT8      501
A3OXHLG6DIBRW8    498
A6FIAB28IS79      431
A680RUE1FDO8B     406
A1ODOGXEYECQQ8    380
A36K2N527TXXJN    314
A2AY4YUOX2N1BQ    311
AWPODHOB4GFWL     308
A25C2M3QF9G7OQ    296
dtype: int64

In [13]:
counts = testset['user_id'].value_counts()
testset_final = testset[testset['user_id'].isin(counts[counts >= 50].index)]

In [14]:
print('The number of observations in the final data =', len(testset_final))

The number of observations in the final data = 125871


In [17]:
final_rating_matrix = testset_final.pivot(index = 'user_id', columns ='product_id', values = 'ratings').fillna(0)

In [18]:
print('Shape of final_ratings_matrix: ', final_rating_matrix.shape)

Shape of final_ratings_matrix:  (1540, 48190)


In [19]:
given_num_of_rating = np.count_nonzero(final_rating_matrix)
print('given_num_of_rating = ', given_num_of_rating)
possible_num_of_rating = final_rating_matrix.shape[0] * final_rating_matrix.shape[1]
print('possible_num_of_rating = ', possible_num_of_rating)
density = (given_num_of_rating/possible_num_of_rating)
density *= 100
print ('density: {:4.2f}%'.format(density))
final_rating_matrix.head()

given_num_of_rating =  125871
possible_num_of_rating =  74212600
density: 0.17%


product_id,0594451647,0594481813,0970407998,0972683275,1400501466,1400501520,1400501776,1400532620,1400532655,140053271X,...,B00L5YZCCG,B00L8I6SFY,B00L8QCVL6,B00LA6T0LS,B00LBZ1Z7K,B00LED02VY,B00LGN7Y3G,B00LGQ6HL8,B00LI4ZZO8,B00LKG1MC8
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A100UD67AHFODS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A100WO06OQR8BQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A105S56ODHGJEK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A105TOJ6LTVMBG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10AFVU66A79Y1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
average_rating = testset_final.groupby(['product_id']).mean('ratings')
print(average_rating.head())

             ratings
product_id          
0594451647  5.000000
0594481813  3.000000
0970407998  2.500000
0972683275  4.750000
1400501466  3.333333


In [36]:
count_rating = testset_final.groupby(['product_id']).count().ratings
final_rating = pd.DataFrame(pd.concat([average_rating,count_rating], axis = 1))
final_rating.columns=["Average Rating", "Rating Count"]
final_rating = final_rating.sort_values(by='Average Rating', ascending=False)
final_rating.head()

Unnamed: 0_level_0,Average Rating,Rating Count
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0594451647,5.0,1
B003RRY9RS,5.0,1
B003RR95Q8,5.0,1
B003RIPMZU,5.0,1
B003RFRNYQ,5.0,2


In [41]:
def top_n_product(final_rating, n, min_interaction):
    recommendation = final_rating[final_rating['Rating Count'] >= min_interaction]    
    recommendation = recommendation.sort_values(by='Average Rating', ascending=False)
    return recommendation.index[:n]

In [42]:
list(top_n_product(final_rating, 5, 50))

['B001TH7GUU', 'B003ES5ZUU', 'B0019EHU8G', 'B006W8U2MU', 'B000QUUFRW']

In [43]:
final_rating_matrix['user_index'] = np.arange(0, final_rating_matrix.shape[0])
final_rating_matrix.set_index(['user_index'], inplace=True)
final_rating_matrix.head()

product_id,0594451647,0594481813,0970407998,0972683275,1400501466,1400501520,1400501776,1400532620,1400532655,140053271X,...,B00L5YZCCG,B00L8I6SFY,B00L8QCVL6,B00LA6T0LS,B00LBZ1Z7K,B00LED02VY,B00LGN7Y3G,B00LGQ6HL8,B00LI4ZZO8,B00LKG1MC8
user_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
def similar_users(user_index, interactions_matrix):
    similarity = []
    for user in range(0, interactions_matrix.shape[0]):
        sim = cosine_similarity([interactions_matrix.loc[user_index]], [interactions_matrix.loc[user]])
        similarity.append((user, sim))   
    similarity.sort(key=lambda x: x[1], reverse=True)
    most_similar_users = [Tuple[0] for Tuple in similarity] 
    similarity_score = [Tuple[1] for Tuple in similarity]    
    most_similar_users.remove(user_index)
    similarity_score.remove(similarity_score[0])
    return most_similar_users, similarity_score

In [46]:
similar = similar_users(3, final_rating_matrix)[0][0:10]
similar

[320, 12, 793, 261, 156, 1493, 1250, 567, 753, 1360]

In [47]:
def recommendations(user_index, num_of_products, interactions_matrix):
    most_similar_users = similar_users(user_index, interactions_matrix)[0]
    prod_ids = set(list(interactions_matrix.columns[np.where(interactions_matrix.loc[user_index] > 0)]))
    recommendations = []
    observed_interactions = prod_ids.copy()
    for similar_user in most_similar_users:
        if len(recommendations) < num_of_products:
            similar_user_prod_ids = set(list(interactions_matrix.columns[np.where(interactions_matrix.loc[similar_user] > 0)]))
            recommendations.extend(list(similar_user_prod_ids.difference(observed_interactions)))
            observed_interactions = observed_interactions.union(similar_user_prod_ids)
        else:
            break
    
    return recommendations[:num_of_products]

In [52]:
recommendations(5, 5, final_rating_matrix)

['B00CL8F98W', 'B003FMVPFY', 'B002VS4V0Y', 'B00HFAEBWG', 'B00BQ4F9ZA']

In [51]:
recommendations(1400, 5, final_rating_matrix)

['B00007E7K1', 'B0020HRCB6', 'B0043D2L70', 'B001SVJUPG', 'B004603DTA']