# Required Imports

In [None]:
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import Image, display
from scipy.sparse.linalg import svds
pd.options.display.max_rows=100

# EDA and data cleaning

In [None]:
# Load the raw ratings file. It is read with header=None, so the columns
# are labelled explicitly right after reading.

rating_columns = ["product_id", "user_id", "rating"]
Luxury_Beauty = pd.read_csv("Luxury_Beauty.csv", header=None)
Luxury_Beauty.columns = rating_columns

In [None]:
# Collapse repeated (product, user) pairs into a single row holding their mean rating

duplicate_keys = ["product_id", "user_id"]
Luxury_Beauty = Luxury_Beauty.groupby(duplicate_keys)["rating"].mean().reset_index()

In [None]:
# Distribution of number of ratings per user (How many users have rated once/twice/etc?)

user_rating_agg = Luxury_Beauty.groupby("user_id").agg({"rating":"count"}).reset_index()

# The last bin is open-ended on purpose: with a finite upper edge, pd.cut turns
# every user above that edge into NaN and they silently vanish from the
# '11+ ratings' bar.
bins = [0,1,2,3,4,5,10,np.inf]
labels = ['1 rating', '2 ratings', '3 ratings', '4 ratings','5 ratings','6-10 ratings','11+ ratings']
user_rating_agg["ratings_bins"] = pd.cut(user_rating_agg.rating,
                                         bins, labels = labels,
                                         include_lowest = False,
                                         right=True)

# observed=False keeps bins that contain no users on the x-axis
ax = user_rating_agg.groupby("ratings_bins", observed=False).agg({"rating":"count"}).plot(kind="bar")
ax.set_ylabel("# users");

In [None]:
# Segmenting into new and old users:
#   new users have fewer than 3 ratings (i.e. 1 or 2), old users have more than 2 (3+).

# Count the rows per user once and reuse the result for both segments
ratings_per_user = Luxury_Beauty["user_id"].value_counts()
old_user_ids = ratings_per_user[ratings_per_user > 2].index
new_user_ids = ratings_per_user[ratings_per_user < 3].index

# isin accepts the Index directly, no need to materialise a list first
new_users_ratings = Luxury_Beauty[Luxury_Beauty["user_id"].isin(new_user_ids)]
old_users_ratings = Luxury_Beauty[Luxury_Beauty["user_id"].isin(old_user_ids)]

In [None]:
# Distribution of number of ratings per user - old users only

user_rating_agg = old_users_ratings.groupby("user_id").agg({"rating":"count"}).reset_index()

# Open-ended last bin: a finite upper edge would map users above it to NaN
# and drop them from the '11+ ratings' bar.
bins = [2,3,4,5,10,np.inf]
labels = ['3 ratings', '4 ratings','5 ratings','6-10 ratings','11+ ratings']
user_rating_agg["ratings_bins"] = pd.cut(user_rating_agg.rating, bins, labels = labels,include_lowest = False, right=True)

# observed=False keeps bins that contain no users on the x-axis
ax = user_rating_agg.groupby("ratings_bins", observed=False).agg({"rating":"count"}).plot(kind="bar")
ax.set_ylabel("# users");

In [None]:
# Sense check: distinct products and users in each segment.
# A cell only renders its last expression, so the four counts are gathered into
# one table instead of four bare expressions (three of which were never shown).
# Values noted in the original cell: old users - 8,308 products / 19,748 users;
# new users - 11,784 products / 396,426 users.
pd.DataFrame(
    {
        "n_products": [old_users_ratings["product_id"].nunique(),
                       new_users_ratings["product_id"].nunique()],
        "n_users": [old_users_ratings["user_id"].nunique(),
                    new_users_ratings["user_id"].nunique()],
    },
    index=["old users", "new users"],
)

## Product segmentation

In [None]:
# Product wise average rating vs number of ratings made

# Grouping to a product level
product_rating_agg = old_users_ratings.groupby("product_id").agg({"rating":["count","mean"]}).reset_index()
# Flatten the two-level column header into "<column>_<statistic>". The underscore
# must be stripped explicitly: a bare strip() only trims whitespace and would
# leave the key column named "product_id_".
product_rating_agg.columns = ['_'.join(col).strip('_') for col in product_rating_agg.columns.values]

# Optional: subsetting for products which have at least 30 ratings (for brevity) - this leaves only 595/8,308 products.
# Left disabled; the plot below uses every product.
#product_rating_agg_subset = product_rating_agg[product_rating_agg["rating_count"]>=30]

# Plotting, with dash-dotted guide lines at the mean of each axis
ax = product_rating_agg.plot("rating_count","rating_mean",kind="scatter", figsize = (15,10))
ax.axvline(product_rating_agg["rating_count"].mean(),color='k', linestyle='dashdot', linewidth=2)
ax.axhline(product_rating_agg["rating_mean"].mean(),color='k', linestyle='dashdot', linewidth=2)
ax.set_title("Products: mean rating vs number of ratings (old users only)");

## User segmentation

In [None]:
# User wise average rating vs number of ratings available

# Grouping to a user level
user_rating_agg = old_users_ratings.groupby("user_id").agg({"rating":["count","mean"]}).reset_index()
# Flatten the two-level column header into "<column>_<statistic>". The underscore
# must be stripped explicitly: a bare strip() only trims whitespace and would
# leave the key column named "user_id_".
user_rating_agg.columns = ['_'.join(col).strip('_') for col in user_rating_agg.columns.values]

# Plotting, with dash-dotted guide lines at the mean of each axis
ax = user_rating_agg.plot("rating_count","rating_mean",kind="scatter", figsize = (15,10))
ax.axvline(user_rating_agg["rating_count"].mean(),color='k', linestyle='dashdot', linewidth=2)
ax.axhline(user_rating_agg["rating_mean"].mean(),color='k', linestyle='dashdot', linewidth=2)
ax.set_title("Users: mean rating vs number of ratings (old users only)");

## Creating the User-Item interaction table

In [None]:
# Reshape the long ratings table into a wide grid: one row per user, one column
# per product. Cells with no rating are filled with 0.

ratings_pivoted = (
    old_users_ratings
    .pivot(index="user_id", columns="product_id", values="rating")
    .fillna(0)
)

In [None]:
# Preview the first rows of the user x product table
ratings_pivoted.head()

# Recommendation systems - Collaborative filtering

## 1. Model-based (Matrix Factorisation using SVD)

In [None]:
# Work on the plain NumPy array behind the pivoted ratings table
ratings_matrix = ratings_pivoted.values

# Centre every user's row on that row's mean. The mean runs over all product
# columns, so the 0 fill values for unrated products are part of it.
user_ratings_mean = ratings_matrix.mean(axis=1)
ratings_matrix_demeaned = ratings_matrix - user_ratings_mean[:, np.newaxis]
ratings_matrix_demeaned

In [None]:
# Truncated SVD of the de-meaned matrix, then rebuild a dense matrix of predictions

n_singular_values = 50
U, sigma, Vt = svds(ratings_matrix_demeaned, k=n_singular_values)

# Expand the 1-D array of singular values into a diagonal matrix for the products below
sigma = np.diag(sigma)

# Low-rank reconstruction with each user's mean added back to their row
all_user_predicted_ratings = U.dot(sigma).dot(Vt) + user_ratings_mean.reshape(-1, 1)

In [None]:
# Wrap the prediction matrix in a DataFrame that carries the same row and
# column labels as the pivoted ratings table

preds_df = pd.DataFrame(
    data=all_user_predicted_ratings,
    index=ratings_pivoted.index,
    columns=ratings_pivoted.columns,
)
preds_df.head()

### Functionalizing recommendation

In [None]:
# Function to show a user's rated products and their highest-predicted unrated products

def recommend_products_1(user_id, num_recommendations=5):
    """Print a user's existing ratings and their top predicted unrated products.

    Both tables are printed and passed to ``download_print_main_image`` so the
    product images are shown alongside them.

    Parameters
    ----------
    user_id
        Row label of the user in ``preds_df``.
    num_recommendations : int, default 5
        Maximum number of products to recommend.

    Returns
    -------
    None
        Everything is printed/displayed; nothing is returned.

    Raises
    ------
    KeyError
        If ``user_id`` has no row in ``preds_df``.
    """
    # Fail with a readable message instead of the bare pandas lookup error
    if user_id not in preds_df.index:
        raise KeyError(f"No predictions available for user_id {user_id!r}")

    # Get and sort the user's predictions. The Series is renamed so the score
    # column is called "predicted_rating" rather than being labelled with the user id.
    sorted_user_predictions = (
        preds_df.loc[user_id]
        .sort_values(ascending=False)
        .rename("predicted_rating")
        .reset_index()
    )

    # Get the user's existing ratings
    user_data = old_users_ratings[old_users_ratings["user_id"] == user_id]

    # Printing existing products and ratings
    print("The user has currently rated:")
    print(user_data)
    download_print_main_image(user_data)

    # Recommend the highest predicted rating products that the user hasn't rated yet.
    not_yet_rated = ~sorted_user_predictions["product_id"].isin(user_data["product_id"])
    recommendations = sorted_user_predictions[not_yet_rated].head(num_recommendations)
    print(f'\nRecommending highest {num_recommendations} predicted ratings products not already rated.')
    print(recommendations)
    download_print_main_image(recommendations)
    return None


In [None]:
# Function to download each product's image, store it and display it

def download_print_main_image(prod_ids):
    """Download and display one image per entry of ``prod_ids["product_id"]``.

    Each image is saved in the working directory as ``img<position>.jpg``
    (so files from an earlier call are overwritten) and then rendered inline
    at a width of 200.

    Parameters
    ----------
    prod_ids
        Object with a ``product_id`` column; the callers in this notebook pass
        a DataFrame.

    Returns
    -------
    None

    Notes
    -----
    A product whose image cannot be downloaded is reported and skipped, so one
    failed request does not abort the remaining products.
    """
    for position, product_id in enumerate(prod_ids["product_id"]):

        # Hacky way of building the image URL. Has worked most of the times so far.
        # str() keeps this working when the ids are not stored as strings.
        url = "http://images.amazon.com/images/P/" + str(product_id) + ".012_SCTZZZZZZZ_.jpg"
        img_name = "img" + str(position) + ".jpg"

        try:
            urllib.request.urlretrieve(url, img_name)
        except OSError as err:  # urllib's URLError / HTTPError are OSError subclasses
            print(f"Could not download image for product {product_id}: {err}")
            continue

        # Displaying the stored image
        display(Image(filename=img_name, width=200))

In [None]:
# Example: up to 3 recommendations for one user
recommend_products_1("A0488385844WNV2OWO9X", num_recommendations=3)

In [None]:
# Example: up to 4 recommendations for another user
recommend_products_1("A0067293ETUPO6WG7DKU", num_recommendations=4)

In [None]:
def coverage(func):
    """Call ``func`` once for every distinct user id in ``old_users_ratings``.

    Whatever ``func`` returns is discarded, and this function itself returns
    nothing.
    """
    distinct_user_ids = old_users_ratings["user_id"].unique()
    for current_user_id in distinct_user_ids:
        func(current_user_id)

In [None]:
# Distinct user ids among the old users - the same values `coverage` iterates over
old_users_ratings["user_id"].unique()