In [1]:
import pandas as pd
import numpy as np
import re
import ast
import matplotlib.pyplot as plt
import numpy as np
from bs4 import BeautifulSoup as bs
import urllib
import json
import cv2
import image_similarity_measures
from sys import argv
from image_similarity_measures.quality_metrics import rmse, ssim, sre, psnr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import math
import unicodedata
from urllib.request import Request, urlopen
import random
from matplotlib.colors import is_color_like

import sys
import warnings
# Silence all warnings for the notebook session, unless the interpreter was
# started with explicit -W options (in which case sys.warnoptions is non-empty
# and the user's own warning configuration is left untouched).
if not sys.warnoptions: warnings.simplefilter("ignore")

In [2]:
def url_to_image(url, timeout=None):
    """Download an image and decode it into an OpenCV colour array.

    A User-Agent header is picked at random from a fixed pool of desktop
    Chrome strings and sent with the request.

    Parameters
    ----------
    url : str
        Address of the image to download.
    timeout : float, optional
        Seconds to wait on the connection, forwarded to ``urlopen``. When
        omitted, ``urlopen`` is called without a timeout argument, exactly as
        before this parameter existed.

    Returns
    -------
    numpy.ndarray or None
        The result of ``cv2.imdecode(..., cv2.IMREAD_COLOR)`` on the downloaded
        bytes. ``cv2.imdecode`` yields ``None`` when the payload cannot be
        decoded as an image, and that ``None`` is passed through to the caller.

    Raises
    ------
    Whatever ``urlopen`` raises for an unreachable or rejected request.
    """
    user_agent_list = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36',
                       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
                       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36',
                       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36',
                       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
                       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36',
                       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
                       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
                       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36',
                       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
                       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
                       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
                       'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36')
    # random.choice keeps working if the pool above grows or shrinks, unlike a
    # hard-coded randint upper bound.
    HEADERS = {'User-Agent': random.choice(user_agent_list)}

    # download the image, convert it to a NumPy array, and then read it into OpenCV format
    req = Request(url, headers=HEADERS)
    open_kwargs = {} if timeout is None else {'timeout': timeout}
    # The context manager closes the HTTP response even if read() fails.
    with urlopen(req, **open_kwargs) as resp:
        payload = resp.read()
    image = np.asarray(bytearray(payload), dtype="uint8")
    image = cv2.imdecode(image, cv2.IMREAD_COLOR)
    return image

def return_desc_similarity(data, attribute='description'):
    """Build a pairwise TF-IDF similarity matrix over one text column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column. Note that the column is modified in
        place: missing values in ``data[attribute]`` are replaced with ``''``
        and written back onto ``data``.
    attribute : str, default 'description'
        Name of the text column to vectorise.

    Returns
    -------
    The square matrix returned by ``linear_kernel`` when the TF-IDF matrix
    (English stop words removed) is compared with itself; entry ``[i, j]`` is
    the similarity between row ``i`` and row ``j`` of ``data``.
    """
    # Write the NaN-free text back first so the vectoriser only sees strings.
    filled_text = data[attribute].fillna('')
    data[attribute] = filled_text

    vectorizer = TfidfVectorizer(stop_words='english')
    doc_term = vectorizer.fit_transform(data[attribute])
    return linear_kernel(doc_term, doc_term)

In [3]:
def _min_max_normalize(series):
    """Scale a numeric Series to [0, 1]; return all zeros when it has no spread."""
    low = series.min()
    span = series.max() - low
    # A zero span (single row, or every score identical) or a NaN span (no
    # valid score at all) would otherwise turn the whole column into NaN.
    if pd.isna(span) or span == 0:
        return pd.Series(0.0, index=series.index)
    return (series - low) / span


# `data` defaults to None only as a placeholder; pass the product DataFrame to get recommendations.
def combined_recommendation(url, data=None, N=4):
    """Recommend up to ``N`` products similar to the product at ``url``.

    Candidates are the rows of ``data`` that share the query product's
    ``low_level`` category, have an ``overallscore`` at least as high, and
    whose ``gender`` matches the query product's or is ``'Unisex'``. Each
    candidate gets an image-similarity score (best of its images against the
    query product's first image) and a text-similarity score (description plus
    display name); both are min-max normalised and summed into
    ``combined_sim``.

    Parameters
    ----------
    url : str
        ``product_url`` of the query product; it must be present in ``data``.
    data : pandas.DataFrame
        Product table. The columns read here are ``product_url``,
        ``low_level``, ``overallscore``, ``gender``, ``image_link_color``,
        ``description`` and ``display_name``.
        NOTE: ``data`` is modified in place, as it always has been:
        ``image_link_color`` entries that are not dicts are parsed into dicts
        (unparseable ones become ``{}``) and missing ``gender`` values become
        ``'Unknown'``.
    N : int, default 4
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``N`` candidate rows (the query product itself excluded), sorted
        by ``combined_sim`` descending, carrying the intermediate similarity
        columns and an ``index`` column with the rows' original index labels.

    Raises
    ------
    TypeError
        If ``data`` is not supplied.
    IndexError
        If ``url`` is not found in ``data`` or the query product has no image
        links.
    """
    if data is None:
        raise TypeError("combined_recommendation() requires a product DataFrame for `data`")

    def convert_dict(x):
        # Turn a stringified mapping into a dict; anything unparseable becomes {}.
        if isinstance(x, dict):
            return x
        try:
            dic = json.loads(x.replace("\'", "\""))
        except Exception:
            # The quote swap breaks on values containing apostrophes; a Python
            # literal parse recovers those before giving up.
            try:
                dic = ast.literal_eval(x)
            except Exception:
                dic = {}
        return dic if isinstance(dic, dict) else {}

    # Normalise every entry rather than trusting the type of the first row only.
    data['image_link_color'] = data['image_link_color'].apply(convert_dict)

    # deal with empty values in the gender column (None as well as NaN)
    data['gender'] = data['gender'].apply(lambda g: 'Unknown' if pd.isna(g) else g)

    # get product info based on input url => use as filter
    target_rows = data[data['product_url'] == url]
    if target_rows.empty:
        raise IndexError("product_url %r not found in data" % (url,))
    cat = target_rows['low_level'].values[0]
    image_link_list = list(target_rows['image_link_color'].values[0].values())
    if not image_link_list:
        raise IndexError("product %r has no image links to compare against" % (url,))
    target_url = image_link_list[0]
    score = target_rows['overallscore'].values[0]
    gender = target_rows['gender'].values[0]

    # get subdf; copy so the new similarity columns are not written to a slice of `data`
    candidate_mask = ((data['low_level'] == cat) & (data['overallscore'] >= score) &
                      ((data['gender'] == gender) | (data['gender'] == 'Unisex')))
    subdf = data[candidate_mask].copy()

    # IMAGE-BASED RECOMMENDATION
    test_img = url_to_image(target_url)
    # cv2.resize expects (width, height); candidates are resized to the query image's size
    dim = (test_img.shape[1], test_img.shape[0])

    def return_img_similarity(image_dic):
        # Score every image of one product against the query image.
        score_list = []
        for image_link in image_dic.values():
            try:
                data_img = url_to_image(image_link)
                resized_img = cv2.resize(data_img, dim, interpolation=cv2.INTER_AREA)

                # four measures
                ssim_score = ssim(test_img, resized_img)
                sre_score = sre(test_img, resized_img)
                rmse_score = rmse(test_img, resized_img)
                psnr_score = psnr(test_img, resized_img)

                # total measurements
                score_list.append(ssim_score + sre_score + psnr_score - rmse_score)
            except Exception:
                # Best effort: an image that cannot be fetched or compared scores 0.
                score_list.append(0)
        return score_list

    subdf['image_similarity'] = subdf['image_link_color'].apply(return_img_similarity)
    # default=0 keeps products without any image from crashing max()
    subdf['image_sim'] = subdf['image_similarity'].apply(lambda scores: max(scores, default=0))
    # Infinite totals are capped at the largest finite total.
    finite_max = subdf.loc[subdf['image_sim'] != np.inf, 'image_sim'].max()
    subdf['image_sim'] = subdf['image_sim'].replace(np.inf, finite_max)
    subdf['image_sim_norm'] = _min_max_normalize(subdf['image_sim'])

    # TEXT-BASED RECOMMENDATION
    subdf = subdf.reset_index()
    idx = subdf.index[subdf['product_url'] == url].values[0]
    # description: row `idx` of the matrix holds the query product's similarity to every row
    desc_sim = return_desc_similarity(data=subdf, attribute='description')
    subdf['desc_similarity'] = desc_sim[idx]

    # title
    title_sim = return_desc_similarity(data=subdf, attribute='display_name')
    subdf['title_similarity'] = title_sim[idx]
    subdf['text_sim'] = subdf['desc_similarity'] + subdf['title_similarity']
    subdf['text_sim_norm'] = _min_max_normalize(subdf['text_sim'])

    # combine text and image similarity
    subdf['combined_sim'] = subdf['image_sim_norm'] + subdf['text_sim_norm']

    # Drop the query product explicitly instead of assuming it sorts first,
    # then keep the N best remaining candidates.
    candidates = subdf[subdf['product_url'] != url]
    res = candidates.sort_values(['combined_sim'], ascending=False).head(N)

    return res

## Improvements:
1. The image-based recommendation step is currently slow; look for ways to speed it up using the libraries already in use.
2. Explore other algorithms that could provide a fast and accurate image-based recommendation system.