In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.stats import pearsonr
from random import sample
import random
from sklearn.metrics import jaccard_score
random.seed(24)

In [2]:
class Data:
    '''Thin wrapper around a pandas DataFrame of transaction rows.

    The frame lives in ``self.data``; every method below replaces it with a
    transformed copy, so the methods are meant to be called as pipeline steps.
    '''

    def __init__(self):
        '''init Data class'''
        # Populated by load_data(); stays None until a file has been read.
        self.data = None

    def load_data(self, filename, format='txt'):
        '''loads data from excel, csv, tsv, or txt file

        filename: path handed to the matching pandas reader.
        format:   one of "excel", "csv", "tsv", "txt".
        Raises ValueError for any other format.
        '''
        if format == 'excel':
            self.data = pd.read_excel(filename)
        elif format == 'csv':
            self.data = pd.read_csv(filename)
        elif format == 'tsv':
            self.data = pd.read_csv(filename, sep='\t')
        elif format == 'txt':
            self.data = pd.read_table(filename)
        else:
            raise ValueError('Invalid file format.  Please specify "excel", "csv", "tsv", or "txt".')

    def drop_small_orders(self, order_col='order_number', min_order_size=2):
        '''keep only orders that have at least min_order_size rows in self.data

        Rows are counted per value of order_col, so call drop_duplicates()
        first if the count should mean "distinct rows per order".
        '''
        # Group on (and count) the caller-supplied column rather than a
        # hard-coded `order_number` attribute.
        rows_per_order = self.data.groupby(order_col)[order_col].transform(len)
        self.data = self.data[rows_per_order >= min_order_size]

    def expand_columns(self, columns=None):
        '''performs one-hot encoding on specified columns

        With no columns given nothing is encoded. The empty prefix means the
        new columns are named after the encoded values themselves.
        '''
        # get_dummies treats columns=None as "encode every object column", so an
        # omitted argument must be passed on as an explicit empty list.
        columns = [] if columns is None else list(columns)
        self.data = pd.get_dummies(self.data, columns=columns, prefix="", prefix_sep="")

    def drop_columns(self, columns=None):
        '''drops columns from self.data'''
        columns = [] if columns is None else list(columns)
        # Reassign instead of dropping in place: self.data may be a filtered
        # slice of another frame, and an in-place drop would act on that slice.
        self.data = self.data.drop(columns, axis=1)

    def sum_orders(self, order_col='order_number'):
        '''Collapse self.data to one row per order.

        Each input row holds one product of a basket; grouping on order_col and
        summing the remaining columns puts the whole basket on a single row.
        '''
        # Use this instance's frame, not a notebook-level variable named `data`.
        data_cols = list(self.data.columns)
        data_cols.remove(order_col)
        self.data = self.data.groupby(order_col)[data_cols].sum().reset_index()

    def drop_duplicates(self):
        """Remove fully duplicated rows (the same product listed twice in one basket)."""
        self.data = self.data.drop_duplicates()

In [3]:
# Build the working dataset from the two-week transaction export (read as tab-separated).
data = Data()
data.load_data('Ecorp_data/All Transations - 2 Weeks.txt', format='tsv')
# Drop the l1/l2/sku/brand columns; the cells below only reference order_number and l3.
data.drop_columns(columns = ['l1', 'l2', 'sku', 'brand'])
# Remove rows that became exact duplicates once those columns were dropped.
data.drop_duplicates()

In [4]:
# Keep only orders with at least 10 rows (the filter is `>= min_order_size`).
data.drop_small_orders(min_order_size=10)

In [6]:
# Split at the order level: 80% of the distinct order numbers go to train,
# the remainder to test. The unique ids are computed once and reused.
order_ids = np.unique(data.data['order_number'])
l = order_ids.size  # number of distinct orders
f = int(l * 0.8)  # number of orders assigned to train
indices = sample(range(l), f)  # positions (into order_ids) of the train orders
train_indices = order_ids[indices]
test_indices = np.delete(order_ids, indices)

In [7]:
# Row masks are built from the unsplit frame before data.data is narrowed to train.
is_test_row = data.data['order_number'].isin(test_indices)
is_train_row = data.data['order_number'].isin(train_indices)
test = data.data[is_test_row]
data.data = data.data[is_train_row]

In [8]:
# One-hot encode l3: each distinct l3 value becomes its own column (empty prefix).
data.expand_columns(columns = ['l3'])

In [9]:
# Collapse to one row per order_number by summing the remaining columns.
data.sum_orders()

In [10]:
# For every test order, hold out one randomly chosen l3 value as the target and
# keep the rest of the basket: order_number -> (remaining basket, held-out product).
test_dictionary = {}
for order_id in np.unique(test['order_number']):
    order_items = test.loc[test['order_number'] == order_id, 'l3'].tolist()
    held_out_pos = random.randint(0, len(order_items) - 1)
    held_out_product = order_items[held_out_pos]
    remaining_items = np.delete(order_items, held_out_pos)
    test_dictionary[order_id] = (remaining_items, held_out_product)

In [169]:
class Recommender:
    '''Recommender built on a pairwise similarity matrix.

    Typical use: construct, call create_similarity_matrix() (or assign a
    precomputed frame to ``similarity_matrix``), call
    shopping_complementary_products(), then query complementary_product() or
    recommended_basket().
    '''

    def __init__(self, data, user_col, item_cols, cf_method='item', similarity='pearson'):
        '''init Recommender class

        data:       DataFrame holding user_col plus one column per item.
        user_col:   name of the column identifying the user/order.
        item_cols:  list of item column names.
        cf_method:  "item" or "user".
        similarity: "pearson", "jaccard", "cosine" or "frequency".
        '''
        self.data = data
        self.user_col = user_col
        self.item_cols = item_cols
        self.cf_method = cf_method
        self.similarity = similarity
        self.similarity_matrix = []
        self.user_scores = []
        self.recs = []
        # label -> [(other label, score, rank), ...]; filled by
        # shopping_complementary_products().
        self.complementary_products = {}

    def create_similarity_matrix(self):
        '''creates correlation/similarity matrix for all items and stores result in self.similarity_matrix'''
        self.similarity_matrix = self._create_empty_df(self.cf_method)
        self._fill_similarity_matrix(self.similarity_matrix, self.similarity)

    def _create_empty_df(self, cf_type):
        '''creates and returns empty df with users or items as rows and columns

        Raises ValueError when cf_type is neither "item" nor "user".
        '''
        if cf_type == 'item':
            labels = self.item_cols
        elif cf_type == 'user':
            # Read the column name stored on the instance rather than relying on
            # a notebook-level `user_col` variable happening to exist.
            labels = self.data[self.user_col]
        else:
            raise ValueError('Invalid collaborative filtering technique.  Please specify "item" or "user".')
        return pd.DataFrame(index=labels, columns=labels)

    def _fill_similarity_matrix(self, similarity_matrix, similarity):
        '''calculates the similarity between every pair of vectors and writes it into similarity_matrix

        valid similarity types: pearson, jaccard, cosine, frequency.
        The matrix is filled symmetrically (only the upper triangle is
        computed) and the diagonal holds each vector's similarity with itself.
        '''
        vectors = self.data[self.item_cols].to_numpy()
        if self.cf_method != 'user':
            # Item-based: compare columns (one vector per item).
            # User-based: compare rows (one vector per user).
            vectors = vectors.T
        n_labels = similarity_matrix.shape[0]
        # Iterating through tqdm closes the progress bar when the loop ends.
        for i in tqdm(range(n_labels), mininterval=5):
            for j in range(i, n_labels):
                score = self._get_similarity(vectors[i], vectors[j], similarity)
                similarity_matrix.iloc[i, j] = score
                similarity_matrix.iloc[j, i] = score

    def _get_similarity(self, x, y, similarity):
        '''calculates the specified similarity between two vectors and returns the result

        Raises ValueError for an unknown similarity name.
        '''
        if similarity == 'pearson':
            return self._pearson_similarity(x, y)
        elif similarity == 'jaccard':
            return self._jaccard_similarity(x, y)
        elif similarity == 'cosine':
            return self._cosine_similarity(x, y)
        elif similarity == 'frequency':
            return self._frequency_similarity(x, y)
        else:
            raise ValueError('Invalid similarity type.  Please specify "cosine", "pearson","frequency", or "jaccard".')

    def _pearson_similarity(self, x, y):
        '''returns pearson correlation between x and y: covariance(x,y)/(std_dev(x)*std_dev(y))

        Returns 0 when no position has both x == 1 and y == 1, and when either
        vector is constant (the correlation is undefined there).
        '''
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=float)
        if not np.any((x == 1) & (y == 1)):
            return 0
        if np.all(x == x[0]) or np.all(y == y[0]):
            return 0
        return pearsonr(x, y)[0]

    def _jaccard_similarity(self, x, y):
        '''returns jaccard similarity: |intersection| / |union| of the non-zero positions of x and y (0 if they share none)'''
        x_nonzero = np.asarray(x) != 0
        y_nonzero = np.asarray(y) != 0
        intersection_size = int(np.count_nonzero(x_nonzero & y_nonzero))
        if intersection_size == 0:
            # Also covers an empty union, so the division below is safe.
            return 0
        return intersection_size / int(np.count_nonzero(x_nonzero | y_nonzero))

    def _cosine_similarity(self, x, y):
        '''returns the cosine of the angle between x and y (0 if either vector is all zeros)'''
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=float)
        denominator = np.linalg.norm(x) * np.linalg.norm(y)
        if denominator == 0:
            return 0
        return float(np.dot(x, y) / denominator)

    def _frequency_similarity(self, x, y):
        '''returns the number of positions where both x and y equal 1'''
        x = np.asarray(x)
        y = np.asarray(y)
        return int(np.count_nonzero((x == 1) & (y == 1)))

    def shopping_complementary_products(self, size=5, threshold=0.3):
        '''Build self.complementary_products from self.similarity_matrix.

        For each column label, the other labels are sorted by descending score,
        those with a score strictly above `threshold` are kept, and at most
        `size` of them are stored as (label, score, rank) tuples. Rank counts
        down to 1, so the best match carries the highest rank. Labels with no
        score above the threshold get no entry.

        Assumes the matrix rows are in the same order as its columns.
        '''
        columns = np.asarray(self.similarity_matrix.columns)
        complements = {}
        for position, column_name in enumerate(columns):
            # Remove the label's own entry before ranking.
            labels = np.delete(columns, position)
            scores = np.delete(list(self.similarity_matrix[column_name]), position)
            order = np.argsort(scores)[::-1]
            scores = scores[order]
            labels = labels[order]
            above = np.argwhere(scores > threshold).flatten()
            if len(above) == 0:
                continue
            # Per-label cap: a short list for one label must not shrink the cap
            # used for the labels processed after it.
            top_n = min(size, len(above))
            top_scores = scores[above][0:top_n]
            top_labels = labels[above][0:top_n]
            complements[column_name] = list(zip(top_labels, top_scores, range(top_n, 0, -1)))
        # Replace rather than update, so entries from an earlier run do not linger.
        self.complementary_products = complements

    def complementary_product(self, product=""):
        '''Return the stored (label, score, rank) list for product, or None if it has no entry.'''
        return self.complementary_products.get(product)

    def recommended_basket(self, shopping_basket=None, size=5):
        '''Recommend up to `size` items for a basket.

        Every complement of every basket product that is not itself in the
        basket accumulates a total: its rank, or its raw score when
        self.similarity is "frequency". Items are returned as a tuple ordered
        by descending total; None is returned when nothing can be recommended.
        '''
        basket = [] if shopping_basket is None else list(shopping_basket)
        use_raw_score = self.similarity == 'frequency'
        totals = {}
        for product in basket:
            for item, correlation, rank in self.complementary_product(product) or ():
                if item in basket:
                    continue
                totals[item] = totals.get(item, 0) + (correlation if use_raw_score else rank)
        if not totals:
            return None
        ranked = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
        return tuple(item for item, _ in ranked[:size])
        

        
                
        
        
#         coefficent = jaccard_score(x, y)
#         return coefficent 
        
    
#         covariance_matrix = np.cov(x,y)
#         std_dev_x = np.std(x)
#         std_dev_y = np.std(y)
#         numerator = covariance_matrix[0][1]
#         denominator = std_dev_x*std_dev_y
#         coefficent = numerator/denominator

#         sum_x_y = np.dot(x,y)
#         sum_x = np.sum(x)
#         sum_y = np.sum(y)
#         length = len(x)
#         numerator = (length*sum_x_y) - (sum_x*sum_y)
#         x_squared = x**2
#         y_squared = y**2
#         denominator_1 = np.sum(x_squared)*length - sum_x**2
#         denominator_2 = np.sum(y_squared)*length - sum_y**2
#         denominator = float(denominator_1*denominator_2)**(1/2)
#         coefficent = float(round(numerator/denominator, 3))
#         return coefficent

#     def _jaccard_similarity(self, x, y):
#         '''returns jaccard correlation between x and y: |intsection(x,y)|/|union(x,y)|'''
#         #ideal for binary data, e.g. buy vs non-buy
#         nonzero_x = set(np.nonzero(x)[0])
#         nonzero_y = set(np.nonzero(y)[0])
#         intersection_size = len(nonzero_x.intersection(nonzero_y))
#         union_size = len(nonzero_x.union(nonzero_y))
#         if union_size == 0:
#             return 0
#         else:
#             return intersection_size/union_size

#     def _cosine_similarity(self, x, y):
#         '''returns cosine of angles between x and y'''
#         pass
    

    

In [170]:
# Every column of the prepared frame except the order id is treated as an item column.
user_col = 'order_number'
item_cols = list(data.data.columns)
# list.remove raises ValueError if the order column is missing from the frame.
item_cols.remove(user_col)

In [171]:
# Item-based engine; with similarity='frequency', recommended_basket() sums raw scores instead of ranks.
rec_engine = Recommender(data.data, user_col=user_col, item_cols=item_cols, cf_method='item', similarity='frequency')

In [172]:
# rec_engine.create_similarity_matrix()

In [173]:
# rec_engine.similarity_matrix.to_csv('item_similarity.csv')

In [174]:
# rec_engine.similarity_matrix.to_csv('item_similarity_jaccard.csv')
# Load a previously saved similarity matrix instead of recomputing it (the
# create_similarity_matrix() call above is commented out), so this CSV must
# already exist. index_col=0 uses the first CSV column as the row index.
rec_engine.similarity_matrix = pd.read_csv('item_similarity_frequency.csv', index_col = 0)

In [175]:
rec_engine.similarity_matrix.head()

Unnamed: 0,12 Volt Accessories,12-Point Flange Head Cap Screws,3-Ring Binder Accessories,3-Ring Binders,3.3 Inch Diameter Motors,4.4 Inch Diameter Motors,5 X 20mm Glass and Ceramic Fuses,A/C Conduit Kits,A/C Mounting Pads,A/C Refrigeration Accessories,...,Workbench Pedestals,Workbench Tops,Workstands,Worm Gear Clamps,Wrap-a-Round Tape Measures,Wrist Rests and Palm Supports,Wrist Supports and Wraps,Y Strainers,Yard Hydrants,pH Meters
12 Volt Accessories,8,0,0,0,1,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
12-Point Flange Head Cap Screws,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3-Ring Binder Accessories,0,0,1,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3-Ring Binders,0,0,1,11,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3.3 Inch Diameter Motors,1,0,0,0,15,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [176]:
# Fill rec_engine.complementary_products from the loaded matrix, using the method's default size of 5.
rec_engine.shopping_complementary_products()

In [185]:
####examples

In [177]:
rec_engine.complementary_product(product = '3-Ring Binders')

[('Pens', 9, 5),
 ('Markers', 8, 4),
 ('Sticky Notes and Flags', 7, 3),
 ('Binder Clips', 6, 2),
 ('Pencils', 6, 1)]

In [190]:
rec_engine.complementary_product(product = '5 X 20mm Glass and Ceramic Fuses')

[('American Glass and Ceramic Fuses', 35, 5),
 ('Cable Ties', 31, 4),
 ('Hex Head Cap Screws', 30, 3),
 ('Flat Washers', 28, 2),
 ('Jobber Length Drill Bits', 24, 1)]

In [191]:
rec_engine.complementary_product(product = 'Wrist Supports and Wraps')

[('Safety Glasses', 34, 5),
 ('Coated Gloves', 19, 4),
 ('Leather Work Gloves', 16, 3),
 ('Cut-Resistant Sleeves', 15, 2),
 ('Cut-Resistant Gloves', 15, 1)]

In [178]:
rec_engine.complementary_product(product = 'Disposable Gloves')

[('Safety Glasses', 997, 5),
 ('Standard Batteries', 732, 4),
 ('Ear Plugs', 667, 3),
 ('Trash Bags', 597, 2),
 ('Paper Towels, Rolls', 574, 1)]

In [184]:
rec_engine.complementary_product(product = 'Disinfectants and Sanitizers')

[('Paper Towels, Rolls', 376, 5),
 ('Toilet Paper', 359, 4),
 ('Trash Bags', 358, 3),
 ('Disposable Gloves', 260, 2),
 ('Wet Mops', 172, 1)]

In [182]:
rec_engine.recommended_basket(['Disposable Gloves','Safety Glasses','Ear Plugs','Standard Batteries'])

('Cut-Resistant Gloves',
 'Coated Gloves',
 'Trash Bags',
 'Paper Towels, Rolls',
 'Leather Work Gloves')

In [192]:
import ipywidgets as widgets

In [193]:
data = pd.read_csv('Ecorp_data/All Transations - 2 Weeks.txt', sep='\t')

In [203]:
unique_products = np.unique(data['l3'])
unique_products = np.insert(unique_products, 0, 'nothing', axis=0)

In [214]:
# Dropdown over every product label; the first option ('nothing') means no selection.
dropdown_product_compliment = widgets.Dropdown(
    options=unique_products,
    description='Products:',
    disabled=False)

# Products already shown, so selecting the same product again prints nothing new.
value = []

def dropdown_product_compliment_eventhandler(change):
    '''Print the stored complements for a newly selected product.

    change: the change notification passed by the widget; change.new is the
    newly selected option.
    '''
    selected = str(change.new)
    if change.new == 'nothing' or selected in value:
        return
    if selected not in unique_products:
        return
    value.append(selected)
    print(selected)
    print('Rec engine recommends')
    print(rec_engine.complementary_product(product = selected))

# Observe only the `value` trait. Without `names`, the handler fires for every
# trait change (index, label, ...), which previously had to be filtered out by
# inspecting the text of change.new and wrongly skipped any product whose name
# contains "index".
dropdown_product_compliment.observe(dropdown_product_compliment_eventhandler, names='value')

You can experiment with complementary products using the dropdown below.

In [216]:
dropdown_product_compliment

Dropdown(description='Products:', index=1, options=('nothing', '12 Volt Accessories', '12-Point Flange Head Ca…

Badminton
Rec engine recommends
None
Baking Cups
Rec engine recommends
[('Kitchenware Tumblers and Shakers', 1, 5), ('Glass Cleaners', 1, 4), ('Hand Sanitizer, Lotion, and Soap Dispensers', 1, 3), ('Aluminum Foil and Film Rolls', 1, 2), ('Serving Utensils', 1, 1)]
Battery Accessories
Rec engine recommends
[('Flare Nut Wrenches', 5, 5), ('Socket Adapters', 5, 4), ('Socket Extensions', 5, 3), ('Sockets', 5, 2), ('Cable and Wire Cutters', 4, 1)]


In [223]:
# Dropdown used to assemble a basket; the first option ('nothing') means no selection.
dropdown_product_compliment = widgets.Dropdown(
    options=unique_products,
    description='Products:',
    disabled=False)

# The basket being assembled: each selected product is added once.
value = []

def dropdown_product_compliment_eventhandler(change):
    '''Add a newly selected product to the basket list `value` and echo it.

    change: the change notification passed by the widget; change.new is the
    newly selected option.
    '''
    selected = str(change.new)
    if change.new == 'nothing' or selected in value:
        return
    if selected not in unique_products:
        return
    value.append(selected)
    print(selected)

# Observe only the `value` trait. Without `names`, the handler fires for every
# trait change (index, label, ...), which previously had to be filtered out by
# inspecting the text of change.new and wrongly skipped any product whose name
# contains "index".
dropdown_product_compliment.observe(dropdown_product_compliment_eventhandler, names='value')

Basket Analysis

In [229]:
dropdown_product_compliment

Dropdown(description='Products:', options=('nothing', '12 Volt Accessories', '12-Point Flange Head Cap Screws'…

3-Ring Binder Accessories
3D Printing Materials
50 Hz Motors


In [231]:
button = widgets.Button(description="Recommend for basket")
output = widgets.Output()


def on_button_clicked(b):
    '''Print the engine's recommendation for the current basket (`value`) into the output widget.'''
    with output:
        print("Based on Basket, rec engine recommends")
        print(rec_engine.recommended_basket(value))


# Register the click handler, then render the button with its output area beneath it.
button.on_click(on_button_clicked)
display(button, output)

Button(description='Recommend for basket', style=ButtonStyle())

Output()