In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors # KNN Clustering 
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import csr_matrix # Compressed Sparse Row matrix
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from statistics import mean
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
transactions = pd.read_csv('transactions.csv')
transactions.drop(columns = ['Unnamed: 0'],inplace=True)
transactions

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Month,Day,Hour,Sales,Frequency,total_product_purchases,category,sub_category,full_categ
0,536365,85123a,white hanging heart tlight holder,6.0,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,12,1,8,15.30,35,122.0,heart,holder,holder heart
1,536373,85123a,white hanging heart tlight holder,6.0,2010-12-01 09:02:00,2.55,17850.0,United Kingdom,12,1,9,15.30,35,122.0,heart,holder,holder heart
2,536375,85123a,white hanging heart tlight holder,6.0,2010-12-01 09:32:00,2.55,17850.0,United Kingdom,12,1,9,15.30,35,122.0,heart,holder,holder heart
3,536396,85123a,white hanging heart tlight holder,6.0,2010-12-01 10:51:00,2.55,17850.0,United Kingdom,12,1,10,15.30,35,122.0,heart,holder,holder heart
4,536406,85123a,white hanging heart tlight holder,8.0,2010-12-01 11:33:00,2.55,17850.0,United Kingdom,12,1,11,20.40,35,122.0,heart,holder,holder heart
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
525624,581578,22993,set of 4 pantry jelly moulds,12.0,2011-12-09 12:16:00,1.25,12713.0,Germany,12,9,12,15.00,1,12.0,pantry,other,other pantry
525625,581578,22907,pack of 20 napkins pantry design,12.0,2011-12-09 12:16:00,0.85,12713.0,Germany,12,9,12,10.20,1,12.0,pack,pantry,pantry pack
525626,581578,22908,pack of 20 napkins red apples,12.0,2011-12-09 12:16:00,0.85,12713.0,Germany,12,9,12,10.20,1,12.0,pack,napkins,napkins pack
525627,581578,23215,jingle bell heart antique silver,12.0,2011-12-09 12:16:00,2.08,12713.0,Germany,12,9,12,24.96,1,12.0,heart,silver,silver heart


## Description + Category based Recommender System

In [3]:
def get_simple_recomendation_full(product,give_recs):
    try:
        product = re.sub('[^a-zA-Z0-9 ]', '', product).lower()
        col = ['Description','full_categ','total_product_purchases']

        content_df = transactions[col]
        content_df = content_df.dropna(axis=0)
        content_df = content_df.drop_duplicates(['Description'])
        #content_df = content_df[wine1['points'] > 85]

        content_pivot = content_df.pivot(index= 'Description',columns='full_categ',values='total_product_purchases').fillna(0)

        content_pivot_matrix = csr_matrix(content_pivot)

        knn = NearestNeighbors(n_neighbors=100, algorithm= 'auto', metric= 'cosine')
        model_knn = knn.fit(content_pivot_matrix)

        distance, indice = model_knn.kneighbors(content_pivot.loc[product].values.reshape(1,-1), n_neighbors=11)
        clients_antecedent = list(transactions[transactions['Description']==product]['CustomerID'].unique())    

        intersects =  []
        recomendations = []
        
        if give_recs == True:
            for i in range(0, len(distance.flatten())):
                if  i == 0:
                    print(f'Recommendation for {product}:')
                else:
                    clients_consequent = list(transactions[transactions['Description']==content_pivot.index[indice.flatten()[i]]]['CustomerID'].unique())
                    intersect = set(clients_antecedent).intersection(clients_consequent)
                    intersect_percent = round((len(intersect)/len(clients_antecedent)) *100,2)
                    print(f'{i}: {content_pivot.index[indice.flatten()[i]]}')
                    intersects.append(intersect_percent)
                    recomendations.append(content_pivot.index[indice.flatten()[i]])
            print('\n')
            print(f'Hit Rate: {mean(intersects)}')
        else:
            for i in range(0, len(distance.flatten())):
                if  i != 0:
                    clients_consequent = list(transactions[transactions['Description']==content_pivot.index[indice.flatten()[i]]]['CustomerID'].unique())
                    intersect = set(clients_antecedent).intersection(clients_consequent)
                    intersect_percent = round((len(intersect)/len(clients_antecedent)) *100,2)
                    intersects.append(intersect_percent)
                    recomendations.append(content_pivot.index[indice.flatten()[i]])
            

            return mean(intersects)
    except:
        return 0

In [4]:
products_list = list(transactions['Description'].unique())
global_hit_rate = []

for product in products_list:
    rate = get_simple_recomendation_full(product,give_recs=False)
    global_hit_rate.append(rate)
mean(global_hit_rate)

11.52774926340601

In [5]:
get_simple_recomendation_full('red hanging heart tlight holder',give_recs=True)

Recommendation for red hanging heart tlight holder:
1: heart tlight holder
2: folk art metal heart tlight holder
3: single heart zinc tlight holder
4: zinc  heart tlight holder
5: heart trellis triple tlight holder
6: hanging heart jar tlight holder
7: red hanging heart tlight holder
8: zinc heart lattice tlight holder
9: hanging heart zinc tlight holder
10: tlight holder silver heart handle


Hit Rate: 17.697


## Description + price + nºPurchases + month based Recommender System

In [6]:
def get_rec_CountVectorizer_full(product,give_recs):
    try:

        product = re.sub('[^a-zA-Z0-9 ]', '', product).lower()
        #select columns to be used
        features=['Description','UnitPrice','total_product_purchases','Month']
        df=transactions[features]

        #transform total_product_purchases values to the median number of purchases of a given product
        purchases_median = df.groupby('Description')['total_product_purchases'].median().to_frame()
        month_median = df.groupby('Description')['Month'].median().to_frame()
        df.drop(columns='total_product_purchases',inplace=True)
        df.drop(columns='Month',inplace=True)
        df = pd.merge(df,purchases_median, on='Description')
        df = pd.merge(df,month_median, on='Description')
        df.drop_duplicates(subset= 'Description',inplace=True)

        df = df.astype(str)

        def clean_data(x):
                return str.lower(x.replace(" ", ""))

        df['original_description'] = df['Description']
        for feature in features:
            df[feature] = df[feature].apply(clean_data)

        #join every column into one single concatenated word
        def create_soup(x):
            return x['Description']+ ' ' + x['UnitPrice'] + ' ' + 'total_product_purchases'+ ' ' + x['Month'] 
        df['soup'] = df.apply(create_soup, axis=1)
        count = CountVectorizer(stop_words='english')
        count_matrix = count.fit_transform(df['soup'])

        #calculate the cosine distances between the words
        cosine_sim = cosine_similarity(count_matrix, count_matrix)

        df=df.reset_index()
        indices = pd.Series(df.index, index=df['Description'])

        product_striped= product.replace(' ','').lower()
        idx = indices[product_striped]

        # Get the pairwsie similarity scores of all products with the given product
        sim_scores = list(enumerate(cosine_sim[idx]))

        # Sort the products based on the similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        # Get the scores of the 10 most similar products
        sim_scores = sim_scores[1:11]

        # Get the products indices
        product_indices = [i[0] for i in sim_scores]
        product_indices
        clients_antecedent = list(transactions[transactions['Description']==product]['CustomerID'].unique())    

        i=1
        intersects =  []
        recomendations = []
        for rec in  list(df['original_description'].iloc[product_indices]):
            clients_consequent = list(transactions[transactions['Description']==rec]['CustomerID'].unique())
            intersect = set(clients_antecedent).intersection(clients_consequent)
            intersect_percent = round((len(intersect)/len(clients_antecedent)) *100,2)
            intersects.append(intersect_percent)
            recomendations.append(rec)

        if give_recs == True:
            print(f'Recommendation for {product}:')
            for rec in recomendations:
                print(f'{i}: {rec}')
                i+=1 
            print('\n')
            print(f'Hit Rate: {mean(intersects)}')

        return mean(intersects)
    except:
        return 0

In [7]:
products_list = list(transactions['Description'].unique())
global_hit_rate = []

for product in products_list:
    rate = get_rec_CountVectorizer_full(product,give_recs=False)
    global_hit_rate.append(rate)
mean(global_hit_rate)

14.738352091926929

In [8]:
get_rec_CountVectorizer_full('red hanging heart tlight holder',give_recs=True)

Recommendation for red hanging heart tlight holder:
1: white hanging heart tlight holder
2: wooden frame antique white
3: set of 12  vintage postcard set
4: set of 6 vintage notelets kit
5: strawberry lunch box with cutlery
6: red retrospot shopping bag
7: pack of 72 retrospot cake cases
8: lunch box with cutlery retrospot
9: 60 cake cases dolly girl design
10: pack of 60 spaceboy cake cases


Hit Rate: 20.136


20.136

In [9]:
transactions.groupby('Country')['Sales'].sum()


Country
Australia               1.381713e+05
Austria                 8.616640e+03
Bahrain                 4.594000e+02
Belgium                 3.683559e+04
Brazil                  1.143600e+03
Canada                  2.581200e+03
Channel Islands         1.857544e+04
Cyprus                  1.323789e+04
Czech Republic          7.867400e+02
Denmark                 1.781964e+04
EIRE                    2.706498e+05
European Community      1.159250e+03
Finland                 1.824088e+04
France                  1.840808e+05
Germany                 2.048625e+05
Greece                  4.425520e+03
Hong Kong               9.949090e+03
Iceland                 4.310000e+03
Israel                  8.135260e+03
Italy                   1.580834e+04
Japan                   3.349234e+04
Lebanon                 1.693880e+03
Lithuania               1.598060e+03
Malta                   2.058090e+03
Netherlands             2.831873e+05
Norway                  3.245464e+04
Poland                  6.8887