In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# Read in the individual csv files, and store as separate Pandas Dataframes
aisles = pd.read_csv('C:/Users/Vikram Pande/Desktop/aisles.csv')
departments = pd.read_csv('C:/Users/Vikram Pande/Desktop/departments.csv')
order_products__prior = pd.read_csv('C:/Users/Vikram Pande/Desktop/order_products__prior.csv')
order_products__train = pd.read_csv('C:/Users/Vikram Pande/Desktop/order_products__train.csv')
products = pd.read_csv('C:/Users/Vikram Pande/Desktop/products.csv')

<b>N.B. Ordnarily, I'd perform the below step in a DB using SQL (or run Spark SQL queries, if our data is stored in S3 buckets, or similar), and assuming the DB uses an ODBC driver, connect the Python Interpretor to the DB using pyodbc, and paste that query directly into this notebook. But, for the context of this assessment, I'll use the Pandas merge function to perform a similar operation to join these tables with their appropriate join predicates.</b>

In [3]:
# Merge data sets as a left join using product_id, department_id, and aisle_id as join predicates
merged_products = pd.merge(products, aisles, on='aisle_id', how='left')
merged_products = pd.merge(merged_products, departments, on='department_id', how='left')
merged_products = pd.merge(merged_products, order_products__train, on='product_id', how='left')
merged_products = merged_products.reindex(columns=['order_id','product_id','product_name','aisle_id','aisle','department_id','department', 'add_to_cart_order', 'reordered'])
merged_products

Unnamed: 0,order_id,product_id,product_name,aisle_id,aisle,department_id,department,add_to_cart_order,reordered
0,6695.0,1,Chocolate Sandwich Cookies,61,cookies cakes,19,snacks,7.0,1.0
1,48361.0,1,Chocolate Sandwich Cookies,61,cookies cakes,19,snacks,9.0,0.0
2,63770.0,1,Chocolate Sandwich Cookies,61,cookies cakes,19,snacks,4.0,0.0
3,75339.0,1,Chocolate Sandwich Cookies,61,cookies cakes,19,snacks,9.0,0.0
4,240996.0,1,Chocolate Sandwich Cookies,61,cookies cakes,19,snacks,3.0,1.0
...,...,...,...,...,...,...,...,...,...
1395177,1092104.0,49687,Smartblend Healthy Metabolism Dry Cat Food,41,cat food care,8,pets,1.0,0.0
1395178,655800.0,49688,Fresh Foaming Cleanser,73,facial care,11,personal care,10.0,1.0
1395179,2198380.0,49688,Fresh Foaming Cleanser,73,facial care,11,personal care,10.0,0.0
1395180,2508423.0,49688,Fresh Foaming Cleanser,73,facial care,11,personal care,3.0,0.0


In [4]:
# Delete all prior dataframes, except for the new 'merged_products', to save on memory
del aisles, departments, order_products__prior, order_products__train, products

In [5]:
# Selecting the product name, department, and aisle dimensions as features
features = merged_products[['product_name', 'aisle', 'department']]

# Due to errors caused by memory limitations during pre-processing in the following cells, limiting the feature selection to the first 50,000 rows
features = features.loc[0:50000]
features

Unnamed: 0,product_name,aisle,department
0,Chocolate Sandwich Cookies,cookies cakes,snacks
1,Chocolate Sandwich Cookies,cookies cakes,snacks
2,Chocolate Sandwich Cookies,cookies cakes,snacks
3,Chocolate Sandwich Cookies,cookies cakes,snacks
4,Chocolate Sandwich Cookies,cookies cakes,snacks
...,...,...,...
49996,Organic Ground Flaxseed,vitamins supplements,personal care
49997,Organic Ground Flaxseed,vitamins supplements,personal care
49998,Perfecto,fresh dips tapenades,deli
49999,Perfecto,fresh dips tapenades,deli


<b>IMPORTANT N.B. The 'for' loop in the following cell is iterating through 50,000 rows of text data, it takes a long time to compute - even on a high spec machine.</b>

In [6]:
# Combine the features together into a new column to create a Bag of Words
for index, row in features.iterrows():
    features['combined_features'] = features['product_name'] + ' ' + features['aisle'] + ' ' + features['department']

features

Unnamed: 0,product_name,aisle,department,combined_features
0,Chocolate Sandwich Cookies,cookies cakes,snacks,Chocolate Sandwich Cookies cookies cakes snacks
1,Chocolate Sandwich Cookies,cookies cakes,snacks,Chocolate Sandwich Cookies cookies cakes snacks
2,Chocolate Sandwich Cookies,cookies cakes,snacks,Chocolate Sandwich Cookies cookies cakes snacks
3,Chocolate Sandwich Cookies,cookies cakes,snacks,Chocolate Sandwich Cookies cookies cakes snacks
4,Chocolate Sandwich Cookies,cookies cakes,snacks,Chocolate Sandwich Cookies cookies cakes snacks
...,...,...,...,...
49996,Organic Ground Flaxseed,vitamins supplements,personal care,Organic Ground Flaxseed vitamins supplements p...
49997,Organic Ground Flaxseed,vitamins supplements,personal care,Organic Ground Flaxseed vitamins supplements p...
49998,Perfecto,fresh dips tapenades,deli,Perfecto fresh dips tapenades deli
49999,Perfecto,fresh dips tapenades,deli,Perfecto fresh dips tapenades deli


In [7]:
# Index the product_name column for later use by the recommender function
features.set_index('product_name', inplace=True)

<b>N.B. During the following feature extraction steps, since there aren't any stopwords in the df, we'll keep the stopwords argument in CountVectorizer() and TfidVectorizer() as default=None</b>

In [8]:
# Feature extraction using the Count Vectoriser.
cv = CountVectorizer(analyzer='char_wb')
count_vector_matrix = cv.fit_transform(features['combined_features'])

# Converting to a Pandas Series and index so as to match indexes with product names
indices = pd.Series(features.index)

In [67]:
# Check out some of the product entries to run as test cases for the recommender function
indices.unique()

array(['Chocolate Sandwich Cookies', 'All-Seasons Salt',
       'Robust Golden Unsweetened Oolong Tea', ..., 'Bubblemint Gum',
       'Organic Ground Flaxseed', 'Perfecto'], dtype=object)

In [9]:
# Instantiate Cosine Similarity model using the Count Vectoriser feature extraction technique, and generate the array
cosine_sim_cvm = cosine_similarity(count_vector_matrix)
cosine_sim_cvm

array([[1.        , 1.        , 1.        , ..., 0.79414703, 0.79414703,
        0.79414703],
       [1.        , 1.        , 1.        , ..., 0.79414703, 0.79414703,
        0.79414703],
       [1.        , 1.        , 1.        , ..., 0.79414703, 0.79414703,
        0.79414703],
       ...,
       [0.79414703, 0.79414703, 0.79414703, ..., 1.        , 1.        ,
        1.        ],
       [0.79414703, 0.79414703, 0.79414703, ..., 1.        , 1.        ,
        1.        ],
       [0.79414703, 0.79414703, 0.79414703, ..., 1.        , 1.        ,
        1.        ]])

In [10]:
# Create function for parsing recommendations from the Cosine Similarity array into a readable product name
def recommender_cvm(product, cosine_sim = cosine_sim_cvm):
    
    recommended_products = []

    # Grab the index of the product that matches the product_name
    idx = indices[indices == product].index[0]

    # Create a series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending=False)

    # Grab the top 10 most similar products
    top_10_products = list(score_series.iloc[0:10].index)

    # Populate the list with the top 10 recommended products
    for i in top_10_products:
        recommended_products.append(list(features.index)[i])
    
    return recommended_products

In [11]:
# Some test cases for the recommender (please bear in mind, the recommender is case sensitive!): 'All-Seasons Salt', 'Perfecto' (this was to test the robustness of the model by using a brand name to see if the recommender can return appropriate similarities), 'Sweet Mint Gum' and 'Bubblemint Gum' to show case the performance of the model, and the differences between Count Vectoriser and TF-IDF in predicted values.
recommender_cvm('All-Seasons Salt')

['All-Seasons Salt',
 'All-Seasons Salt',
 'All-Seasons Salt',
 'All-Seasons Salt',
 'Salt Free Seasoning',
 "Nature's Seasons Seasoning Blend",
 "Nature's Seasons Seasoning Blend",
 'All Natural Chives',
 'All Natural Chives',
 'All Natural Chives']

<b>Due to many products with high cardinality in the product_name dimension, let's compare Count Vectoriser with TF-IDF. Since, TF-IDF will penalise cardinality (frequently reccuring terms), it may be useful to compare the two different feature extraction tools and see how they compare.</b>

In [12]:
# Use TF-IDF Vectoriser to compare differences in Cosine Similarity calculation. However, choosing only to use the word analyzer in the TfidfVectorizer() function, as the 'char_wb' analyzer seems to worsen the recommendations.
tfidf = TfidfVectorizer(analyzer='word')
tfidf_matrix = tfidf.fit_transform(features['combined_features'])
tfidf_matrix

<50001x2749 sparse matrix of type '<class 'numpy.float64'>'
	with 335746 stored elements in Compressed Sparse Row format>

In [13]:
# Calculate Cosine Similarity for TF-IDF
cosine_sim_tfidf = cosine_similarity(tfidf_matrix)
cosine_sim_tfidf

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 1., 1.],
       [0., 0., 0., ..., 1., 1., 1.],
       [0., 0., 0., ..., 1., 1., 1.]])

In [14]:
# Modify the previous recommendations() function to parse the TD-IDF feature extraction technique to assess differences in feature extraction methods
def recommender_tfidf(product, cosine_sim = cosine_sim_tfidf):
    
    recommended_products = []

    # Grab the index of the product that matches the product_name
    idx = indices[indices == product].index[0]

    # Create a series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending=False)

    # Grab the top 10 most similar products
    top_10_products = list(score_series.iloc[0:10].index)

    # Populate the list with the top 10 recommended products
    for i in top_10_products:
        recommended_products.append(list(features.index)[i])
    
    return recommended_products

In [15]:
recommender_tfidf('All-Seasons Salt')

['All-Seasons Salt',
 'All-Seasons Salt',
 'All-Seasons Salt',
 'All-Seasons Salt',
 "Nature's Seasons Seasoning Blend",
 "Nature's Seasons Seasoning Blend",
 'Salt Free Seasoning',
 'Sauce with salt, spices and pepper Seasoning',
 'Reduced Sodium Salt Alternative',
 'All Natural Chives']