In [177]:
import pandas as pd
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity

PARENT_PATH = Path().resolve().parent
DATASET_PATH = PARENT_PATH / 'dataset'

In [178]:
# Load datasets
customer_interactions_df = pd.read_csv(DATASET_PATH / 'customer_interactions.csv')
purchase_history_df = pd.read_csv(DATASET_PATH / 'purchase_history.csv')
product_details_df = pd.read_csv(DATASET_PATH / 'product_details.csv')

In [179]:
# Merge datasets
merged_df = pd.merge(purchase_history_df, product_details_df, on='Product ID')

merged_df.head()

Unnamed: 0,Customer ID,Product ID,Purchase date,Category,Price,Ratings
0,7,6,2024-01-18,Books,370.96,4.5
1,42,6,2024-02-09,Books,370.96,4.5
2,29,6,2024-02-15,Books,370.96,4.5
3,53,6,2024-01-20,Books,370.96,4.5
4,35,6,2024-02-02,Books,370.96,4.5


Now, we will implement a recommendation system using multiple methods, including collaborative filtering and Apriori.

## Collaborative Filtering

In collaborative filtering, two main approaches are commonly used: **user-based and item-based.**

1. **User-based Collaborative Filtering:** This approach recommends items to a target user based on the preferences and behavior of similar users. It identifies users who have similar preferences to the target user and recommends items that those similar users have liked or interacted with. The assumption is that users who have shown similar behavior in the past will likely have similar preferences in the future. User-based collaborative filtering computes similarity scores between users based on their past interactions with items.

2. **Item-based Collaborative Filtering:** In this approach, recommendations are made by identifying items that are similar to the ones that the target user has interacted with in the past. It computes similarity scores between items based on how frequently they are co-rated by users. Then, it recommends items that are most similar to the ones the user has already liked or interacted with. Item-based collaborative filtering is particularly useful when there are more items than users, as it focuses on item similarity rather than user similarity.

Both approaches have their advantages and are used in different scenarios based on factors such as dataset size, sparsity, and computational resources. User-based collaborative filtering is intuitive and easy to understand, while item-based collaborative filtering can be more computationally efficient, especially for large datasets with many items.

### User-Based

First, we will create a user_item_matrix to observe the relationship between each customer and the products they have purchased.

The creation of a user-item matrix involves representing the interaction between users and items in a matrix format, where rows represent users, columns represent items, and the entries of the matrix denote the interaction (e.g., total purchase, rating) between users and items.

In [276]:
# Create user-item matrix
user_item_matrix = merged_df.pivot_table(index='Customer ID', columns='Product ID', values='Ratings', aggfunc='count', fill_value=0)

user_item_matrix.head()

Product ID,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,0,1,2,1,0,1,0,0,0,1,1,1,1,0,0,1,0,1
2,0,0,0,0,0,1,0,0,0,2,0,2,3,2,0,0,0,0,0,0
3,0,0,1,3,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0
4,1,0,1,1,0,1,1,1,0,1,1,0,0,0,1,0,1,0,0,0
5,0,0,1,3,0,2,1,1,0,2,0,0,1,1,1,0,0,1,0,1


Next, we calculate the similarity between each user using cosine similarity.

Cosine similarity measures the cosine of the angle between two vectors in a multidimensional space, providing a measure of similarity between them. In the context of recommendation systems, cosine similarity is often used to compare the preferences of users based on their interactions with items.

By computing cosine similarity between users, we can identify users with similar preferences and behaviors, allowing us to make recommendations based on the preferences of similar users. This technique forms the basis of user-based collaborative filtering in recommendation systems.

In [296]:
# Calculate cosine similarity between users
user_similarity_matrix = cosine_similarity(user_item_matrix)
user_similarity_matrix

array([[1.        , 0.47304992, 0.44474959, ..., 0.        , 0.47087096,
        0.53846154],
       [0.47304992, 1.        , 0.        , ..., 0.07537784, 0.41367015,
        0.65044364],
       [0.44474959, 0.        , 1.        , ..., 0.18898224, 0.58338335,
        0.07412493],
       ...,
       [0.        , 0.07537784, 0.18898224, ..., 1.        , 0.12862394,
        0.24514517],
       [0.47087096, 0.41367015, 0.58338335, ..., 0.12862394, 1.        ,
        0.13453456],
       [0.53846154, 0.65044364, 0.07412493, ..., 0.24514517, 0.13453456,
        1.        ]])

In [297]:
# Function to recommend products to a user
def recommend_products_user(user_id, num_recommendations=6):
    # Find the index of the user in the user-item matrix
    user_index = user_item_matrix.index.get_loc(user_id)
    
    # Find similar users based on cosine similarity
    similar_users = user_similarity_matrix[user_index]
    similar_users_index = similar_users.argsort()[-7:-1][::-1]  # Extract top 6 similar users
    
    # Find products purchased by similar users but not by the target user
    similar_products = []
    for user_index in similar_users_index:
        similar_product_index = list(user_item_matrix.loc[user_index,:][user_item_matrix.loc[user_index,:] > 0].index)
        similar_products += similar_product_index
    
    # Remove duplicates and aggregate the counts of similar products
    similar_products = list(set(similar_products))
    similar_products_agg = [(product_id, user_item_matrix[product_id].sum()) for product_id in similar_products]
    
    # Sort similar products based on aggregated counts
    sorted_similar_products = sorted(similar_products_agg, key=lambda x: x[1], reverse=True)
    similar_products_sorted = [product_id for (product_id, count) in sorted_similar_products]
    
    # Find products not purchased by the target user
    user_product_index = list(user_item_matrix.loc[user_id,:][user_item_matrix.loc[user_id,:] > 0].index)
    recommended_products = [product_id for product_id in similar_products_sorted if product_id not in user_product_index]
    
    # Return recommended products
    return recommended_products[:num_recommendations]

In [291]:
# Example: Recommend products to a user with ID 1
recommended_products = recommend_products_user(1)

print("Recommended Products for User 1:")
for product_id in recommended_products:
    product_info = product_details_df[product_details_df['Product ID'] == product_id]
    print(product_info[['Product ID', 'Category', 'Price']].values[0])

Recommended Products for User 1:
[10 'Clothing' 466.81]
[3 'Books' 399.18]
[1 'Electronics' 210.32]
[7 'Books' 339.5]
[11 'Clothing' 128.39]
[9 'Home & Kitchen' 340.86]


### Item Based

Now, we create an item_user_matrix, which is the transpose of the user_item_matrix created earlier.

In the user_item_matrix, rows represent users and columns represent items. In contrast, the item_user_matrix has items as rows and users as columns.

This transformation is useful for item-based collaborative filtering, where recommendations are made based on the similarity between items. By transposing the user_item_matrix, we can easily compute the similarity between items using techniques such as cosine similarity.

Once we have the item_user_matrix, we can proceed to calculate the similarity between items using cosine similarity. This will enable us to make item-based recommendations to users based on the similarity between items they have interacted with in the past.

In [196]:
item_user_matrix = user_item_matrix.transpose()
item_user_matrix.head()

Customer ID,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
Product ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,1,0,1,0,0,1,2,...,1,0,0,0,0,1,0,3,0,0
2,0,0,0,0,0,0,0,0,0,1,...,2,0,0,0,0,0,0,1,0,1
3,0,0,1,1,1,0,0,1,0,0,...,0,0,1,1,0,0,0,4,0,1
4,1,0,3,1,3,0,0,1,0,0,...,1,2,0,2,0,1,1,0,2,0
5,2,0,1,0,0,3,0,2,0,2,...,0,0,0,1,0,1,1,0,1,0


In [292]:
# Calculate cosine similarity between users
item_similarity_matrix = cosine_similarity(item_user_matrix)

In [293]:
# Function to recommend products to a user
def recommend_products_item(item_id, num_recommendations=6):
    # Find the index of the item in the item-user matrix
    item_index = item_user_matrix.index.get_loc(item_id)
    
    # Find similar items based on cosine similarity
    similar_items = item_similarity_matrix[item_index]
    similar_items_index = similar_items.argsort()[-7:-1][::-1]  # Extract top 6 similar items
    
    # Recommend similar products
    recommended_products = [item_id for item_id in similar_items_index]
    
    return recommended_products[:num_recommendations]

In [294]:
# Example: Recommend products to a Product with ID 7
recommended_products = recommend_products_item(7)

print(f"Recommended Products for Product 1: {product_details_df[product_details_df['Product ID'] == 1].values[0]}")
for product_id in recommended_products:
    product_info = product_details_df[product_details_df['Product ID'] == product_id]
    print(product_info[['Product ID', 'Category', 'Price']].values[0])

Recommended Products for Product 1: [1 'Electronics' 210.32 4.6]
[17 'Home & Kitchen' 345.22]
[5 'Electronics' 492.34]
[2 'Clothing' 228.36]
[14 'Electronics' 388.34]
[1 'Electronics' 210.32]
[12 'Books' 437.51]


## Apriori

Next, we will explore the use of Apriori algorithm.

The Apriori algorithm is a popular technique used in market basket analysis and association rule mining. It is particularly useful for discovering frequent itemsets in transactional data, where each transaction consists of a set of items.

By applying the Apriori algorithm, we can identify frequent itemsets, which are sets of items that frequently occur together in transactions. These frequent itemsets can then be used to generate association rules, which describe relationships between items.

In the context of recommendation systems, Apriori can be used to discover patterns in user-item interactions, such as frequently co-occurring items in user transactions. These patterns can provide valuable insights into user preferences and behavior, which can be leveraged to make personalized recommendations to users.

In [298]:
# Import libraries for Apriori algorithm and association rule mining
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

In [259]:
# Load datasets
customer_interactions_df = pd.read_csv(DATASET_PATH / 'customer_interactions.csv')
purchase_history_df = pd.read_csv(DATASET_PATH / 'purchase_history.csv')
product_details_df = pd.read_csv(DATASET_PATH / 'product_details.csv')

In [299]:
# Group purchase history by 'Customer ID' and aggregate 'Product ID' into lists
transactions = purchase_history_df.groupby('Customer ID')['Product ID'].apply(list)

# Convert transaction data into a list of lists
transactions = transactions.to_list()

# Initialize TransactionEncoder to transform transaction data
te = TransactionEncoder()

# Transform transaction data into one-hot encoded format
te_ary = te.fit(transactions).transform(transactions)

# Create DataFrame from transformed transaction data
transactions_df = pd.DataFrame(te_ary, columns=te.columns_)

transactions_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,False,False,False,True,True,True,False,True,False,False,False,True,True,True,True,False,False,True,False,True
1,False,False,False,False,False,True,False,False,False,True,False,True,True,True,False,False,False,False,False,False
2,False,False,True,True,True,False,False,False,False,False,True,False,False,False,False,False,False,True,True,False
3,True,False,True,True,False,True,True,True,False,True,True,False,False,False,True,False,True,False,False,False
4,False,False,True,True,False,True,True,True,False,True,False,False,True,True,True,False,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,True,False,False,True,True,True,False,True,True,False,True,True,False,False,False,True,False,True,True,True
96,False,False,False,True,True,False,True,True,True,True,False,True,True,False,False,True,False,True,False,False
97,True,True,True,False,False,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False
98,False,False,False,True,True,False,True,False,False,True,True,True,False,True,False,False,False,False,True,False


In [300]:
# Import Apriori algorithm
from mlxtend.frequent_patterns import apriori

# Apply Apriori algorithm to find frequent itemsets
# - `transactions_df`: DataFrame containing transaction data
# - `min_support`: Minimum support threshold for frequent itemsets
# - `use_colnames`: Use original column names as item names in the result DataFrame
frequent_itemsets = apriori(transactions_df, min_support=0.02, use_colnames=True)

# Display frequent itemsets
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.38,(1)
1,0.32,(2)
2,0.41,(3)
3,0.34,(4)
4,0.37,(5)
...,...,...
10342,0.02,"(3, 4, 7, 9, 12, 17, 18, 19, 20)"
10343,0.02,"(3, 4, 8, 9, 11, 12, 15, 16, 20)"
10344,0.02,"(4, 5, 6, 8, 11, 12, 16, 18, 20)"
10345,0.02,"(4, 5, 6, 9, 12, 16, 18, 19, 20)"


In [301]:
# Import association_rules function
from mlxtend.frequent_patterns import association_rules

# Generate association rules from frequent itemsets
# - `frequent_itemsets`: DataFrame containing frequent itemsets
# - `metric`: Metric to use for rule evaluation (e.g., lift)
# - `min_threshold`: Minimum threshold for the metric to generate rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

# Display the generated association rules
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(1),(3),0.38,0.41,0.16,0.421053,1.026958,0.0042,1.019091,0.042339
1,(3),(1),0.41,0.38,0.16,0.390244,1.026958,0.0042,1.016800,0.044492
2,(1),(10),0.38,0.39,0.16,0.421053,1.079622,0.0118,1.053636,0.118952
3,(10),(1),0.39,0.38,0.16,0.410256,1.079622,0.0118,1.051304,0.120902
4,(1),(11),0.38,0.42,0.17,0.447368,1.065163,0.0104,1.049524,0.098672
...,...,...,...,...,...,...,...,...,...,...
348081,(12),"(1, 4, 5, 6, 9, 16, 18, 19, 20)",0.42,0.02,0.02,0.047619,2.380952,0.0116,1.029000,1.000000
348082,(16),"(1, 4, 5, 6, 9, 12, 18, 19, 20)",0.34,0.02,0.02,0.058824,2.941176,0.0132,1.041250,1.000000
348083,(18),"(1, 4, 5, 6, 9, 12, 16, 19, 20)",0.51,0.02,0.02,0.039216,1.960784,0.0098,1.020000,1.000000
348084,(19),"(1, 4, 5, 6, 9, 12, 16, 18, 20)",0.35,0.02,0.02,0.057143,2.857143,0.0130,1.039394,1.000000


In [302]:
# Function to recommend products based on a given product
def recommend_products_apriori(product_id, num_recommendations=6):
    # Find association rules related to the given product ID
    related_rules = rules[rules['antecedents'] == frozenset({product_id})]
    
    # Sort related rules by lift in descending order
    related_rules_sorted = related_rules.sort_values(by='lift', ascending=False)
    
    # Extract consequent items from the sorted rules
    consequent_items = related_rules_sorted['consequents'].apply(lambda x: list(x))
    
    # Initialize recommended products list
    recommended_products = []
    
    # Extract recommended products from consequent items
    for consequent_item_list in consequent_items:
        for consequent_item in consequent_item_list:
            if consequent_item not in recommended_products:
                recommended_products.append(consequent_item)
            if len(recommended_products) >= num_recommendations:
                break
        
    # Return recommended products
    return recommended_products[:num_recommendations]

In [303]:
# Example: Recommend products related to Product ID 10
product_id_sample = 10
product_name_sample = product_details_df[product_details_df['Product ID'] == product_id_sample]['Category'].values[0]
recommended_products = recommend_products_apriori(product_id_sample)

print(f"Recommended Products based on Product ID {product_id_sample} ({product_name_sample}):")
for product_id in recommended_products:
    product_info = product_details_df[product_details_df['Product ID'] == product_id]
    print(product_info[['Product ID', 'Category', 'Price']].values)

Recommended Products based on Product ID 10 (Clothing):
[[16 'Books' 398.97]]
[[1 'Electronics' 210.32]]
[[7 'Books' 339.5]]
[[2 'Clothing' 228.36]]
[[19 'Home & Kitchen' 276.07]]
[[14 'Electronics' 388.34]]


In [305]:
import joblib 

# Save the rules DataFrame to a file
joblib.dump(rules, PARENT_PATH / 'rules.joblib')

['C:\\Users\\Nusatrip\\Repo\\skilvul-technical-test\\rules.joblib']