Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import itertools

Read datasets

In [6]:
transaction = pd.read_csv('../Datasets/sales_transaction_items.csv')
product = pd.read_csv('../Datasets/products.csv')

Calculate Lift (Apriori Association)

In [3]:
# Group products by transaction_id to create baskets
baskets = transaction.groupby('transaction_id')['product_id'].apply(set)

# Get all unique products
products = set(transaction['product_id'])

# Calculate total number of transactions
num_transactions = len(baskets)

# Precompute support for each product
product_support = {p: sum([p in basket for basket in baskets]) / num_transactions for p in products}

# Calculate lift for each product pair
lift_results = []
for p1 in products:
    for p2 in products:
        if p1 >= p2:  # Avoid duplicate pairs and self-pairs
            continue
        # Support for both products together
        support_both = sum([(p1 in basket) and (p2 in basket) for basket in baskets]) / num_transactions
        # Calculate lift
        if product_support[p1] > 0 and product_support[p2] > 0:
            lift = support_both / (product_support[p1] * product_support[p2])
            lift_results.append((p1, p2, lift))
        else:
            lift_results.append((p1, p2, -1))

# Output results in the requested format
for r in lift_results:
    print(f'{r[0]},{r[1]},{r[2]}')

P1030,P1105,0.0
P1030,P1162,8.928571428571429
P1030,P1307,0.0
P1030,P1110,5.4945054945054945
P1030,P1230,0.0
P1030,P1073,0.0
P1030,P1252,0.0
P1030,P1114,0.0
P1030,P1069,0.0
P1030,P1225,6.211180124223603
P1030,P1111,5.4945054945054945
P1030,P1054,0.0
P1030,P1180,0.0
P1030,P1166,0.0
P1030,P1212,0.0
P1030,P1223,0.0
P1030,P1065,0.0
P1030,P1157,0.0
P1030,P1063,0.0
P1030,P1158,0.0
P1030,P1101,0.0
P1030,P1104,0.0
P1030,P1178,0.0
P1030,P1107,0.0
P1030,P1265,0.0
P1030,P1039,0.0
P1030,P1098,0.0
P1030,P1242,5.714285714285714
P1030,P1289,0.0
P1030,P1358,0.0
P1030,P1137,0.0
P1030,P1235,0.0
P1030,P1087,0.0
P1030,P1125,0.0
P1030,P1300,0.0
P1030,P1245,0.0
P1030,P1093,0.0
P1030,P1331,0.0
P1030,P1060,5.4945054945054945
P1030,P1318,0.0
P1030,P1174,0.0
P1030,P1213,5.291005291005291
P1030,P1103,0.0
P1030,P1134,0.0
P1030,P1077,4.926108374384237
P1030,P1122,0.0
P1030,P1342,0.0
P1030,P1325,0.0
P1030,P1096,0.0
P1030,P1257,0.0
P1030,P1321,0.0
P1030,P1293,0.0
P1030,P1294,0.0
P1030,P1200,0.0
P1030,P1356,0.0
P1030

In [5]:
# Calculate Yule's Q for each product pair
yule_results = []
for p1 in products:
    for p2 in products:
        if p1 >= p2:  # Avoid duplicate pairs and self-pairs
            continue
        n11 = sum([(p1 in basket) and (p2 in basket) for basket in baskets])
        n10 = sum([(p1 in basket) and (p2 not in basket) for basket in baskets])
        n01 = sum([(p1 not in basket) and (p2 in basket) for basket in baskets])
        n00 = sum([(p1 not in basket) and (p2 not in basket) for basket in baskets])
        denominator = n11 * n00 + n10 * n01
        if denominator == 0:
            q = None
        else:
            q = (n11 * n00 - n10 * n01) / denominator
        yule_results.append((p1, p2, q))

# Output results in the requested format
for r in yule_results:
    print(f'{r[0]},{r[1]},{r[2]}')

P1030,P1105,-1.0
P1030,P1162,0.8237476808905381
P1030,P1307,-1.0
P1030,P1110,0.7104806022003475
P1030,P1230,-1.0
P1030,P1073,-1.0
P1030,P1252,-1.0
P1030,P1114,-1.0
P1030,P1069,-1.0
P1030,P1225,0.7409478952016485
P1030,P1111,0.7104806022003475
P1030,P1054,-1.0
P1030,P1180,-1.0
P1030,P1166,-1.0
P1030,P1212,-1.0
P1030,P1223,-1.0
P1030,P1065,-1.0
P1030,P1157,-1.0
P1030,P1063,-1.0
P1030,P1158,-1.0
P1030,P1101,-1.0
P1030,P1104,-1.0
P1030,P1178,-1.0
P1030,P1107,-1.0
P1030,P1265,-1.0
P1030,P1039,-1.0
P1030,P1098,-1.0
P1030,P1242,0.7205240174672489
P1030,P1289,-1.0
P1030,P1358,-1.0
P1030,P1137,-1.0
P1030,P1235,-1.0
P1030,P1087,-1.0
P1030,P1125,-1.0
P1030,P1300,-1.0
P1030,P1245,-1.0
P1030,P1093,-1.0
P1030,P1331,-1.0
P1030,P1060,0.7104806022003475
P1030,P1318,-1.0
P1030,P1174,-1.0
P1030,P1213,0.7005470774546502
P1030,P1103,-1.0
P1030,P1134,-1.0
P1030,P1077,0.6810025633722586
P1030,P1122,-1.0
P1030,P1342,-1.0
P1030,P1325,-1.0
P1030,P1096,-1.0
P1030,P1257,-1.0
P1030,P1321,-1.0
P1030,P1293,-1.0
P103

Combine the results

In [15]:
# Combine lift_results and yule_results into a DataFrame and save as ProductTransactionScores.csv
combined = []
for (p1, p2, lift), (_, _, yuleq) in zip(lift_results, yule_results):
    lift = None if lift == -1 else lift
    combined.append({'ProductID1': p1, 'ProductID2': p2, 'Lift': lift, 'YuleQ': yuleq})

df_combined = pd.DataFrame(combined)
df_combined.to_csv('../Datasets/ProductTransactionScores.csv', index=False)
print(df_combined.head())

  ProductID1 ProductID2      Lift     YuleQ
0      P1030      P1105  0.000000 -1.000000
1      P1030      P1162  8.928571  0.823748
2      P1030      P1307  0.000000 -1.000000
3      P1030      P1110  5.494505  0.710481
4      P1030      P1230  0.000000 -1.000000
