# MDLE: Assignment 1
## A-Priori algorithm

In [1]:
from pyspark.sql.functions import col
from pyspark.sql import SparkSession

In [2]:
# Minimum number of baskets an itemset must appear in to be considered frequent.
MIN_SUPPORT_THRESHOLD = 1_000

# Input dataset and the directory where every result file is written.
INPUT_FILE_PATH = "data/conditions.parquet"
RESULTS_DIRECTORY_PATH = "data/"

In [3]:
# Reuse an already running session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("A-Priori")
    .getOrCreate()
)

# Low-level context of the session, used for the RDD API and broadcasts.
sc = spark.sparkContext

24/04/03 21:36:17 WARN Utils: Your hostname, omen resolves to a loopback address: 127.0.1.1; using 192.168.1.122 instead (on interface wlo1)
24/04/03 21:36:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/03 21:36:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Read the dataset from the configured location rather than a second,
# hard-coded copy of the path. No "compression" option is passed: it is a
# write-side setting, the codec of an existing Parquet file is read from the
# file itself.
data = spark.read.parquet(INPUT_FILE_PATH)

# Row-oriented view of the same data; each row is one basket whose items are
# available through the CONDITIONS field.
baskets = data.rdd

data.show()

                                                                                

+--------------------+--------------------+
|             PATIENT|          CONDITIONS|
+--------------------+--------------------+
|0000055d-e9a9-4f6...|[65966004, 10509002]|
|0000e9ce-2e20-4c2...|[65966004, 161140...|
|0000fc30-1096-40b...|[271737000, 59621...|
|0001b288-1320-470...|[162864005, 72892...|
|000246a4-c6f5-480...|[65363002, 284549...|
|0003a636-b172-48c...|[196416002, 62106...|
|0006d39d-364a-46a...|[428251008, 59621...|
|0007a215-694b-428...|[162864005, 72892...|
|00085029-7bdd-467...|[271737000, 53741...|
|0008dd63-85c3-47b...|[58150001, 271737...|
|0008ed08-1899-444...|[162864005, 10509...|
|00091bb6-7352-43b...|[53741008, 105090...|
|000a949e-82d6-441...|[271737000, 44481...|
|000b05e4-c63c-40c...|[162864005, 27173...|
|000c905e-46d1-4d4...|[43878008, 368581...|
|000da8dd-2917-4bd...|[271737000, 59621...|
|000e6ebf-8ad3-430...|[15777000, 558220...|
|000eb281-9fa1-446...|[162864005, 72892...|
|00106d6a-f7b9-455...|[162864005, 36971...|
|0011b210-c80b-4ed...|[241929008

In [5]:
from itertools import combinations
from time import perf_counter

def _make_candidate_builder(spark_context, frequent_prev, k):
    """
        Build the per-basket function that emits candidate itemsets of size k.

        The returned function only closes over values created here, so it does
        not depend on variables that the caller rebinds later.

        args:
            spark_context: context used to broadcast the lookup sets.
            frequent_prev: set of frequent items (k == 2) or set of frequent
                (k-1)-tuples (k > 2).
            k: size of the candidate itemsets to emit.
    """
    frequent_bc = spark_context.broadcast(frequent_prev)

    if k == 2:
        def build_pairs(basket):
            frequent = frequent_bc.value
            # Drop infrequent items first so the pairing loop only sees survivors
            kept = [item for item in basket.CONDITIONS if item in frequent]
            return [
                ((item1, item2), 1)
                for item1 in kept
                for item2 in kept
                if item1 < item2
            ]

        return build_pairs

    # An item can only be part of a candidate if it occurs in at least one
    # frequent (k-1)-itemset; pruning the basket first avoids enumerating
    # combinations that are guaranteed to fail the subset check below.
    relevant_bc = spark_context.broadcast(
        {item for itemset in frequent_prev for item in itemset}
    )

    def build_itemsets(basket):
        frequent = frequent_bc.value
        relevant = relevant_bc.value
        kept = sorted(item for item in basket.CONDITIONS if item in relevant)
        return [
            (itemset, 1)
            for itemset in combinations(kept, k)
            if all(subset in frequent for subset in combinations(itemset, k - 1))
        ]

    return build_itemsets


def a_priori(baskets_rdd, min_support_threshold, max_k):
    """
        Compute frequent itemsets using the A-Priori algorithm.

        args:
            baskets_rdd: RDD of baskets; every element must expose its items
                through a `CONDITIONS` attribute. Items of a basket are counted
                as given (no de-duplication is performed).
            min_support_threshold: minimum count of an itemset to be considered frequent.
            max_k: maximum size of the itemsets to compute (must be >= 1).

        returns:
            max_k == 1: the set of frequent items.
            max_k > 1: list of (itemset tuple, count) for the itemsets of size
                max_k, sorted by count in descending order. The list is empty
                as soon as some smaller size has no frequent itemset.

        raises:
            ValueError: if max_k is smaller than 1.
    """
    if max_k < 1:
        # The loop below only stops when k reaches max_k, which never happens for max_k < 1
        raise ValueError(f"max_k must be >= 1, got {max_k}")

    # Use the context that owns the RDD instead of relying on a notebook global
    spark_context = baskets_rdd.context

    # First pass: compute frequent itemsets of size 1
    t0 = perf_counter()

    frequent_items = set(
        baskets_rdd.flatMap(
            lambda basket: [(item, 1) for item in basket.CONDITIONS]
        )
        .reduceByKey(lambda a, b: a + b)
        .filter(lambda entry: entry[1] >= min_support_threshold)
        .map(lambda entry: entry[0])
        .collect()
    )

    print(f"Computed frequent itemsets of size 1 in {perf_counter() - t0} seconds")

    if max_k == 1:
        return frequent_items

    frequent_prev = frequent_items
    k = 2

    while True:
        if not frequent_prev:
            # Monotonicity: without frequent (k-1)-itemsets no larger itemset can be frequent
            return []

        t0 = perf_counter()
        build_candidates = _make_candidate_builder(spark_context, frequent_prev, k)

        # Distinct name for the lazy RDD so the broadcast lookups are never shadowed
        counted_rdd = baskets_rdd.flatMap(build_candidates) \
            .reduceByKey(lambda a, b: a + b) \
            .filter(lambda entry: entry[1] >= min_support_threshold)

        if k == max_k:
            res = counted_rdd.sortBy(lambda entry: entry[1], ascending=False).collect()
            print(f"Computed frequent itemsets of size {k} in {perf_counter() - t0} seconds")
            return res

        frequent_prev = set(counted_rdd.map(lambda entry: entry[0]).collect())
        print(f"Computed frequent itemsets of size {k} in {perf_counter() - t0} seconds")

        k += 1

### K = 2

In [6]:
# Frequent pairs with their support counts, most frequent first.
frequent_2_itemsets = a_priori(
    baskets_rdd=baskets,
    min_support_threshold=MIN_SUPPORT_THRESHOLD,
    max_k=2,
)
frequent_2_itemsets[:10]

                                                                                

Computed frequent itemsets of size 1 in 10.10022094699525 seconds


                                                                                

Computed frequent itemsets of size 2 in 15.236589359999925 seconds


[(('195662009', '444814009'), 343651),
 (('10509002', '444814009'), 302516),
 (('15777000', '271737000'), 289176),
 (('162864005', '444814009'), 243812),
 (('271737000', '444814009'), 236847),
 (('15777000', '444814009'), 236320),
 (('10509002', '195662009'), 211065),
 (('444814009', '59621000'), 203450),
 (('162864005', '195662009'), 167438),
 (('40055000', '444814009'), 165530)]

### K = 3

In [7]:
# Frequent triples with their support counts, most frequent first.
frequent_3_itemsets = a_priori(
    baskets_rdd=baskets,
    min_support_threshold=MIN_SUPPORT_THRESHOLD,
    max_k=3,
)
frequent_3_itemsets[:10]

                                                                                

Computed frequent itemsets of size 1 in 6.687722422000661 seconds


                                                                                

Computed frequent itemsets of size 2 in 14.630646610996337 seconds




Computed frequent itemsets of size 3 in 41.26767710100103 seconds


                                                                                

[(('15777000', '271737000', '444814009'), 192819),
 (('10509002', '195662009', '444814009'), 139174),
 (('15777000', '195662009', '271737000'), 132583),
 (('10509002', '15777000', '271737000'), 115510),
 (('162864005', '195662009', '444814009'), 111860),
 (('195662009', '271737000', '444814009'), 108560),
 (('15777000', '195662009', '444814009'), 108083),
 (('15777000', '271737000', '59621000'), 99818),
 (('10509002', '162864005', '444814009'), 97384),
 (('10509002', '271737000', '444814009'), 94793)]

### Saving results

In [8]:
import os
import pickle

# Persist each result list in the results directory. The path is built with
# os.path.join so it stays valid whether or not RESULTS_DIRECTORY_PATH ends
# with a path separator.
for file_name, itemsets in (
    ("frequent_2_itemsets.pkl", frequent_2_itemsets),
    ("frequent_3_itemsets.pkl", frequent_3_itemsets),
):
    with open(os.path.join(RESULTS_DIRECTORY_PATH, file_name), "wb") as f:
        pickle.dump(itemsets, f)

## Rule generation

In [9]:
# Total number of baskets; `metrics` divides support counts by this value to
# turn them into probabilities.
n_total = baskets.count()
n_total

                                                                                

1157578

In [10]:
def metrics(support1, support2, support_union, n_baskets=None):
    """
        Compute the interestingness measures of a rule I -> j.

        args:
            support1: support count of the antecedent I.
            support2: support count of the consequent j.
            support_union: support count of I together with j.
            n_baskets: total number of baskets; when omitted, the notebook-level
                `n_total` is used.

        returns:
            dict with the keys "confidence", "interest", "lift" and
            "standardised_lift".

        raises:
            ZeroDivisionError: if a support is 0, or if the smallest and the
                largest attainable lift coincide (the standardised lift is
                undefined in that case).
    """
    total = n_total if n_baskets is None else n_baskets

    p1 = support1 / total
    p2 = support2 / total

    confidence = support_union / support1
    interest = confidence - p2
    lift = confidence / p2

    # Smallest lift attainable for the given marginals; the largest one is
    # 1 / max(p1, p2). The standardised lift rescales lift between the two.
    x = max(p1 + p2 - 1, 1 / total) / (p1 * p2)
    standardised_lift = (lift - x) / ((1 / max(p1, p2)) - x)

    return {
        "confidence": confidence,
        "interest": interest,
        "lift": lift,
        "standardised_lift": standardised_lift
    }

In [11]:
# (item, support count) for every single item that reaches the minimum support;
# the counts are needed later to evaluate the rules.
frequent_1_itemsets = (
    baskets
    .flatMap(lambda row: [(condition, 1) for condition in row.CONDITIONS])
    .reduceByKey(lambda left, right: left + right)
    .filter(lambda pair: pair[1] >= MIN_SUPPORT_THRESHOLD)
    .collect()
)

                                                                                

In [34]:
# Accumulator for every rule that passes the standardised-lift filter.
rules = []

# item -> support count lookup
frequent_1_itemsets_dict = {item: support for item, support in frequent_1_itemsets}

### K = 2

In [35]:
# Relations X -> Y with a standardised lift >= 0.2
for (item1, item2), support_union in frequent_2_itemsets:
    support1 = frequent_1_itemsets_dict[item1]
    support2 = frequent_1_itemsets_dict[item2]

    # Every frequent pair yields two candidate rules: item1 -> item2, then item2 -> item1
    directions = (
        (item1, item2, support1, support2),
        (item2, item1, support2, support1),
    )

    for antecedent, consequent, support_antecedent, support_consequent in directions:
        rule_metrics = metrics(support_antecedent, support_consequent, support_union)

        if rule_metrics["standardised_lift"] >= 0.2:
            rules.append({"antecedent": int(antecedent), "consequent": int(consequent), **rule_metrics})

### K = 3

In [18]:
# (item1, item2) -> support count lookup for the frequent pairs
frequent_2_itemsets_dict = {
    pair: support for pair, support in frequent_2_itemsets
}

In [36]:
# Relations (X, Y) -> Z with a standardised lift >= 0.2
for itemset, support_union in frequent_3_itemsets:
    # Each item of the triple is tried as the consequent; the other two form the antecedent
    for consequent in itemset:
        # Keeps the order of the triple, so the result matches the key of the pair lookup
        antecedent = tuple(item for item in itemset if item != consequent)

        support1 = frequent_2_itemsets_dict[antecedent]
        support2 = frequent_1_itemsets_dict[consequent]

        rule_metrics = metrics(support1, support2, support_union)

        if rule_metrics["standardised_lift"] >= 0.2:
            rules.append({
                # Stored as ints (numerically sorted), like the consequent and
                # the antecedents of the size-2 rules, instead of raw strings
                "antecedent": tuple(sorted(int(item) for item in antecedent)),
                "consequent": int(consequent),
                **rule_metrics,
            })

In [20]:
# Number of rules collected so far in `rules`.
len(rules)

25665

### Exporting results

In [21]:
import os

import pandas as pd

# Naming the columns explicitly keeps the frame well-formed (and sortable)
# even when no rule passed the filter and `rules` is empty.
rules_df = pd.DataFrame(
    rules,
    columns=["antecedent", "consequent", "confidence", "interest", "lift", "standardised_lift"],
).sort_values("standardised_lift", ascending=False)

# os.path.join keeps the path valid whether or not RESULTS_DIRECTORY_PATH ends
# with a path separator.
rules_df.to_string(
    os.path.join(RESULTS_DIRECTORY_PATH, "association_rules.txt"),
    index=False,
    float_format=lambda x: f"{x:.15f}",
)