### Key concepts:
    - Support: metric that measures how frequently an itemset appears 
               in the dataset relative to the total number of observations.
    - Confidence: assesses the likelihood that an item Y is purchased when 
                  item X is purchased, i.e. Support(X and Y) / Support(X).
    - Lift: evaluates how much more likely two items are to be purchased 
            together compared to being purchased independently,
            i.e. Confidence(X -> Y) / Support(Y).

In [39]:
import itertools
from typing import List

import pandas as pd

In [10]:
# Lower bound on support: the fraction of all transactions in which an
# itemset must appear for it to be treated as frequent.
MINIMUM_SUPPORT_THRESHOLD = 0.5 # How often the item has to appear in all observations
# Lower bound on rule confidence, where
# confidence(A -> B) = Support(A and B together) / Support(A).
MINIMUM_CONFIDENCE_THRESHOLD = 0.7 # Defined as: Support(A and B together) / Support(A)

In [14]:
# Load the transaction table. The path is relative, so it resolves against
# the current working directory. Each row carries a "Transaction ID" and an
# "Items" string holding the purchased items separated by ";".
# NOTE(review): the preview shows an "Unnamed: 0" column, which suggests the
# CSV was written with its index; confirm whether index_col=0 is wanted.
df = pd.read_csv("algos/apriori/dummy_dataset.csv")

In [15]:
df.head()

Unnamed: 0,Transaction ID,Items
0,T1,Bread;Butter;Milk
1,T2,Bread;Butter
2,T3,Bread;Milk
3,T4,Butter;Milk
4,T5,Bread;Milk


### Dataset preprocessing

In [33]:
# Split every ";"-separated basket once so that both steps below work on
# exact item tokens rather than on the raw string.
baskets = [basket.split(";") for basket in df["Items"]]

# Distinct item names in first-seen order (dict keys preserve insertion
# order, which gives an order-stable de-duplication in one pass).
unique_items = list(dict.fromkeys(item for basket in baskets for item in basket))

# One-hot encode: one 0/1 column per item. Membership is tested against the
# split tokens, not the raw string, so an item whose name is a substring of
# another item (e.g. "Butter" vs "Peanut Butter") is not counted by mistake.
for item_name in unique_items:
    df[item_name] = [int(item_name in basket) for basket in baskets]

# The raw string column is no longer needed once the indicators exist.
df = df.drop("Items", axis=1)

In [34]:
df.head()

Unnamed: 0,Transaction ID,Bread,Butter,Milk
0,T1,1,1,1
1,T2,1,1,0
2,T3,1,0,1
3,T4,0,1,1
4,T5,1,0,1


### Apriori functions

In [49]:
def support(df: pd.DataFrame, col_names: List[str]) -> float:
    """Return the fraction of rows in which every listed column is truthy.

    Args:
        df: One row per transaction, one indicator column per item.
        col_names: Column names forming the itemset. A single column name
            given as a bare string is treated as a one-item itemset, and
            any other iterable of names (e.g. a tuple) is accepted too.

    Returns:
        Number of rows where all of the columns are truthy, divided by
        the total number of rows.

    Raises:
        KeyError: If any requested column is missing from ``df``.
    """
    # A bare string would otherwise be iterated character by character by
    # the check below and then select a Series instead of a DataFrame.
    if isinstance(col_names, str):
        col_names = [col_names]
    else:
        # pandas treats a tuple as a single key, so normalise to a list.
        col_names = list(col_names)

    # Check if all columns are present in dataframe
    for col_name in col_names:
        if col_name not in df.columns:
            raise KeyError(f"Column name: {col_name} is not present in dataframe")

    return df[col_names].all(axis=1).sum() / len(df)


# Defined as P_hat(col_name_2 | col_name_1)
#   = Supp({col_name_1, col_name_2}) / Supp({col_name_1})
def confidence(df: pd.DataFrame, col_name_1: str, col_name_2: str) -> float:
    """Return the confidence of the rule ``col_name_1 -> col_name_2``.

    Args:
        df: One row per transaction, one indicator column per item.
        col_name_1: Antecedent item (the item assumed to be purchased).
        col_name_2: Consequent item.

    Returns:
        Support of both items together divided by the support of the
        antecedent alone. No guard is applied for an antecedent that
        never occurs (zero support).
    """
    joint_support = support(df, [col_name_1, col_name_2])
    # support() expects a list of column names, so wrap the single item.
    antecedent_support = support(df, [col_name_1])
    return joint_support / antecedent_support


# Defined as P_hat(col_name_2 | col_name_1) / P_hat(col_name_2)
#   = Supp({col_name_1, col_name_2}) / (Supp({col_name_1}) * Supp({col_name_2}))
def lift(df: pd.DataFrame, col_name_1: str, col_name_2: str) -> float:
    """Return the lift of the rule ``col_name_1 -> col_name_2``.

    A value above 1 means the two items appear together more often than
    they would if they were independent; a value below 1 means less often.

    Args:
        df: One row per transaction, one indicator column per item.
        col_name_1: Antecedent item.
        col_name_2: Consequent item.

    Returns:
        Joint support divided by the product of the individual supports.
        No guard is applied for an item that never occurs (zero support).
    """
    joint_support = support(df, [col_name_1, col_name_2])
    # Expected joint frequency if the two items were independent.
    independent_support = support(df, [col_name_1]) * support(df, [col_name_2])
    return joint_support / independent_support

In [31]:
def get_combinations(item_list, depth, cur_list=None, cur_index=-1):
    """Return every ``depth``-sized combination of ``item_list`` as lists.

    Items inside a combination keep their relative order from
    ``item_list``, and combinations are produced in positional order
    (e.g. ``[a, b]``, ``[a, c]``, ``[b, c]``).

    Args:
        item_list: Sliceable sequence of items to choose from.
        depth: Number of items to pick. ``0`` yields a single combination
            consisting of the prefix alone; a negative value or a value
            larger than the number of available items yields ``[]``.
        cur_list: Prefix prepended to every combination. Defaults to an
            empty prefix.
        cur_index: Only items located after this index are considered.
            The default ``-1`` means the whole list.

    Returns:
        A list of freshly created lists, one per combination.
    """
    # itertools.combinations rejects a negative size; no combination exists.
    if depth < 0:
        return []

    # Copy the prefix so the returned lists never alias the caller's list
    # (or a shared default, which is why the default is None, not []).
    prefix = [] if cur_list is None else list(cur_list)
    remaining = item_list[cur_index + 1:]
    return [prefix + list(combo) for combo in itertools.combinations(remaining, depth)]

In [39]:
# Level-wise search for frequent itemsets: level `depth` examines every
# combination of `depth` items and keeps those meeting the minimum support.
depth = 1
frequent_itemsets = []
while True:
    level_combinations = get_combinations(unique_items, depth)
    if not level_combinations:
        # No combinations of this size exist (depth exceeds the item count).
        break

    found_frequent = False
    for combination in level_combinations:
        # The threshold is a minimum, so an itemset exactly at it qualifies.
        if support(df, combination) >= MINIMUM_SUPPORT_THRESHOLD:
            found_frequent = True
            # Single items only drive the pruning below; they are not reported.
            if len(combination) > 1:
                frequent_itemsets.append(combination)

    # Apriori principle: every superset of an infrequent itemset is itself
    # infrequent, so a level with no frequent itemset ends the search.
    if not found_frequent:
        break
    depth += 1

In [40]:
frequent_itemsets

[['Bread', 'Milk']]