In [1]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from pathlib import Path
import numpy as np
from itertools import permutations, combinations
import time
import math
from typing import Iterable

# Load the dataset
notebook_dir = Path().resolve()
movilens = pd.read_csv( notebook_dir / 'movilens_dataset/movies.csv')

# Transform the dataset
movilens['genres'] = movilens['genres'].str.split('|')

# Remove the '(no genres listed)' genre
movilens = movilens[movilens['genres'].apply(lambda x: '(no genres listed)' not in x)]
te = TransactionEncoder()
te_ary = te.fit(movilens['genres']).transform(movilens['genres'])
movilens = pd.DataFrame(te_ary, columns=te.columns_).set_index(movilens['title'])

Calcular el soporte, confianza y lift de las siguientes reglas:  
- `Romance -> Drama`
- `Action, Adventure -> Thriller`
- `Crime, Action -> Thriller` 
- `Crime -> Action, Thriller`
- `Crime -> Children's`

In [2]:
# Supports
print("Confidences")
support_Ro_Dr = np.logical_and(movilens['Romance'], movilens['Drama']).mean()
print(f"Suppport (Romance -> Drama): {support_Ro_Dr}")
support_Ac_Ad_Th = np.logical_and(np.logical_and(movilens['Action'], movilens['Adventure']), movilens['Thriller']).mean()
print(f"Suppport (Action, Adventure -> Thriller): {support_Ac_Ad_Th}")
support_Cr_Ac_Th = np.logical_and(np.logical_and(movilens['Crime'], movilens['Action']), movilens['Thriller']).mean()
print(f"Suppport (Crime, Action -> Thriller): {support_Cr_Ac_Th}")
support_Cr_Ch = np.logical_and(movilens['Crime'], movilens['Children']).mean()
print(f"Suppport (Crime -> Children): {support_Cr_Ch}")

# Confidences
print("\nConfidences")
confidence_Ro_Dr = support_Ro_Dr / movilens['Romance'].mean()
print(f"Confidence (Romance -> Drama): {confidence_Ro_Dr}")
confidence_AcAd_Th = support_Ac_Ad_Th / np.logical_and(movilens['Action'], movilens['Adventure']).mean()
print(f"Confidence (Action, Adventure -> Thriller): {confidence_AcAd_Th}")
confidence_CrAc_Th = support_Cr_Ac_Th / np.logical_and(movilens['Crime'], movilens['Action']).mean()
print(f"Confidence (Crime, Action -> Thriller): {confidence_CrAc_Th}")
confidence_Cr_AcTh = support_Cr_Ac_Th / movilens['Crime'].mean()
print(f"Confidence (Crime -> Action, Thriller): {confidence_Cr_AcTh}")
confidence_Cr_Ch = support_Cr_Ch / movilens['Crime'].mean()
print(f"Confidence (Crime -> Children): {confidence_Cr_Ch}")

# Lifts
print("\nLifts")
lift_Ro_Dr = confidence_Ro_Dr / movilens['Drama'].mean()
print(f"Lift (Romance -> Drama): {lift_Ro_Dr}")
lift_AcAd_Th = confidence_AcAd_Th / movilens['Thriller'].mean()
print(f"Lift (Action, Adventure -> Thriller): {lift_AcAd_Th}")
lift_CrAc_Th = confidence_CrAc_Th / movilens['Thriller'].mean()
print(f"Lift (Crime, Action -> Thriller): {lift_CrAc_Th}")
lift_Cr_AcTh = confidence_Cr_AcTh / np.logical_and(movilens['Action'], movilens['Thriller']).mean()
print(f"Lift (Crime -> Action, Thriller): {lift_Cr_AcTh}")
lift_Cr_Ch = confidence_Cr_Ch / movilens['Children'].mean()
print(f"Lift (Crime -> Children): {lift_Cr_Ch}")

Confidences
Suppport (Romance -> Drama): 0.07382150177007639
Suppport (Action, Adventure -> Thriller): 0.004732625302776225
Suppport (Crime, Action -> Thriller): 0.010471399291969443
Suppport (Crime -> Children): 0.0008198248555990311

Confidences
Confidence (Romance -> Drama): 0.5731507377760632
Confidence (Action, Adventure -> Thriller): 0.1814285714285714
Confidence (Crime, Action -> Thriller): 0.4596510359869138
Confidence (Crime -> Action, Thriller): 0.12084288990825688
Confidence (Crime -> Children): 0.009461009174311925

Lifts
Lift (Romance -> Drama): 1.3501536253010085
Lift (Action, Adventure -> Thriller): 1.2353807953021347
Lift (Crime, Action -> Thriller): 3.1298491628289344
Lift (Crime -> Action, Thriller): 3.413493632303235
Lift (Crime -> Children): 0.1685085273402614


In [3]:
def _get_support(itemset: list, onehot_dataset: pd.DataFrame) -> float:
    logic_and_result = onehot_dataset[itemset[0]]
    for item in itemset[1:]:
        logic_and_result = np.logical_and(logic_and_result, onehot_dataset[item])
    return logic_and_result.mean()

In [4]:
def _rule_metrics(antedecent: list, consequent: list, onehot_dataset: pd.DataFrame) -> dict[str, float | str]:
    itemset = antedecent + consequent
    support = _get_support(itemset=itemset, onehot_dataset=onehot_dataset)
    confidence = support / _get_support(itemset=antedecent, onehot_dataset=onehot_dataset)
    lift = confidence / _get_support(itemset=consequent, onehot_dataset=onehot_dataset)
    
    metrics = {
        'rule': f"{antedecent} -> {consequent}",
        'support': support,
        'confidence': confidence,
        'lift': lift
    }
    return metrics

In [5]:
def get_rules_metrics_df(rules: list[tuple[list[str], list[str]]], onehot_dataset: pd.DataFrame) -> pd.DataFrame:
    rules_metrics = {
        'rule': [],
        'support': [],
        'confidence': [],
        'lift': []
    }
    for rule in rules:
        rule_metric = _rule_metrics(rule[0], rule[1], onehot_dataset)
        rules_metrics['rule'].append(rule_metric['rule'])
        rules_metrics['support'].append(rule_metric['support'])
        rules_metrics['confidence'].append(rule_metric['confidence'])
        rules_metrics['lift'].append(rule_metric['lift'])
    return pd.DataFrame(rules_metrics)

In [6]:
def get_number_of_rules(n_itemset: int, n_subitemset: int) -> int:
    result = 0
    for n_ant in range(1, n_subitemset):
        result += math.comb(n_itemset, n_subitemset)*math.comb(n_subitemset, n_ant)
    return result

def get_rules(itemset: Iterable[str], n_subitemset: int) -> list[tuple[list[str], list[str]]]:
    rules = []
    for subitemset in list(combinations(itemset, n_subitemset)):
        for i in range(1, len(subitemset)):
            for antecedent in combinations(subitemset, i):
                remaining = set(subitemset) - set(antecedent)
                rules.append((list(antecedent), list(remaining)))
    return rules

Calcular matemáticamente y de manera justificada cuantas reglas de tipo A -> B se pueden construir para este dataset.

In [7]:
n_subitemset = 2
print(f"Number of rules for {n_subitemset} itemset: {get_number_of_rules(n_itemset=len(movilens.columns), n_subitemset=n_subitemset)}")

Number of rules for 2 itemset: 342


In [8]:
start = time.time()
rules = get_rules(movilens.columns, n_subitemset=2)
end = time.time()
rules_metrics_df = get_rules_metrics_df(rules, movilens)
rules_metrics_df = rules_metrics_df.sort_values(by=['support', 'confidence', 'lift'], ascending=[False, False, False])

print(f"Time elapsed: {end - start} segs. ")
rules_metrics_df.head(20)

Time elapsed: 0.00030112266540527344 segs. 


Unnamed: 0,rule,support,confidence,lift
136,['Comedy'] -> ['Drama'],0.075536,0.262974,0.619479
137,['Drama'] -> ['Comedy'],0.075536,0.177937,0.619479
223,['Romance'] -> ['Drama'],0.073822,0.573151,1.350154
222,['Drama'] -> ['Romance'],0.073822,0.173899,1.350154
151,['Romance'] -> ['Comedy'],0.057164,0.443823,1.545146
150,['Comedy'] -> ['Romance'],0.057164,0.199014,1.545146
227,['Thriller'] -> ['Drama'],0.056705,0.386112,0.909552
226,['Drama'] -> ['Thriller'],0.056705,0.133577,0.909552
162,['Crime'] -> ['Drama'],0.04673,0.539278,1.27036
163,['Drama'] -> ['Crime'],0.04673,0.11008,1.27036


Calcular matemáticamente y de manera justificada cuantas reglas de tipo A,B -> C ó A -> B,C se pueden construir para este dataset.

In [9]:
n_subitemset = 3
print(f"Number of rules for {n_subitemset} itemset: {get_number_of_rules(n_itemset=len(movilens.columns), n_subitemset=n_subitemset)}")

Number of rules for 3 itemset: 5814


In [10]:
start = time.time()
rules = get_rules(movilens.columns, n_subitemset=3)
end = time.time()
rules_metrics_df = get_rules_metrics_df(rules, movilens)
rules_metrics_df = rules_metrics_df.sort_values(by=['support', 'confidence', 'lift'], ascending=[False, False, False])

print(f"Time elapsed: {end - start} segs. ")
rules_metrics_df.head(20)

  lift = confidence / _get_support(itemset=consequent, onehot_dataset=onehot_dataset)
  confidence = support / _get_support(itemset=antedecent, onehot_dataset=onehot_dataset)


Time elapsed: 0.01947784423828125 segs. 


Unnamed: 0,rule,support,confidence,lift
3274,"['Comedy', 'Romance'] -> ['Drama']",0.020073,0.351152,0.827197
3275,"['Drama', 'Romance'] -> ['Comedy']",0.020073,0.271917,0.946663
3273,"['Comedy', 'Drama'] -> ['Romance']",0.020073,0.265746,2.063252
3272,"['Romance'] -> ['Comedy', 'Drama']",0.020073,0.155849,2.063252
3270,"['Comedy'] -> ['Romance', 'Drama']",0.020073,0.069884,0.946663
3271,"['Drama'] -> ['Romance', 'Comedy']",0.020073,0.047286,0.827197
3754,"['Crime', 'Thriller'] -> ['Drama']",0.016819,0.492901,1.161113
3753,"['Crime', 'Drama'] -> ['Thriller']",0.016819,0.359915,2.450728
3755,"['Drama', 'Thriller'] -> ['Crime']",0.016819,0.296605,3.4229
3750,"['Crime'] -> ['Thriller', 'Drama']",0.016819,0.194094,3.4229


Calcular matemáticamente y de manera justificada cuantas reglas de 9 elementos se pueden construir para este dataset.

In [11]:
n_subitemset = 9
print(f"Number of rules for {n_subitemset} itemset: {get_number_of_rules(n_itemset=len(movilens.columns), n_subitemset=n_subitemset)}")

Number of rules for 9 itemset: 47112780


In [12]:
start = time.time()
rules = get_rules(movilens.columns, n_subitemset=9)
end = time.time()
print(f"Time elapsed: {end - start} segs. ")

Time elapsed: 243.00370693206787 segs. 


Calcular matemáticamente y de manera justificada cuantas reglas de todo tipo que contengan desde 1 hasta 19 elementos (todas las reglas posibles) se pueden construir para este dataset. Siguiendo el enfoque anterior ¿cuánto tiempo tardaría vuestro código en generarlas?

In [16]:
n_subitemset = len(movilens.columns)
print(f"Number of rules for {n_subitemset} itemset: {get_number_of_rules(n_itemset=len(movilens.columns), n_subitemset=n_subitemset)}")

Number of rules for 19 itemset: 524286


In [17]:
start = time.time()
rules = get_rules(movilens.columns, n_subitemset=len(movilens.columns))
end = time.time()
print(f"Time elapsed: {end - start} segs. ")

Time elapsed: 0.9390888214111328 segs. 
