In [1]:
import numpy as np
import pandas as pd


def generate_new_combinations(old_combinations):
    """
    Generator of all combinations based on the last state of Apriori algorithm
    Parameters
    -----------
    old_combinations: np.array
        All combinations with enough support in the last step
        Combinations are represented by a matrix.
        Number of columns is equal to the combination size
        of the previous step.
        Each row represents one combination
        and contains item type ids in the ascending order
        ```
               0        1
        0      15       20
        1      15       22
        2      17       19
        ```
    Returns
    -----------
    Generator of all combinations from the last step x items
    from the previous step.
    Examples
    -----------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori
    """

    items_types_in_previous_step = np.unique(old_combinations.flatten())
    for old_combination in old_combinations:
        max_combination = old_combination[-1]
        mask = items_types_in_previous_step > max_combination
        valid_items = items_types_in_previous_step[mask]
        old_tuple = tuple(old_combination)
        for item in valid_items:
            yield from old_tuple
            yield item


def apriori(
        df, min_support=0.5, use_colnames=True, max_len=None, verbose=0, low_memory=False
):
    """Get frequent itemsets from a one-hot DataFrame
    Parameters
    -----------
    df : pandas DataFrame
      pandas DataFrame the encoded format. Also supports
      DataFrames with sparse data; for more info, please
      see (https://pandas.pydata.org/pandas-docs/stable/
           user_guide/sparse.html#sparse-data-structures)
      Please note that the old pandas SparseDataFrame format
      is no longer supported in mlxtend >= 0.17.2.
      The allowed values are either 0/1 or True/False.
      For example,
    ```
             Apple  Bananas   Beer  Chicken   Milk   Rice
        0     True    False   True     True  False   True
        1     True    False   True    False  False   True
        2     True    False   True    False  False  False
        3     True     True  False    False  False  False
        4    False    False   True     True   True   True
        5    False    False   True    False   True   True
        6    False    False   True    False   True  False
        7     True     True  False    False  False  False
    ```
    min_support : float (default: 0.5)
      A float between 0 and 1 for minumum support of the itemsets returned.
      The support is computed as the fraction
      `transactions_where_item(s)_occur / total_transactions`.
    use_colnames : bool (default: False)
      If `True`, uses the DataFrames' column names in the returned DataFrame
      instead of column indices.
    max_len : int (default: None)
      Maximum length of the itemsets generated. If `None` (default) all
      possible itemsets lengths (under the apriori condition) are evaluated.
    verbose : int (default: 0)
      Shows the number of iterations if >= 1 and `low_memory` is `True`. If
      >=1 and `low_memory` is `False`, shows the number of combinations.
    low_memory : bool (default: False)
      If `True`, uses an iterator to search for combinations above
      `min_support`.
      Note that while `low_memory=True` should only be used for large dataset
      if memory resources are limited, because this implementation is approx.
      3-6x slower than the default.
    Returns
    -----------
    pandas DataFrame with columns ['support', 'itemsets'] of all itemsets
      that are >= `min_support` and < than `max_len`
      (if `max_len` is not None).
      Each itemset in the 'itemsets' column is of type `frozenset`,
      which is a Python built-in type that behaves similarly to
      sets except that it is immutable
      (For more info, see
      https://docs.python.org/3.6/library/stdtypes.html#frozenset).
    Examples
    -----------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
    """

    def _support(_x, _n_rows):
        """DRY private method to calculate support as the
        row-wise sum of values / number of rows
        Parameters
        -----------
        _x : matrix of bools or binary
        _n_rows : numeric, number of rows in _x
        _is_sparse : bool True if _x is sparse
        Returns
        -----------
        np.array, shape = (n_rows, )
        Examples
        -----------
        For usage examples, please see
        http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
        """
        out = np.sum(_x, axis=0) / _n_rows
        return np.array(out).reshape(-1)

    if min_support <= 0.0:
        raise ValueError(
            "`min_support` must be a positive "
            "number within the interval `(0, 1]`. "
            "Got %s." % min_support
        )
    X = df.values
    support = _support(X, X.shape[0])
    ary_col_idx = np.arange(X.shape[1])
    support_dict = {1: support[support >= min_support]}
    itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
    max_itemset = 1
    rows_count = float(X.shape[0])

    all_ones = np.ones((int(rows_count), 1))

    while max_itemset and max_itemset < (max_len or float("inf")):
        next_max_itemset = max_itemset + 1

        # With exceptionally large datasets, the matrix operations can use a
        # substantial amount of memory. For low memory applications or large
        # datasets, set `low_memory=True` to use a slower but more memory-
        # efficient implementation.

        combin = generate_new_combinations(itemset_dict[max_itemset])
        combin = np.fromiter(combin, dtype=int)
        combin = combin.reshape(-1, next_max_itemset)

        if combin.size == 0:
            break
        if verbose:
            print(
                "\rProcessing %d combinations | Sampling itemset size %d"
                % (combin.size, next_max_itemset),
                end="",
            )

        _bools = np.all(X[:, combin], axis=2)
        support = _support(np.array(_bools), rows_count)
        _mask = (support >= min_support).reshape(-1)
        if any(_mask):
            itemset_dict[next_max_itemset] = np.array(combin[_mask])
            support_dict[next_max_itemset] = np.array(support[_mask])
            max_itemset = next_max_itemset
        else:
            # Exit condition
            break

    all_res = []
    for k in sorted(itemset_dict):
        support = pd.Series(support_dict[k])
        itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]], dtype="object")

        res = pd.concat((support, itemsets), axis=1)
        all_res.append(res)

    res_df = pd.concat(all_res)
    res_df.columns = ["support", "itemsets"]
    if use_colnames:
        mapping = {idx: item for idx, item in enumerate(df.columns)}
        res_df["itemsets"] = res_df["itemsets"].apply(
            lambda x: frozenset([mapping[i] for i in x])
        )
    res_df = res_df.reset_index(drop=True)

    if verbose:
        print()  # adds newline if verbose counter was used

    return res_df

In [29]:
from itertools import chain, combinations


def subsets(arr):
    """ Returns non empty subsets of arr"""
    return list(chain(*[combinations(arr, i + 1) for i, a in enumerate(arr)]))

In [37]:
df = pd.read_csv('../data/movies.csv')

In [44]:
df_head = df.head(10)
example = df_head[list(df.columns)[0:5]]
example.to_excel('../data/example.xlsx', index=False)

In [49]:
example_df = pd.read_excel('../data/example.xlsx')

In [50]:
example_df

Unnamed: 0,!Women Art Revolution,'Gator Bait,'Twas the Night Before Christmas,...And God Created Woman,00 Schneider - Jagd auf Nihil Baxter
0,1,0,1,0,1
1,0,0,0,0,1
2,0,1,0,1,0
3,1,0,1,0,1
4,0,1,1,1,0
5,1,1,0,0,0
6,0,0,0,1,1
7,0,0,0,0,0
8,1,0,0,0,0
9,0,0,1,1,1


In [5]:
def check_item(item, df):
    t = df[item].sum(axis=1)
    t = t[t == len(item)]
    return len(t)

In [76]:
ex_comb = list(map(lambda x: frozenset(x), combinations(df_head[list(df.columns)[0:5]], r=2)))

In [3]:
ex_dict = {}
for comb in ex_comb:
    t = check_item(comb, example_df)
    ex_dict[tuple(comb)] = t
res = pd.DataFrame(ex_dict.items(), columns=['Набор','Кол-во'])
res.to_excel('../data/set2.xlsx', index=False)

NameError: name 'ex_comb' is not defined

In [95]:
res

Unnamed: 0,Набор,Кол-во
0,"(!Women Art Revolution, 'Gator Bait)",1
1,"(!Women Art Revolution, 'Twas the Night Before...",2
2,"(...And God Created Woman, !Women Art Revolution)",0
3,"(!Women Art Revolution, 00 Schneider - Jagd au...",2
4,"('Gator Bait, 'Twas the Night Before Christmas)",1
5,"(...And God Created Woman, 'Gator Bait)",2
6,"(00 Schneider - Jagd auf Nihil Baxter, 'Gator ...",0
7,"(...And God Created Woman, 'Twas the Night Bef...",2
8,"(00 Schneider - Jagd auf Nihil Baxter, 'Twas t...",3
9,"(...And God Created Woman, 00 Schneider - Jagd...",2


In [100]:
comb3 = list(combinations(('00 Schneider - Jagd auf Nihil Baxter', "'Twas the Night Before Christmas"), r=1))
print(comb3, sep='\n')

[('00 Schneider - Jagd auf Nihil Baxter',), ("'Twas the Night Before Christmas",)]


In [102]:
check_item(['00 Schneider - Jagd auf Nihil Baxter', "'Twas the Night Before Christmas"],example_df)

3

In [103]:
check_item(['00 Schneider - Jagd auf Nihil Baxter'],example_df)

5

In [74]:
example_df[{'!Women Art Revolution', "'Twas the Night Before Christmas"}].sum(axis=1)

  example_df[{'!Women Art Revolution', "'Twas the Night Before Christmas"}].sum(axis=1)


0    2
1    0
2    0
3    2
4    1
5    1
6    0
7    0
8    1
9    1
dtype: int64

In [62]:
t.values

array([2, 2], dtype=int64)

In [None]:
pd.DataFrame(
    columns=['Набор', 'Кол-во']
)

In [3]:
res_df = apriori(df, min_support=0.07)

In [5]:
res_df

Unnamed: 0,support,itemsets
0,0.129657,"(20,000 Leagues Under the Sea)"
1,0.129657,(2001: A Space Odyssey)
2,0.298063,(48 Hrs.)
3,0.292101,(5 Card Stud)
4,0.093890,(A Brief History of Time)
...,...,...
91003,0.071535,"(Monsoon Wedding, Solaris, Terminator 3: Rise ..."
91004,0.073025,"(Monsoon Wedding, Solaris, Terminator 3: Rise ..."
91005,0.071535,"(Monsoon Wedding, Solaris, Terminator 3: Rise ..."
91006,0.071535,"(Monsoon Wedding, Solaris, Terminator 3: Rise ..."


In [6]:
keys = res_df["itemsets"].values
values = res_df["support"].values

In [7]:
keys

array([frozenset({'20,000 Leagues Under the Sea'}),
       frozenset({'2001: A Space Odyssey'}), frozenset({'48 Hrs.'}), ...,
       frozenset({'Monsoon Wedding', 'Solaris', 'Terminator 3: Rise of the Machines', 'Three Colors: Red', 'Rain Man', 'The Million Dollar Hotel', 'Titanic', 'Sissi'}),
       frozenset({'Monsoon Wedding', 'Solaris', 'Terminator 3: Rise of the Machines', 'Three Colors: Red', 'The Million Dollar Hotel', 'The Hours', 'Titanic', 'Sissi'}),
       frozenset({'Solaris', 'Terminator 3: Rise of the Machines', 'Rain Man', 'The Million Dollar Hotel', 'The Hours', 'Titanic', 'Sissi', 'Reservoir Dogs'})],
      dtype=object)

In [8]:
values

array([0.12965723, 0.12965723, 0.29806259, ..., 0.07153502, 0.07153502,
       0.07004471])

In [9]:
frozenset_vect = np.vectorize(lambda x: frozenset(x))

In [10]:
frozenset_vect(keys)

array([frozenset({'20,000 Leagues Under the Sea'}),
       frozenset({'2001: A Space Odyssey'}), frozenset({'48 Hrs.'}), ...,
       frozenset({'Monsoon Wedding', 'Solaris', 'Terminator 3: Rise of the Machines', 'Three Colors: Red', 'Rain Man', 'The Million Dollar Hotel', 'Titanic', 'Sissi'}),
       frozenset({'Monsoon Wedding', 'Solaris', 'Terminator 3: Rise of the Machines', 'Three Colors: Red', 'The Million Dollar Hotel', 'The Hours', 'Titanic', 'Sissi'}),
       frozenset({'Solaris', 'Terminator 3: Rise of the Machines', 'Rain Man', 'The Million Dollar Hotel', 'The Hours', 'Titanic', 'Sissi', 'Reservoir Dogs'})],
      dtype=object)

In [11]:
frequent_items_dict = dict(zip(frozenset_vect(keys), values))
frequent_items_dict

{frozenset({'20,000 Leagues Under the Sea'}): 0.12965722801788376,
 frozenset({'2001: A Space Odyssey'}): 0.12965722801788376,
 frozenset({'48 Hrs.'}): 0.29806259314456035,
 frozenset({'5 Card Stud'}): 0.29210134128166915,
 frozenset({'A Brief History of Time'}): 0.09388971684053651,
 frozenset({'A Clockwork Orange'}): 0.15052160953800298,
 frozenset({'A Nightmare on Elm Street'}): 0.2667660208643815,
 frozenset({'A River Runs Through It'}): 0.19523099850968703,
 frozenset({'A Time to Kill'}): 0.08941877794336811,
 frozenset({'Aguirre: The Wrath of God'}): 0.13263785394932937,
 frozenset({'Aliens vs Predator: Requiem'}): 0.13412816691505217,
 frozenset({'All the Way Boys'}): 0.23695976154992549,
 frozenset({'Almost Famous'}): 0.10134128166915052,
 frozenset({'American Pie'}): 0.07004470938897168,
 frozenset({'American Pie 2'}): 0.07600596125186289,
 frozenset({'Anatomy of Hell'}): 0.07004470938897168,
 frozenset({'And Then There Were None'}): 0.19076005961251863,
 frozenset({'Ariel'}):

In [18]:
list(frequent_items_dict.keys())[1000]

frozenset({'Back to the Future Part II', 'The 39 Steps'})

In [24]:
k1 = frozenset({'Back to the Future Part II', 'The 39 Steps'})

In [25]:
k1

frozenset({'Back to the Future Part II', 'The 39 Steps'})

In [30]:
all_subsets = subsets({'Back to the Future Part II', 'The 39 Steps'})

In [31]:
condition = all_subsets[0]
consequence = k1.difference(condition)

In [32]:
consequence

frozenset({'The 39 Steps'})

In [33]:
condition

('Back to the Future Part II',)

In [34]:
k1

frozenset({'Back to the Future Part II', 'The 39 Steps'})

In [1]:
from src.apriori import association_rules
import pandas as pd

In [2]:
df = pd.read_csv('../data/movies.csv')
res_df = association_rules(df, min_support=0.07, min_confidence=0.6)
res_df

Unnamed: 0,condition,consequence,condition support,consequence support,support,confidence,lift,leverage
0,"(20,000 Leagues Under the Sea)",(A Nightmare on Elm Street),0.129657,0.266766,0.081967,0.632184,2.369807,0.047379
1,"(20,000 Leagues Under the Sea)",(Monsoon Wedding),0.129657,0.406855,0.090909,0.701149,1.723338,0.038157
2,"(20,000 Leagues Under the Sea)",(Rain Man),0.129657,0.293592,0.086438,0.666667,2.270728,0.048372
3,"(20,000 Leagues Under the Sea)",(Silent Hill),0.129657,0.315946,0.078987,0.609195,1.928161,0.038022
4,"(20,000 Leagues Under the Sea)",(Sissi),0.129657,0.315946,0.086438,0.666667,2.110063,0.045473
...,...,...,...,...,...,...,...,...
1043320,"(Rain Man, Terminator 3: Rise of the Machines,...",(Solaris),0.074516,0.450075,0.070045,0.940000,2.088543,0.036507
1043321,"(Rain Man, Terminator 3: Rise of the Machines,...",(The Million Dollar Hotel),0.076006,0.463487,0.070045,0.921569,1.988336,0.034817
1043322,"(Rain Man, Terminator 3: Rise of the Machines,...",(The Hours),0.081967,0.301043,0.070045,0.854545,2.838614,0.045369
1043323,"(Rain Man, The Hours, The Million Dollar Hotel...",(Terminator 3: Rise of the Machines),0.074516,0.481371,0.070045,0.940000,1.952755,0.034175


In [12]:
res_df.sort_values('lift', ascending=False)

Unnamed: 0,condition,consequence,condition support,consequence support,support,confidence,lift,leverage
23421,(On Guard),"(The Garden of Eden, Muxmäuschenstill)",0.087928,0.099851,0.070045,0.796610,7.977991,0.061265
23423,"(The Garden of Eden, Muxmäuschenstill)",(On Guard),0.099851,0.087928,0.070045,0.701493,7.977991,0.061265
69839,"(Big Fish, Rope)","(Psycho, All the Way Boys)",0.092399,0.095380,0.070045,0.758065,7.947833,0.061232
69836,"(Psycho, All the Way Boys)","(Big Fish, Rope)",0.095380,0.092399,0.070045,0.734375,7.947833,0.061232
1029284,"(48 Hrs., A Nightmare on Elm Street, Cockles a...","(Sissi, Rain Man, The Hours, Monsoon Wedding, ...",0.093890,0.098361,0.071535,0.761905,7.746032,0.062300
...,...,...,...,...,...,...,...,...
228,(And Then There Were None),(Terminator 3: Rise of the Machines),0.190760,0.481371,0.117735,0.617188,1.282145,0.025908
9175,"(The Passion of Joan of Arc, Arlington Road)",(Terminator 3: Rise of the Machines),0.140089,0.481371,0.086438,0.617021,1.281800,0.019003
1213,(Rope),(Terminator 3: Rise of the Machines),0.281669,0.481371,0.172876,0.613757,1.275018,0.037289
588,(Donnie Darko),(Terminator 3: Rise of the Machines),0.138599,0.481371,0.084948,0.612903,1.273245,0.018230


In [11]:
length = res_df['condition'].apply(lambda x: len(x))
length

KeyError: 'condition'

In [7]:
check_item("'Twas the Night Before Christmas", df)

ValueError: No axis named 1 for object type Series

In [9]:
df["'Twas the Night Before Christmas"].sum(axis=0)

2

In [None]:
df['']