In [3]:
# -*- coding: utf-8 -*-
"""
Task 1.ipynb
"""

import math
import numbers

import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# loading the 'Task 0 Output.csv' file into a Data Frame
data_frame = pd.read_csv(filepath_or_buffer='Task 0 Output.csv')

def transform(num):
  """
  Binarize an integer value, leaving every non-integer value untouched.

  Parameters:
    num: a single cell value of any type

  Returns:
    1 if num is an integer > 0
    0 if num is an integer <= 0
    num unchanged otherwise (strings, floats, booleans, ...)
  """
  # numbers.Integral matches Python ints as well as NumPy integer scalars
  # (np.int64, ...), which an exact `type(num) is int` check skips.
  # Booleans are excluded explicitly so they are still returned as-is.
  if isinstance(num, numbers.Integral) and not isinstance(num, bool):
    return 1 if num > 0 else 0
  return num

def _ratio(numerator, denominator):
  """
  Divide two numbers without raising on a zero denominator.

  Returns:
    numerator / denominator if denominator != 0
    +inf or -inf (sign of numerator) if only the denominator is 0
    nan if both are 0
  """
  if denominator != 0:
    return numerator / denominator
  if numerator == 0:
    return math.nan
  return math.copysign(math.inf, numerator)


def _interest_measures(antecedent_support, consequent_support, joint_support):
  """
  Compute five measures of interest for a rule A -> B.

  The 2x2 contingency table is expressed in relative frequencies, so it
  sums to 1; all five measures are invariant to that scaling.

  Parameters:
    antecedent_support (float): support of A
    consequent_support (float): support of B
    joint_support (float): support of A and B together

  Returns:
    dict with keys 'odds ratio', 'jaccard', 'cosine', 'interest'
    and 'correlation'
  """
  # cells of the contingency table; clamped at 0 so floating point
  # round-off cannot produce a slightly negative frequency
  f11 = joint_support
  f10 = max(antecedent_support - joint_support, 0.0)
  f01 = max(consequent_support - joint_support, 0.0)
  f00 = max(1.0 - antecedent_support - consequent_support + joint_support, 0.0)

  # marginal totals
  f1p = f11 + f10
  f0p = f01 + f00
  fp1 = f11 + f01
  fp0 = f10 + f00
  N = f1p + f0p

  # _ratio guards the divisions: e.g. f10 = f01 = 0 whenever A and B
  # always occur together, which makes the odds ratio infinite
  return {
    'odds ratio': _ratio(f11 * f00, f10 * f01),
    'jaccard': _ratio(f11, f1p + fp1 - f11),
    'cosine': _ratio(f11, math.sqrt(f1p * fp1)),
    'interest': _ratio(N * f11, f1p * fp1),
    'correlation': _ratio((N * f11) - (f1p * fp1),
                          math.sqrt(f1p * fp1 * f0p * fp0)),
  }


# apply transformation to numbers in data frame
# (newer pandas renames DataFrame.applymap to DataFrame.map; the method is
# looked up on the class so a column named 'map' cannot shadow it)
elementwise_map = getattr(pd.DataFrame, 'map', None)
if elementwise_map is None:
  elementwise_map = pd.DataFrame.applymap
data_frame = elementwise_map(data_frame, transform)

# dropping index from Data Frame to call apriori()
data_frame.reset_index(drop=True, inplace=True)

# dropping date from Data Frame to call apriori()
data_frame.drop('Date', axis=1, inplace=True)

# generating every itemset whose support reaches the (very low) threshold
all_itemsets = apriori(data_frame, min_support=0.000001, use_colnames=True)

# computing length of itemsets to eliminate 1-itemsets
all_itemsets['length'] = all_itemsets['itemsets'].apply(len)

# extracting itemsets with at least 2 items
two_itemsets = all_itemsets[all_itemsets['length'] >= 2]

# selecting 10 itemsets with highest support
frequent_itemsets = two_itemsets.sort_values(by='support', ascending=False).head(10)

# printing itemsets
print(frequent_itemsets)

frequent_itemsets.to_csv("frequent itemsets.csv")

# generating rules for every itemset
rule_df = association_rules(all_itemsets, metric='confidence', min_threshold=0.1)

# selecting the 20 rules with highest support
frequent_itemset_rules = rule_df.sort_values(by='support', ascending=False).head(20)

# selecting 5 of the 20 rules with the highest confidence
# (.copy() so that the measure columns added below are written to an
# independent Data Frame rather than to a slice of frequent_itemset_rules)
highest_confidence_rules = frequent_itemset_rules.sort_values(by='confidence', ascending=False).head(5).copy()

# selecting 5 of the 20 rules with the lowest confidence
lowest_confidence_rules = frequent_itemset_rules.sort_values(by='confidence', ascending=True).head(5)

# printing rules
print(highest_confidence_rules)
print(lowest_confidence_rules)

highest_confidence_rules.to_csv("highest confidence rules.csv")
lowest_confidence_rules.to_csv("lowest confidence rules.csv")

# computing 5 measures of interest for the highest confidence rules
rule_measures = [
  _interest_measures(rule['antecedent support'],
                     rule['consequent support'],
                     rule['support'])
  for _, rule in highest_confidence_rules.iterrows()
]

# collecting each measure of interest into its own list
odds_ratio_list = [measures['odds ratio'] for measures in rule_measures]
jaccard_list = [measures['jaccard'] for measures in rule_measures]
cosine_list = [measures['cosine'] for measures in rule_measures]
interest_list = [measures['interest'] for measures in rule_measures]
correlation_list = [measures['correlation'] for measures in rule_measures]

# appending list of measures of interest to Data Frame
highest_confidence_rules['odds ratio'] = odds_ratio_list
highest_confidence_rules['jaccard'] = jaccard_list
highest_confidence_rules['cosine'] = cosine_list
highest_confidence_rules['interest'] = interest_list
highest_confidence_rules['correlation'] = correlation_list

# displaying updated Data Frame
print(highest_confidence_rules)

  and should_run_async(code)


     support        itemsets  length
49  0.483995  (CCOEY, NCBDY)       2
40  0.370038      (EA, TTWO)       2
19  0.367478    (MSFT, SONY)       2
35  0.366197    (ATVI, TTWO)       2
29  0.346991    (TTWO, MSFT)       2
34  0.344430      (ATVI, EA)       2
25  0.332907   (TCEHY, SONY)       2
32  0.331626   (TCEHY, MSFT)       2
11  0.330346   (NTDOY, MSFT)       2
27  0.329065    (ATVI, MSFT)       2
   antecedents consequents  antecedent support  consequent support   support  \
79     (NCBDY)     (CCOEY)            0.483995            0.483995  0.483995   
78     (CCOEY)     (NCBDY)            0.483995            0.483995  0.483995   
60        (EA)      (TTWO)            0.480154            0.505762  0.370038   
19      (SONY)      (MSFT)            0.478873            0.517286  0.367478   
50      (ATVI)      (TTWO)            0.477593            0.505762  0.366197   

    confidence      lift  leverage  conviction  zhangs_metric  
79    1.000000  2.066138  0.249744         inf  

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])
