In [20]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

In [21]:
df = pd.read_csv("cleaned.csv")
df.describe()

Unnamed: 0.1,Unnamed: 0,Rating,Installs,Metric,Reviews,Price,minimum income,Size
count,6250.0,6250.0,6250.0,6250.0,6250.0,6250.0,6250.0,6250.0
mean,3124.5,0.831658,0.001764,0.001736,0.001665,0.000182,3.7e-05,0.019048
std,1804.363923,0.076868,0.003581,0.003499,0.003214,0.000738,0.000151,0.014349
min,0.0,0.58,0.0,0.0,0.0,0.0,0.0,0.0
25%,1562.25,0.765541,2e-06,2e-06,1e-06,0.0,0.0,0.00599
50%,3124.5,0.840868,0.0001,9.5e-05,4.3e-05,0.0,0.0,0.019081
75%,4686.75,0.890314,0.002,0.001856,0.000727,0.0,0.0,0.025354
max,6249.0,1.0,0.011069,0.010809,0.008571,0.003176,0.000648,0.063627


In [22]:
df.columns

Index(['Unnamed: 0', 'App', 'App Id', 'Category', 'Content Rating', 'Rating',
       'Installs', 'Metric', 'Reviews', 'Price', 'Currency', 'minimum income',
       'Ad Supported', 'In app purchases', 'Size', 'Version', 'Last update',
       'Released', 'Android version Text', 'Editor Choice', 'Developer Id',
       'Developer', 'Developer Address', 'RatingCategorized',
       'InstallsCategorized', 'MetricCategorized', 'ReviewsCategorized',
       'PriceCategorized', 'minimum incomeCategorized', 'SizeCategorized'],
      dtype='object')

In [23]:
df = df.drop(
    columns=[
        "Unnamed: 0",
        "minimum income",
        "minimum incomeCategorized",
    ],
    axis=1,
)

In [24]:
def show_types(df):
    for col in df.columns:
        print(col, "=", df[col][0], "=", type(df[col][0]))


show_types(df)

App = minecraft = <class 'str'>
App Id = com.mojang.minecraftp = <class 'str'>
Category = arcad = <class 'str'>
Content Rating = everyone 10+ = <class 'str'>
Rating = 0.9111484 = <class 'numpy.float64'>
Installs = 0.002 = <class 'numpy.float64'>
Metric = 0.0046289883201103 = <class 'numpy.float64'>
Reviews = 0.0085714869305811 = <class 'numpy.float64'>
Price = 0.0031764456445644 = <class 'numpy.float64'>
Currency = usd = <class 'str'>
Ad Supported = True = <class 'numpy.bool_'>
In app purchases = True = <class 'numpy.bool_'>
Size = 0.0253542472483757 = <class 'numpy.float64'>
Version = 1.16.221.01 = <class 'str'>
Last update = 21-apr-21 = <class 'str'>
Released = 15-aug-11 = <class 'str'>
Android version Text = varies with devic = <class 'str'>
Editor Choice = True = <class 'numpy.bool_'>
Developer Id = 4.77e+18 = <class 'str'>
Developer = mojang = <class 'str'>
Developer Address = mojang
maria skolgata 83
118 53
stockholm
sweden = <class 'str'>
RatingCategorized = Very High = <class '

In [25]:
fp_cols = [
    "App",
    "App Id",
    "Category",
    "Content Rating",
    "Currency",
    "Ad Supported",
    "In app purchases",
    "Version",
    "Last update",
    "Released",
    "Android version Text",
    "Editor Choice",
    "Developer",
    "Developer Address",
    "RatingCategorized",
    "InstallsCategorized",
    "MetricCategorized",
    "ReviewsCategorized",
    "PriceCategorized",
    "SizeCategorized",
]

In [26]:
# apriori doesn't work with numeric data , so we'll keep categorical and bool data only
# and using get_dummies function we'll make it applyable for the apriori function
df = df[fp_cols]
df = pd.get_dummies(df, columns=fp_cols)
df.columns

Index(['App_#name?', 'App_.r', 'App_10 best foods for y',
       'App_10 wpm amateur ham radio cw morse code train',
       'App_100 doors of reveng', 'App_101 c programming problem', 'App_11st',
       'App_1800 contacts - lens stor', 'App_1line – one line with one touch',
       'App_20 minuten (ch)',
       ...
       'ReviewsCategorized_Low', 'ReviewsCategorized_Very High',
       'ReviewsCategorized_Very Low', 'PriceCategorized_Very High',
       'PriceCategorized_Very Low', 'SizeCategorized_High',
       'SizeCategorized_Low', 'SizeCategorized_Medium',
       'SizeCategorized_Very High', 'SizeCategorized_Very Low'],
      dtype='object', length=18538)

In [27]:
min_support = 1
frequent_itemsets = []
while len(frequent_itemsets) < 100:
    min_support -= 0.01

    frequent_itemsets = apriori(df, min_support=min_support, use_colnames=True)
    frequent_itemsets = frequent_itemsets[
        frequent_itemsets["itemsets"].apply(lambda x: len(x) >= 2)
    ]
frequent_itemsets = frequent_itemsets.sort_values(by="support", ascending=False)
frequent_itemsets.to_csv("frequent_itemsets.csv")
print(f"Min support = {min_support}")
frequent_itemsets

Min support = 0.7599999999999998


Unnamed: 0,support,itemsets
12,0.98016,"(Currency_usd, Ad Supported_True)"
18,0.95840,"(Editor Choice_False, Ad Supported_True)"
22,0.94272,"(PriceCategorized_Very Low, Ad Supported_True)"
39,0.93856,"(Editor Choice_False, Currency_usd, Ad Support..."
13,0.93856,"(Editor Choice_False, Currency_usd)"
...,...,...
110,0.76528,"(MetricCategorized_Very Low, Ad Supported_True..."
102,0.76528,"(MetricCategorized_Very Low, Ad Supported_True..."
86,0.76528,"(PriceCategorized_Very Low, Editor Choice_Fals..."
32,0.76400,"(ReviewsCategorized_Very Low, PriceCategorized..."


In [28]:
minimum_threshold = 1
rules = pd.DataFrame()

while len(rules) < 30:
    minimum_threshold -= 0.01
    rules = association_rules(
        frequent_itemsets,
        metric="lift",
        min_threshold=minimum_threshold,
        support_only=True,
    )
print(f"Min threshhold = {minimum_threshold}")

Min threshhold = 0.8999999999999999


In [29]:
rules = rules.sort_values(by="support", ascending=False)
rules.to_csv("rules.csv")

In [30]:
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Currency_usd),(Ad Supported_True),,,0.98016,,,,,
1,(Ad Supported_True),(Currency_usd),,,0.98016,,,,,
2,(Editor Choice_False),(Ad Supported_True),,,0.9584,,,,,
3,(Ad Supported_True),(Editor Choice_False),,,0.9584,,,,,
4,(PriceCategorized_Very Low),(Ad Supported_True),,,0.94272,,,,,
5,(Ad Supported_True),(PriceCategorized_Very Low),,,0.94272,,,,,
10,(Currency_usd),"(Editor Choice_False, Ad Supported_True)",,,0.93856,,,,,
13,(Currency_usd),(Editor Choice_False),,,0.93856,,,,,
12,(Editor Choice_False),(Currency_usd),,,0.93856,,,,,
11,(Ad Supported_True),"(Editor Choice_False, Currency_usd)",,,0.93856,,,,,
