# Content
0. Initialisation
1. Calculate support, confidence and lift for most read articles
2. Test out different support values for the apriori analysis
3. Conclusion: Final support value


# 0. Initialisation

In [11]:
import logging
import os
import sys
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt
from itertools import combinations

## Create products bought together
## https://towardsdatascience.com/apriori-association-rule-mining-explanation-and-python-implementation-290b42afdfc6

In [12]:
# functions for calculating support
def support_a_and_b(df,article_id_a,article_id_b):
    """Return the joint support of two articles.

    The joint support is the share of rows of ``df`` in which both the
    ``article_id_a`` column and the ``article_id_b`` column equal ``True``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one column per article id.
    article_id_a, article_id_b : hashable
        Column labels of the two articles.

    Returns
    -------
    float
        Fraction between 0 and 1. ``0.0`` is returned for a frame without
        rows instead of raising ``ZeroDivisionError``.

    Raises
    ------
    KeyError
        If one of the article ids is not a column of ``df``.
    """
    total_rows = len(df)
    if total_rows == 0:
        # No rows at all: nothing can support the pair.
        return 0.0
    # Count the matching rows directly on the mask instead of materialising a filtered copy.
    both_read = df[article_id_a].eq(True) & df[article_id_b].eq(True)
    return int(both_read.sum()) / total_rows

In [13]:
# functions for calculation frequent item sets
def return_df_for_apriori(_df):
    """Build the boolean user x article frame used as apriori input.

    ``_df`` needs the columns ``user_id``, ``base_article_id`` and
    ``original_timestamp``. The result has one row per ``user_id`` and one
    column per ``base_article_id``; a cell is ``True`` when the pivot counted
    at least one timestamp for that combination and ``False`` otherwise.
    """
    read_counts = pd.pivot_table(
        _df,
        index='user_id',
        columns='base_article_id',
        values='original_timestamp',
        aggfunc='count',
    )
    # Combinations that never occur come out of the pivot as NaN; treat them as 0 reads.
    return read_counts.fillna(0).astype(bool)

def return_freq_items_of_more_than_1(_df,min_support=0.05):
    """Return the frequent item sets that contain more than one item.

    Runs ``apriori`` on ``_df`` with the given ``min_support`` (column names
    are used as item labels) and keeps only the rows whose ``itemsets`` entry
    has a length above 1. The index of the result is reset.
    """
    itemsets_df = apriori(_df, min_support=min_support, use_colnames=True)
    has_several_items = itemsets_df['itemsets'].map(len) > 1
    return itemsets_df[has_several_items].reset_index(drop=True)

In [15]:
# Load the reduced evidence log from CSV and preview the first rows.
# Make sure to run analysis_of_evidence_log first to create the newest version of the reduced evidence_log.
reduced_evidence_log_df = pd.read_csv('reduced_evidence_log.csv')
reduced_evidence_log_df.head()

Unnamed: 0,base_article_id,base_article_slug,user_id,original_timestamp,month_year
0,001db2b8-07ed-4811-a72e-3e992cada878,ved-du-hvad-aeglosningsfasen-indeholder-iui,00CE41ED-45EA-4C34-9148-F9B7027F71F0,2022-02-23 12:19:27+00:00,2022-02
1,001db2b8-07ed-4811-a72e-3e992cada878,ved-du-hvad-aeglosningsfasen-indeholder-iui,05D19244-6CF2-4970-AC92-30FD92756668,2021-12-31 21:52:43+00:00,2021-12
2,001db2b8-07ed-4811-a72e-3e992cada878,ved-du-hvad-aeglosningsfasen-indeholder-iui,0899017E-BDCE-4A39-BF61-469117EF24E7,2022-06-27 09:47:39+00:00,2022-06
3,001db2b8-07ed-4811-a72e-3e992cada878,ved-du-hvad-aeglosningsfasen-indeholder-iui,0A0FA8AF-F49A-4880-B58B-188F2C565CB2,2022-04-08 22:25:20+00:00,2022-04
4,001db2b8-07ed-4811-a72e-3e992cada878,ved-du-hvad-aeglosningsfasen-indeholder-iui,0B8F8784-5D18-4CF9-A32D-2C2CB5CFD397,2022-03-01 14:31:33+00:00,2022-03


In [16]:
# Number of logged reads per article (id + slug), stored in the column 'occurences'.
articles_read_df = (
    reduced_evidence_log_df
    .groupby(['base_article_id', 'base_article_slug'])['original_timestamp']
    .count()
    .reset_index()
    .rename(columns={'original_timestamp': 'occurences'})
)
articles_read_df.head()

Unnamed: 0,base_article_id,base_article_slug,occurences
0,001db2b8-07ed-4811-a72e-3e992cada878,ved-du-hvad-aeglosningsfasen-indeholder-iui,86
1,010aa277-01f4-4698-a91c-1b4456fd5921,facts-om-nedregulering,12
2,0178bdc9-01b4-453e-9075-79a3d38cd89b,vi-har-spurgt-vores-community-om-okonomi-og-fe...,59
3,01cdaaba-f150-44b6-87bb-ac098b12f752,how-can-you-improve-your-own-chances-of-pregnancy,1
4,024bc19c-b7ac-477e-824e-fe5f9690583e,hvem-kan-fa-tilbudt-insemination-med-partners-...,40


In [17]:
# Number of logged reads per user, most active users first.
articles_read_by_user_df = (
    reduced_evidence_log_df
    .groupby('user_id')['base_article_id']
    .count()
    .reset_index()
    .rename(columns={'base_article_id': 'number_of_articles_read_per_user'})
    .sort_values(by='number_of_articles_read_per_user', ascending=False)
)

articles_read_by_user_df.describe()

Unnamed: 0,number_of_articles_read_per_user
count,1599.0
mean,8.181989
std,10.912843
min,1.0
25%,2.0
50%,4.0
75%,10.0
max,149.0


# 1. Analysing support and confidence for most read articles

* Support for article a refers to the popularity of an item and can be calculated by finding the number of transactions containing a particular item divided by the total number of transactions.

support(article_a) = (unique users having read article a) / (total number of unique users)

* Support for article a and b refers to the popularity of a and b being read together, i.e. by the same user

support(article_a and article_b) = (unique users having read article a and article b) / (total number of unique users)

* Confidence for article a and b refers to the likelihood that article b is also read if article a is read. It can be calculated by finding the number of transactions where a and b are read by the same user, divided by the total number of transactions where a is read.

confidence(article_a => article_b) = support(article_a and article_b) / support(article_a)

Source: http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

In [19]:
# Average expected support: mean number of logged reads per article divided by the number of unique users
articles_read_df.occurences.mean()/reduced_evidence_log_df.user_id.nunique()

0.012744530752280924

In [7]:
# Support and confidence for every pair among the 5 most read articles
user_transactions_df = return_df_for_apriori(reduced_evidence_log_df)
n_users = len(user_transactions_df)

# All unordered pairs (a, b) of the five most read article ids; the sort is done once.
top5_article_ids = articles_read_df.sort_values(by='occurences',ascending=False)[0:5].base_article_id
cc = list(combinations(top5_article_ids,2))
top5_df = pd.DataFrame(cc,columns=['article_id_a','article_id_b'])

# Attach slug and read count for each side of the pair.
top5_df = top5_df.merge(articles_read_df,left_on='article_id_a',right_on='base_article_id').drop(columns=['base_article_id']).rename(columns={'base_article_slug':'slug_a','occurences':'occurences_a'})
top5_df = top5_df.merge(articles_read_df,left_on='article_id_b',right_on='base_article_id').drop(columns=['base_article_id']).rename(columns={'base_article_slug':'slug_b','occurences':'occurences_b'})

# All supports are expressed in percent of the rows of user_transactions_df.
top5_df['support_a'] = top5_df['occurences_a']/n_users*100
top5_df['support_b'] = top5_df['occurences_b']/n_users*100
top5_df['support_a_and_b'] = top5_df.apply(lambda row: support_a_and_b(user_transactions_df,row['article_id_a'],row['article_id_b']), axis=1)*100
# Both operands are already percentages, so the ratio is multiplied by 100 to get the confidence in percent too.
top5_df['confidence_a_and_b'] = top5_df['support_a_and_b']/top5_df['support_a']*100

top5_df = top5_df[[
    'article_id_a', 'article_id_b', 'slug_a', 'slug_b',
    'occurences_a', 'occurences_b', 'support_a', 'support_b',
    'support_a_and_b', 'confidence_a_and_b',
]]
top5_df


Unnamed: 0,article_id_a,article_id_b,slug_a,slug_b,occurences_a,occurences_b,support_a,support_b,support_a_and_b,confidence_a_and_b
0,4c36389a-ede4-4da2-a5c8-661c2fb6b977,e6e6b791-e051-44a6-a8b3-2a289a0953d5,overblik-over-medicinkategorier-og-hvad-de-gor...,rad-til-parforhold-i-fertilitetsbehandling,298,203,18.636648,12.695435,3.689806,19.798658
1,4c36389a-ede4-4da2-a5c8-661c2fb6b977,833d240c-73fc-4b4f-a814-ef9bb96e9d2f,overblik-over-medicinkategorier-og-hvad-de-gor...,disse-begivenheder-skal-du-igennem-i-fertilite...,298,196,18.636648,12.257661,3.43965,18.456376
2,e6e6b791-e051-44a6-a8b3-2a289a0953d5,833d240c-73fc-4b4f-a814-ef9bb96e9d2f,rad-til-parforhold-i-fertilitetsbehandling,disse-begivenheder-skal-du-igennem-i-fertilite...,203,196,12.695435,12.257661,2.501563,19.704433
3,4c36389a-ede4-4da2-a5c8-661c2fb6b977,28a5d387-5f30-42a4-8959-8a24df37ba15,overblik-over-medicinkategorier-og-hvad-de-gor...,lutealfasen-kan-foles-lang-fordi-du-haber-pa-e...,298,195,18.636648,12.195122,6.066291,32.550336
4,e6e6b791-e051-44a6-a8b3-2a289a0953d5,28a5d387-5f30-42a4-8959-8a24df37ba15,rad-til-parforhold-i-fertilitetsbehandling,lutealfasen-kan-foles-lang-fordi-du-haber-pa-e...,203,195,12.695435,12.195122,2.689181,21.182266
5,833d240c-73fc-4b4f-a814-ef9bb96e9d2f,28a5d387-5f30-42a4-8959-8a24df37ba15,disse-begivenheder-skal-du-igennem-i-fertilite...,lutealfasen-kan-foles-lang-fordi-du-haber-pa-e...,196,195,12.257661,12.195122,3.189493,26.020408
6,4c36389a-ede4-4da2-a5c8-661c2fb6b977,66ce674e-25e7-444d-943d-f357a4b376ab,overblik-over-medicinkategorier-og-hvad-de-gor...,statistik-hvad-er-sandsynligheden-for-graviditet,298,170,18.636648,10.631645,4.752971,25.503356
7,e6e6b791-e051-44a6-a8b3-2a289a0953d5,66ce674e-25e7-444d-943d-f357a4b376ab,rad-til-parforhold-i-fertilitetsbehandling,statistik-hvad-er-sandsynligheden-for-graviditet,203,170,12.695435,10.631645,1.563477,12.315271
8,833d240c-73fc-4b4f-a814-ef9bb96e9d2f,66ce674e-25e7-444d-943d-f357a4b376ab,disse-begivenheder-skal-du-igennem-i-fertilite...,statistik-hvad-er-sandsynligheden-for-graviditet,196,170,12.257661,10.631645,2.439024,19.897959
9,28a5d387-5f30-42a4-8959-8a24df37ba15,66ce674e-25e7-444d-943d-f357a4b376ab,lutealfasen-kan-foles-lang-fordi-du-haber-pa-e...,statistik-hvad-er-sandsynligheden-for-graviditet,195,170,12.195122,10.631645,3.689806,30.25641


In [8]:
# Comparing with the mlxtend algorithm to ensure that we get the same numbers
user_transactions_df = return_df_for_apriori(reduced_evidence_log_df)
freq_items = apriori(user_transactions_df, min_support=0.03, use_colnames=True)
rules = association_rules(freq_items, metric="confidence", min_threshold=0.03)

# Turn the item sets into plain strings of article ids. Joining the members (instead of slicing
# the text of the set's repr) also gives a readable value for sets with several items, and the
# isinstance guard leaves already converted values untouched when the cell is run again.
for side in ['antecedents', 'consequents']:
    rules[side] = rules[side].apply(
        lambda items: ', '.join(sorted(map(str, items))) if isinstance(items, (set, frozenset)) else items
    )

rules[rules['consequents']=='e6e6b791-e051-44a6-a8b3-2a289a0953d5']

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
36,4c36389a-ede4-4da2-a5c8-661c2fb6b977,e6e6b791-e051-44a6-a8b3-2a289a0953d5,0.186366,0.126954,0.036898,0.197987,1.55951,0.013238,1.088567


#### Take aways

In [9]:
# Spell out the figures of the first pair (row 0 of top5_df) in plain language.
first_slug_a = top5_df.loc[0, 'slug_a']
first_slug_b = top5_df.loc[0, 'slug_b']
first_reads_a = top5_df.loc[0, 'occurences_a']
first_reads_b = top5_df.loc[0, 'occurences_b']
first_support_a = round(top5_df.loc[0, 'support_a'], 2)
first_support_b = round(top5_df.loc[0, 'support_b'], 2)
first_joint_support = round(top5_df.loc[0, 'support_a_and_b'], 2)
first_confidence = round(top5_df.loc[0, 'confidence_a_and_b'], 2)

print(f"{first_reads_a} unique users has read '{first_slug_a}', and {first_reads_b} unique users has read '{first_slug_b}'")
print(f"The support for '{first_slug_a}' is: {first_support_a}, while the support for '{first_slug_b}' is {first_support_b}")
print(f"The support for '{first_slug_a}' and '{first_slug_b}' is {first_joint_support}, meaning that {first_joint_support} % of all unique users have read both articles.")
print(f"The confidence that the user will read '{first_slug_b}' given that a user has read '{first_slug_a}' is {first_confidence} %")

298 unique users has read 'overblik-over-medicinkategorier-og-hvad-de-gor-i-din-krop', and 203 unique users has read 'rad-til-parforhold-i-fertilitetsbehandling'
The support for 'overblik-over-medicinkategorier-og-hvad-de-gor-i-din-krop' is: 18.64, while the support for 'rad-til-parforhold-i-fertilitetsbehandling' is 12.7
The support for 'overblik-over-medicinkategorier-og-hvad-de-gor-i-din-krop' and 'rad-til-parforhold-i-fertilitetsbehandling' is 3.69, meaning that 3.69 % of all unique users have read both articles.
The confidence that the user will read 'rad-til-parforhold-i-fertilitetsbehandling' given that a user has read 'overblik-over-medicinkategorier-og-hvad-de-gor-i-din-krop' is 19.8 %


In [10]:
# Pair(s) with the highest joint support among the top 5 articles
highest_support = top5_df['support_a_and_b'].max()
max_support_df = top5_df[top5_df['support_a_and_b'].eq(highest_support)].reset_index()
print(f"Maximum support for two articles are for {round(max_support_df.loc[0,'support_a_and_b'],2)} %, and that is for articles '{max_support_df.loc[0,'slug_a']}' and '{max_support_df.loc[0,'slug_b']}'")

# Pair(s) with the lowest joint support among the top 5 articles
lowest_support = top5_df['support_a_and_b'].min()
min_support_df = top5_df[top5_df['support_a_and_b'].eq(lowest_support)].reset_index()
print(f"Minimum support for two articles in top 5 are for {round(min_support_df.loc[0,'support_a_and_b'],2)} %, and that is for articles '{min_support_df.loc[0,'slug_a']}' and '{min_support_df.loc[0,'slug_b']}'")

Maximum support for two articles are for 6.07 %, and that is for articles 'overblik-over-medicinkategorier-og-hvad-de-gor-i-din-krop' and 'lutealfasen-kan-foles-lang-fordi-du-haber-pa-en-graviditet'
Minimum support for two articles in top 5 are for 1.56 %, and that is for articles 'rad-til-parforhold-i-fertilitetsbehandling' and 'statistik-hvad-er-sandsynligheden-for-graviditet'


In [11]:
# Pair(s) with the highest confidence among the top 5 articles
highest_confidence = top5_df['confidence_a_and_b'].max()
max_confidence_df = top5_df[top5_df['confidence_a_and_b'].eq(highest_confidence)].reset_index()
print(f"Maximum confidence for two articles are for {round(max_confidence_df.loc[0,'confidence_a_and_b'],2)} %, and that is for articles '{max_confidence_df.loc[0,'slug_a']}' and '{max_confidence_df.loc[0,'slug_b']}'")

# Pair(s) with the lowest confidence among the top 5 articles
lowest_confidence = top5_df['confidence_a_and_b'].min()
min_confidence_df = top5_df[top5_df['confidence_a_and_b'].eq(lowest_confidence)].reset_index()
print(f"Minimum confidence for two articles in top 5 are for {round(min_confidence_df.loc[0,'confidence_a_and_b'],2)} %, and that is for articles '{min_confidence_df.loc[0,'slug_a']}' and '{min_confidence_df.loc[0,'slug_b']}'")

Maximum confidence for two articles are for 32.55 %, and that is for articles 'overblik-over-medicinkategorier-og-hvad-de-gor-i-din-krop' and 'lutealfasen-kan-foles-lang-fordi-du-haber-pa-en-graviditet'
Minimum confidence for two articles in top 5 are for 12.32 %, and that is for articles 'rad-til-parforhold-i-fertilitetsbehandling' and 'statistik-hvad-er-sandsynligheden-for-graviditet'


#### Conclusion

If we want to be able to recommend more than just the top 5 articles, it seems we need to go for a support level below 1.5%. Hence, we will now test which item sets we get for support levels between 0.5% and 1.5% to decide which level we want to use in the model.

# 2. Find best support and filter for the apriori analysis

We examine the number of rules we get with different support and confidence levels for three types of data sets:
* One where we don't change data
* One where we only look at users that have read more than one article
* One where we only look at recent articles

In [12]:
## For all articles: count the rules produced by each (min_support, min_confidence) combination
user_transactions_df = return_df_for_apriori(reduced_evidence_log_df)
number_of_rules_records = []
for support in np.arange(0.01,0.06,0.01):
    # The frequent item sets only depend on the support threshold, so they are mined once per support value.
    freq_items_df = apriori(user_transactions_df, min_support=support, use_colnames=True)
    for confidence in np.arange(0.1,0.6,0.1):
        if freq_items_df.empty:
            # No frequent item sets at this support level: record 0 rules without calling association_rules.
            number_of_rules = 0
        else:
            rules = association_rules(freq_items_df, metric="confidence", min_threshold=confidence)
            number_of_rules = len(rules)
        number_of_rules_records.append({'min_support':support,'min_confidence':confidence,'number_of_rules':number_of_rules})

# Build the frame once from the collected records; this also gives every row its own index label.
number_df = pd.DataFrame(number_of_rules_records, columns=['min_support','min_confidence','number_of_rules'])
number_df


Unnamed: 0,min_support,min_confidence,number_of_rules
0,0.01,0.1,4830
0,0.01,0.2,3678
0,0.01,0.3,2748
0,0.01,0.4,2034
0,0.01,0.5,1504
0,0.02,0.1,266
0,0.02,0.2,200
0,0.02,0.3,143
0,0.02,0.4,100
0,0.02,0.5,68


In [15]:
# Association rules at a minimum confidence threshold of 0.3.
# NOTE(review): `freq_items_df` is not computed in this cell; it is whatever an earlier cell left behind - confirm this is intended.
rules = association_rules(freq_items_df, metric="confidence", min_threshold=0.3)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(252daca6-049e-449d-8593-e8f9cb7bc837),(4c36389a-ede4-4da2-a5c8-661c2fb6b977),0.094434,0.186366,0.054409,0.576159,3.091537,0.03681,1.919667
1,(4c36389a-ede4-4da2-a5c8-661c2fb6b977),(28a5d387-5f30-42a4-8959-8a24df37ba15),0.186366,0.121951,0.060663,0.325503,2.669128,0.037935,1.301784
2,(28a5d387-5f30-42a4-8959-8a24df37ba15),(4c36389a-ede4-4da2-a5c8-661c2fb6b977),0.121951,0.186366,0.060663,0.497436,2.669128,0.037935,1.618965


### Users that have read more than a minimum number of articles (minimum between 4 and 11)

In [59]:
# We see that 50% of all users have read 4 or more articles - therefore we experiment with smaller sets where users have read more than 4 articles
articles_read_by_user_df.describe()

Unnamed: 0,number_of_articles_read_per_user
count,1599.0
mean,8.181989
std,10.912843
min,1.0
25%,2.0
50%,4.0
75%,10.0
max,149.0


In [16]:
def number_of_recommendations_per_user(df, rules):
    """Count, for every user, the rules that would produce a recommendation.

    A rule counts for a user when its ``antecedents`` value is one of the
    articles the user has in ``df`` and its ``consequents`` value is not.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``user_id`` and ``base_article_id``.
    rules : pandas.DataFrame
        Needs the columns ``antecedents`` and ``consequents``, holding values
        that can be compared with the entries of ``base_article_id``.

    Returns
    -------
    pandas.DataFrame
        One row per user, in order of first appearance in ``df``, with the
        columns ``user_id``, ``unique_articles`` (number of ``df`` rows of the
        user; duplicates are not removed) and ``number_of_recommendations``.
        An empty ``df`` gives an empty frame that still has these columns.
        Rows whose ``user_id`` is missing are skipped by the grouping.
    """
    result_columns = ['user_id', 'unique_articles', 'number_of_recommendations']
    records = []
    # A single groupby pass replaces re-filtering the whole frame once per user.
    for user_id, user_articles in df.groupby('user_id', sort=False)['base_article_id']:
        read_articles = list(user_articles)
        already_read_antecedent = rules['antecedents'].isin(read_articles)
        unread_consequent = ~rules['consequents'].isin(read_articles)
        records.append({
            'user_id': user_id,
            'unique_articles': len(read_articles),
            'number_of_recommendations': int((already_read_antecedent & unread_consequent).sum()),
        })

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(records, columns=result_columns)
    

In [17]:
# Count the rules per support level when only the more active users are kept
confidence = 0.4

# Number of log rows per user, aligned with the log; it does not change inside the loop, so compute it once.
reads_per_user = reduced_evidence_log_df.groupby(['user_id'])['base_article_id'].transform('count')
min_number_of_articles_records = []

for min_number_of_articles_read in np.arange(4,12,1):
    print(min_number_of_articles_read)
    # Strict comparison: a user is kept only with MORE than `min_number_of_articles_read` log rows.
    _df = reduced_evidence_log_df[reads_per_user > min_number_of_articles_read]
    user_transactions_df = return_df_for_apriori(_df)
    for support in np.arange(0.05,0.11,0.01):
        freq_items_df = apriori(user_transactions_df, min_support=support, use_colnames=True)
        if freq_items_df.empty:
            # No frequent item sets at this support level: record 0 rules without calling association_rules.
            number_of_rules = 0
        else:
            rules = association_rules(freq_items_df, metric="confidence", min_threshold=confidence)
            number_of_rules = len(rules)
        min_number_of_articles_records.append({
            'min_number_of_articles_read': min_number_of_articles_read,
            'min_support': support,
            'min_confidence': confidence,
            'number_of_rules': number_of_rules,
        })

# Build the frame once from the collected records; this also gives every row its own index label.
min_number_of_articles_number_df = pd.DataFrame(
    min_number_of_articles_records,
    columns=['min_number_of_articles_read','min_support','min_confidence','number_of_rules'],
)
min_number_of_articles_number_df


4
5
6
7
8
9
10
11


Unnamed: 0,min_number_of_articles_read,min_support,min_confidence,number_of_rules
0,4,0.05,0.4,51
0,4,0.06,0.4,21
0,4,0.07,0.4,7
0,4,0.08,0.4,4
0,4,0.09,0.4,3
0,4,0.1,0.4,2
0,5,0.05,0.4,70
0,5,0.06,0.4,34
0,5,0.07,0.4,16
0,5,0.08,0.4,8


In [63]:
# 3. How many recommendations do we give?

In [39]:
# Turn the item sets of `rules` into plain strings of article ids. Joining the members (instead of
# slicing the text of the set's repr) also gives a readable value for sets with several items, and
# the isinstance guard leaves already converted values untouched when the cell is run again.
for side in ['antecedents', 'consequents']:
    rules[side] = rules[side].apply(
        lambda items: ', '.join(sorted(map(str, items))) if isinstance(items, (set, frozenset)) else items
    )

In [49]:
# Worked example for one user: rules whose antecedent the user has read, rules whose
# consequent the user has not read, and finally the rules that satisfy both conditions.
user_id = '00CE41ED-45EA-4C34-9148-F9B7027F71F0'
unique_articles_list = list(reduced_evidence_log_df.loc[reduced_evidence_log_df['user_id']==user_id, 'base_article_id'])
has_read_antecedent = rules['antecedents'].isin(unique_articles_list)
has_not_read_consequent = ~rules['consequents'].isin(unique_articles_list)
print(f'{len(unique_articles_list)=}')
print(len(rules[has_read_antecedent]))
print(len(rules[has_not_read_consequent]))
len(rules[has_read_antecedent & has_not_read_consequent])

len(unique_articles_list)=41
44
88


26

In [55]:
# Compute the per-user recommendation counts here: the frame was never assigned at notebook level,
# so this cell raised a NameError on a fresh kernel.
number_of_recommendations_per_user_df = number_of_recommendations_per_user(reduced_evidence_log_df, rules)
number_of_recommendations_per_user_df.describe()

Unnamed: 0,unique_articles,number_of_recommendations
count,1599.0,1599.0
mean,8.181989,9.699812
std,10.912843,13.303756
min,1.0,0.0
25%,2.0,0.0
50%,4.0,3.0
75%,10.0,16.0
max,149.0,48.0


In [56]:
# Per-user table: user id, number of articles read and number of recommendations
number_of_recommendations_per_user_df

Unnamed: 0,user_id,unique_articles,number_of_recommendations
0,00CE41ED-45EA-4C34-9148-F9B7027F71F0,41,26
0,05D19244-6CF2-4970-AC92-30FD92756668,77,28
0,0899017E-BDCE-4A39-BF61-469117EF24E7,15,39
0,0A0FA8AF-F49A-4880-B58B-188F2C565CB2,15,22
0,0B8F8784-5D18-4CF9-A32D-2C2CB5CFD397,70,31
...,...,...,...
0,b2b1cacf-ea6f-458a-9a15-04f92d4e9d84,1,0
0,2246099E-EADF-495E-B6CB-AF676F9BE365,1,0
0,17686624-d8bd-4c00-8f32-362c8d53ec27,1,7
0,EECCEFDF-83EC-42D8-9C00-CDBA61DEF343,1,7
