In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from apyori import apriori         #For association rule mining using apriori
from prefixspan import PrefixSpan  #For sequence pattern mining using prefixspan
import csv
from collections import Counter, defaultdict

from surprise import reader, Dataset, accuracy        #For fund recommendations
from surprise.model_selection import KFold, cross_validate
from surprise import SVD,SVDpp, SlopeOne, NMF, NormalPredictor, BaselineOnly, CoClustering
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline

In [2]:
import warnings
warnings.filterwarnings('ignore')

### Summary
In this notebook we are only considering the funds. <br>
Note that there are 3 types of credits that can be donated to the Leuven Univerity Fund (LUF), namely <b>Funds</b>, <b>Chairs</b>, & <b>Other Credits</b>. <br>
There are <b>242</b> different funds that can be donated to the LUF. <br>
Some Donors (Business Partners) donate only to a single fund, some donate to multiple funds, while others don't donate to funds but instead they donate to chairs and other credits.<br><br>
Three main outcomes from this notebook:<br>
-  Find the <i>associations</i> between funds<br>
-  Find the <i>sequence</i> of fund donations<br>
-  <i>Recommend</i> funds for donors<br>

<br>
Note: Sometimes two or more donors make a joint donation to the LUF. For the tasks carried out in this notebook, these joint donations are treated separately. As in, suppose if Donor A & Donor B made a joint donation to Fund F, then it is considered that both Donor A & Donor B have made a separate donation to Fund F.<br><br>

The data for this notebook is generated using MySQL, and is named question3.csv. The file contains the list of donors who have donated for various funds. <br>
The main features for this table are: <br>
-  <b>PARTNER</b>: Contains the donor id<br>
-  <b>ZZAMNT_PAYED</b>: Contains the donation sum<br>
-  <b>ORDER_GUID</b>: Contains the id of the order line of the donation<br>
-  <b>CREDIT_DESCRIPTION</b>: Contains the name of the fund<br>
-  <b>ZLUF_DOCDATE</b>: Contains the date of donation<br>
<br>
ZZAMNT_PAYED doesn't take into account of joint donation. As in, both Donor A & Donor B donates 1000 Euro for Fund F, the data shows that Donor A donated 1000 Euro for Fund F and donor B donoated 1000 Euro for Fund F. <br>
This needs to be normalized, and that is done using the variable <b>ZZAMNT_PAYED_NORM</b>.

In [3]:
funds = pd.read_csv('sqldata/question3.csv',dtype = str)
print(funds.shape)
funds.head()

(88735, 16)


Unnamed: 0,PARTNER,ZZGEBOORTEJAAR,TYPE,BIRTHPL,BU_LANGU,NATIO,POST_CODE,COUNTRY,TITLE_ACADEMDIC,TITLE,ORDER_GUID,ZZAMNT_PAYED,ZLUF_DOCDATE,ZLUF_DANKBRF,CREDIT_CATEGORY,CREDIT_DESCRIPTION
0,200000007,0,2,,,,3000,BE,,,2294EB49A90B1ED5B8D110AB54488A59,1250.0,2015-06-16 00:00:00,2016-03-04 00:00:00,1,Farmaleuven Fonds
1,200000007,0,2,,,,3000,BE,,,2294EB49A90B1ED5B8D0F1F52ABF8A59,4399.1,2015-04-22 00:00:00,2016-03-04 00:00:00,1,Farmaleuven Fonds
2,200000007,0,2,,,,3000,BE,,,2294EB49A90B1ED5B8D08835DB52CA59,5750.0,2014-11-26 00:00:00,2014-12-17 00:00:00,1,Farmaleuven Fonds
3,200000007,0,2,,,,3000,BE,,,2294EB49A90B1ED5B8CF6038F7B08A59,11500.0,2013-06-17 00:00:00,2013-08-19 00:00:00,1,Farmaleuven Fonds
4,200000010,0,2,,,,3001,BE,,,2294EB49A90B1ED5B8D006AE95940A59,500.0,2014-06-17 00:00:00,2014-07-14 00:00:00,1,Science@Leuven Fund


In [4]:
#Check for inconsistency among the data. One inconsistency could be if the donation sum is less than 0
funds['ZZAMNT_PAYED'] = funds['ZZAMNT_PAYED'].astype('float64')
funds[funds['ZZAMNT_PAYED'] < 0]

Unnamed: 0,PARTNER,ZZGEBOORTEJAAR,TYPE,BIRTHPL,BU_LANGU,NATIO,POST_CODE,COUNTRY,TITLE_ACADEMDIC,TITLE,ORDER_GUID,ZZAMNT_PAYED,ZLUF_DOCDATE,ZLUF_DANKBRF,CREDIT_CATEGORY,CREDIT_DESCRIPTION
84685,201222693,0,1,,N,,8200,BE,,2,2294EB49A90D1ED7A78F46F0734B400F,-675.0,2017-08-10 00:00:00,,1,Fonds Digestieve Oncologie
84686,201222694,0,1,,,,8200,BE,,1,2294EB49A90D1ED7A78F46F0734B400F,-675.0,2017-08-10 00:00:00,,1,Fonds Digestieve Oncologie


As you can see there are two entries where donation amount is -675 Euro. This is also a joint donation. This data needs to be removed.

In [5]:
funds = funds[funds['ZZAMNT_PAYED'] >= 0]
funds.reset_index(drop=True,inplace=True)
print(funds.shape)
funds.head()

(88733, 16)


Unnamed: 0,PARTNER,ZZGEBOORTEJAAR,TYPE,BIRTHPL,BU_LANGU,NATIO,POST_CODE,COUNTRY,TITLE_ACADEMDIC,TITLE,ORDER_GUID,ZZAMNT_PAYED,ZLUF_DOCDATE,ZLUF_DANKBRF,CREDIT_CATEGORY,CREDIT_DESCRIPTION
0,200000007,0,2,,,,3000,BE,,,2294EB49A90B1ED5B8D110AB54488A59,1250.0,2015-06-16 00:00:00,2016-03-04 00:00:00,1,Farmaleuven Fonds
1,200000007,0,2,,,,3000,BE,,,2294EB49A90B1ED5B8D0F1F52ABF8A59,4399.1,2015-04-22 00:00:00,2016-03-04 00:00:00,1,Farmaleuven Fonds
2,200000007,0,2,,,,3000,BE,,,2294EB49A90B1ED5B8D08835DB52CA59,5750.0,2014-11-26 00:00:00,2014-12-17 00:00:00,1,Farmaleuven Fonds
3,200000007,0,2,,,,3000,BE,,,2294EB49A90B1ED5B8CF6038F7B08A59,11500.0,2013-06-17 00:00:00,2013-08-19 00:00:00,1,Farmaleuven Fonds
4,200000010,0,2,,,,3001,BE,,,2294EB49A90B1ED5B8D006AE95940A59,500.0,2014-06-17 00:00:00,2014-07-14 00:00:00,1,Science@Leuven Fund


In [6]:
#Normalize the donation sum (ZZAMNT_PAYED)
#First get the count of how many times a particular ORDER_GUID is used & store it in ORDER_GUID_COUNT.
#Divide the ZZAMNT_PAYED by ORDER_GUID_COUNT to get normalized donation (ZZAMNT_PAYED_NORM)
df = pd.DataFrame.from_dict(Counter((funds['ORDER_GUID'])), orient='index').reset_index()
df = df.rename(columns={'index':'ORDER_GUID', 0:'ORDER_GUID_COUNT'})
funds = pd.merge(funds,df,on='ORDER_GUID',how='inner')
funds['ORDER_GUID_COUNT'] = funds['ORDER_GUID_COUNT'].astype('int32')
print(funds['ZZAMNT_PAYED'].sum())
funds['ZZAMNT_PAYED_NORM'] = funds['ZZAMNT_PAYED']/funds['ORDER_GUID_COUNT']
print(funds['ZZAMNT_PAYED_NORM'].sum()) 
print(funds.shape)
funds.head()

39854977.82
34427998.190000005
(88733, 18)


Unnamed: 0,PARTNER,ZZGEBOORTEJAAR,TYPE,BIRTHPL,BU_LANGU,NATIO,POST_CODE,COUNTRY,TITLE_ACADEMDIC,TITLE,ORDER_GUID,ZZAMNT_PAYED,ZLUF_DOCDATE,ZLUF_DANKBRF,CREDIT_CATEGORY,CREDIT_DESCRIPTION,ORDER_GUID_COUNT,ZZAMNT_PAYED_NORM
0,200000007,0,2,,,,3000,BE,,,2294EB49A90B1ED5B8D110AB54488A59,1250.0,2015-06-16 00:00:00,2016-03-04 00:00:00,1,Farmaleuven Fonds,1,1250.0
1,200000007,0,2,,,,3000,BE,,,2294EB49A90B1ED5B8D0F1F52ABF8A59,4399.1,2015-04-22 00:00:00,2016-03-04 00:00:00,1,Farmaleuven Fonds,1,4399.1
2,200000007,0,2,,,,3000,BE,,,2294EB49A90B1ED5B8D08835DB52CA59,5750.0,2014-11-26 00:00:00,2014-12-17 00:00:00,1,Farmaleuven Fonds,1,5750.0
3,200000007,0,2,,,,3000,BE,,,2294EB49A90B1ED5B8CF6038F7B08A59,11500.0,2013-06-17 00:00:00,2013-08-19 00:00:00,1,Farmaleuven Fonds,1,11500.0
4,200000010,0,2,,,,3001,BE,,,2294EB49A90B1ED5B8D006AE95940A59,500.0,2014-06-17 00:00:00,2014-07-14 00:00:00,1,Science@Leuven Fund,1,500.0


Without normalizing the donation sum, you see that the LUF received 39,854,977.82 Euro from different funds. <br>
After normaling the donation sum, you can see that the LUF <i>actually</i> received <b>34,427,998.19 Euro</b> from different funds.

In [7]:
len(set(funds['PARTNER']))

44116

There are 56,897 different donors who have donated to LUF since 2002.<br>
Among them, 44,116 different donors have donoted for LUF funds since 2002.

In [8]:
funds['CREDIT_DESCRIPTION'].value_counts()[:10] #List of 10 most popular funds among the donors

Zuster Jeanne Devos Fonds             15242
Kinderkankerfonds Leuven              11938
Bibliotheekfonds                       4915
Fonds Sofhea Cruh                      4533
Fonds Een hart voor ALS                3753
Rondoufonds voor Duchenneonderzoek     2823
Bone4Kids Fonds                        2803
SOLFA Fonds                            2772
Leuvens kankerinstituut                2434
Fonds Kiwanis Club Leuven              2332
Name: CREDIT_DESCRIPTION, dtype: int64

In [9]:
funds['PARTNER'].value_counts()[:10] #List of top 10 donors who have donated the most number of times to LUF funds

0201087670    283
0201088307    198
0201063235    176
0201063236    175
0201103039    164
0200347493    143
0201048602    143
0201050374    141
0200011336    134
0201070509    132
Name: PARTNER, dtype: int64

In [10]:
funds['ZZAMNT_PAYED'].value_counts()[:5] #List of top 10 common amount of money which the donors donate to LUF funds

50.0     18213
40.0     11374
100.0    10335
30.0      6497
10.0      5106
25.0      4784
20.0      2721
5.0       2472
Name: ZZAMNT_PAYED, dtype: int64

In [11]:
#To get the aggregate values of a particular values in terms of Sum of Donation & Count of Donation
fund = funds[['CREDIT_DESCRIPTION','ZZAMNT_PAYED']]
f_sum = fund.groupby('CREDIT_DESCRIPTION').sum()
f_count = fund.groupby('CREDIT_DESCRIPTION').count()
f_count.columns = ['COUNT']
f = pd.merge(f_sum,f_count,on='CREDIT_DESCRIPTION',how='inner')
fund_agg = f.sort_values(['ZZAMNT_PAYED'], ascending=False)
print(fund_agg.shape)
fund_agg.head()

(221, 2)


Unnamed: 0_level_0,ZZAMNT_PAYED,COUNT
CREDIT_DESCRIPTION,Unnamed: 1_level_1,Unnamed: 2_level_1
Verelst Baarmoederkankerfonds,4726496.8,48
Kinderkankerfonds Leuven,3473980.79,11938
Zuster Jeanne Devos Fonds,2553586.42,15242
Initiatieven Theologie en Religie,1002910.9,1294
Nadine de Beauffort Fonds,955132.51,135


Among the 242 different funds that can be donated to LUF, the donors have donated to 221 of them.

# Associations between funds

### Association Rules

Association rule learning is a rule-based machine learning method for discovering interesting relations between variables in large databases. <br><br>
For example, the rule <b>{onions,potatoes} => {burger}</b> found in the sales data of a supermarket would indicate that if a customer buys onions and potatoes together, they are likely to also buy hamburger meat. <br>
Such information can be used as the basis for decisions about marketing activities such as, e.g., promotional pricing or product placements. <br>

Here only the associations between three funds is mined.<br>
Support (A,B,C) is the fraction of donors who donated to all Funds A, B & C. <br>
Confidence ({A,B} => {C}) is the liklihood of a donor donating to Fund C having donated to Funds A & B. <br>
Confidence ({A,B} => {C}) = Support (A,B) / Support (A,B,C) <br>

Here <b>Apriori</b> algorithm is used to find the associations between funds. <br>

For running the Apriori algorithm, data must be in a list of list format, like this <br>
<i>[<br>
['Fonds Herman Servotte', 'Fonds Roger Dillemans', 'Zuster Jeanne Devos Fonds', 'Fonds Em. prof. dr. A.L. Baert’], <br>
['Fonds Joseph Van de Wiele', 'Fonds Roger Dillemans', 'Fonds Kortrijk Sociaal en Internationaal', 'Kinderkankerfonds Leuven’],<br> 
['Fonds Roger Dillemans', 'Initiatieven Theologie en Religie', 'Zuster Jeanne Devos Fonds', 'Kinderkankerfonds Leuven’]<br>
]<br></i>

In the above example: <br>
Support (<i>'Fonds Roger Dillemans', 'Zuster Jeanne Devos Fonds’</i>) = 2 <br>
Support(<i>'Fonds Roger Dillemans', 'Zuster Jeanne Devos Fonds’ , 'Kinderkankerfonds Leuven’</i>) = 1 <br>
    Confidence(<i>'Fonds Roger Dillemans', 'Zuster Jeanne Devos Fonds’</i> => <i>'Kinderkankerfonds Leuven’</i>) = 1/2 <br>
<br>
For calculating association rules, we only take into account of the fund a donor donates without taking into account of the number of time a donor donates to a particular fund.

In [12]:
list_of_funds = sorted(list(set(funds['CREDIT_DESCRIPTION'])))
ar = funds[['PARTNER','CREDIT_DESCRIPTION']]
print('Before removing duplicates ');print(ar.shape)
print(ar.head())
#To remove duplicates
ar = ar.drop_duplicates() 
ar.reset_index(drop=True, inplace=True)
print('\nAfter removing duplicates ');print(ar.shape)
print(ar.head())

Before removing duplicates 
(88733, 2)
      PARTNER   CREDIT_DESCRIPTION
0  0200000007    Farmaleuven Fonds
1  0200000007    Farmaleuven Fonds
2  0200000007    Farmaleuven Fonds
3  0200000007    Farmaleuven Fonds
4  0200000010  Science@Leuven Fund

After removing duplicates 
(50026, 2)
      PARTNER      CREDIT_DESCRIPTION
0  0200000007       Farmaleuven Fonds
1  0200000010     Science@Leuven Fund
2  0200000024  KU Leuven Erfgoedfonds
3  0200000035   Fonds Roger Dillemans
4  0200000041  KU Leuven Erfgoedfonds


In [13]:
ar['PARTNER'].value_counts()[:5] #Top five donors who have donated to various different funds

0201103039    39
0201078447    34
0201083742    22
0200310511    22
0200310505    22
Name: PARTNER, dtype: int64

In [14]:
association_rules = pd.crosstab(index=ar.iloc[:,0], columns=ar.iloc[:,1], values=ar.iloc[:,1],aggfunc='sum')
association_rules['PARTNER'] = association_rules.index
association_rules.reset_index(drop=True, inplace=True)
association_rules = association_rules[list_of_funds]
print(association_rules.shape)
association_rules.tail()

(44116, 221)


CREDIT_DESCRIPTION,ADPKD Fonds,ALBERT Fonds voor betere levenskwaliteit,Arjan Fonds,Arne Loosveldt Fonds,BTV-Fonds voor darmkankeronderzoek,Bart Verbeeck Fonds,Belfius Financial Engineering Fonds,Beurzenfonds Informatiemanagement,Beurzenfonds clinical pharmacy,Bibliotheekfonds,...,Venceremos Fonds,Verelst Baarmoederkankerfonds,Vesalius Fonds Anatomisch Onderzoek,Vuylsteke-Flipts Fonds,WILL Fonds,Wilfried Martens Fonds,Zalmfonds,Zuster Jeanne Devos Fonds,fonds C.A.L.S.,project Magenta
44111,,,,,,,,,,,...,,,,,,,,,,
44112,,,,,,,,,,,...,,,,,,,,Zuster Jeanne Devos Fonds,,
44113,,,,,,,,,,,...,,,,,,,,,,
44114,,,,,,,,,,,...,,,,,,,,,,
44115,,,,,,,,,,,...,,,,,,,,,,


In [15]:
#convert dataframe to list of lists
association_rules_lofl = association_rules.values.tolist()
ar_data = []
for rule in association_rules_lofl:
    cleanedRule = [x for x in rule if str(x) != 'nan']
    ar_data.append(cleanedRule)
    
ar_data[:10]

[['Farmaleuven Fonds'],
 ['Science@Leuven Fund'],
 ['KU Leuven Erfgoedfonds'],
 ['Fonds Roger Dillemans'],
 ['KU Leuven Erfgoedfonds'],
 ['Marc Vervenne Fonds'],
 ['ALBERT Fonds voor betere levenskwaliteit',
  'Fonds Herman Servotte',
  'Fonds Myny-Vanderpoorten',
  'Fonds Roger Dillemans',
  'Fonds Universitaire Parochie',
  'Fonds Universiteit Derde Leeftijd',
  'Initiatieven Theologie en Religie',
  'Luc Sels Fonds KU Leuven studenten'],
 ['Bart Verbeeck Fonds'],
 ['Fonds Universitaire Parochie'],
 ['LUMOS']]

In [16]:
len_df = pd.DataFrame()
for data in ar_data:
    df = pd.DataFrame([len(data)])
    len_df = len_df.append(df)
len_df.reset_index(drop=True, inplace=True)
len_df[0].value_counts()[:5]

1    40338
2     2799
3      542
4      204
5      102
Name: 0, dtype: int64

As you can see, only 542 donors who have donated to 3 different funds!<br>
40,338 out of 44,116 donors have only donated to a single fund.

### Apriori
To run this algorithm we set the parameter 'min_support' = 0.00023 which corresponds to support of 10 donors (0.00023 * 44157 ~ 10) <br>
We the parameters 'min_length' and 'max_length' to 3 as we are only mining for rules containing 3 funds.

In [17]:
a_rules = apriori(ar_data, min_support=0.00023, min_confidence=0.1, min_lift=1, min_length=3,max_length=3)  
a_results = list(a_rules) 
#support(A,B,C)
#confidence(A-->B) = P(B|A) = sup({A,B}) /sup(A)
#confidence({A,B}-->C) = P(C/{A,B}) = sup({A,B,C})/sup({A,B})

In [18]:
#Store the rules in a dataframe
a_rules_df = pd.DataFrame(columns = ('SUPPORT','FUND_A','FUND_B','FUND_C','CONFIDENCE','LIFT'))
for res in a_results:
    triplets = res[0]
    t_items = [x for x in triplets]
    if(len(t_items) == 3):
        antecedent = res[2][0][0]
        a_items = [x for x in antecedent]
        consequent = res[2][0][1]
        c_items = [x for x in consequent]
        df = pd.DataFrame([[res[1], a_items[0],a_items[1],c_items[0], res[2][0][2], res[2][0][3]]], 
                          columns=('SUPPORT','FUND_A','FUND_B','FUND_C','CONFIDENCE','LIFT'))
        a_rules_df = a_rules_df.append(df)
        try:
            antecedent_2 = res[2][1][0]
            a_items_2 = [x for x in antecedent_2]
            consequent_2 = res[2][1][1]
            c_items_2 = [x for x in consequent_2]
            df = pd.DataFrame([[res[1], a_items_2[0],a_items_2[1],c_items_2[0], res[2][1][2], res[2][1][3]]], 
                              columns=('SUPPORT','FUND_A','FUND_B','FUND_C','CONFIDENCE','LIFT'))
            a_rules_df = a_rules_df.append(df)
        except:
            print('')  
a_rules_df.reset_index(drop=True, inplace=True)
a_rules_df = a_rules_df[["SUPPORT","CONFIDENCE","LIFT","FUND_A","FUND_B","FUND_C"]]
print(a_rules_df.shape)
a_rules_df.head()
#CONFIDENCE column represents p(FUND_C/{FUND_A,FUND_B})
#lift({A,B}-->C) = sup({A,B,C})/[sup({A,B})*sup(C)]
#LIFT says how likely donor donates for Fund C when already donated for Funds A & B, while controlling for how popular Fund C is 
#Lift = 1 ---> no association between Funds. 
#Lift > 1 ---> donor is more likely to donate to Fund C if already donated for Funds A & B.
#Lift < 1 ---> donor is unlikely to donate to Fund C if already donated for Funds A & B.


(237, 6)


Unnamed: 0,SUPPORT,CONFIDENCE,LIFT,FUND_A,FUND_B,FUND_C
0,0.000544,0.338028,30.874639,Bibliotheekfonds,Fonds Herman Servotte,Fonds Roger Dillemans
1,0.000544,0.461538,38.058375,Bibliotheekfonds,Fonds Roger Dillemans,Fonds Herman Servotte
2,0.000567,0.352113,24.578802,Bibliotheekfonds,Fonds Herman Servotte,Fonds Universitaire Parochie
3,0.000567,0.543478,44.815116,Bibliotheekfonds,Fonds Universitaire Parochie,Fonds Herman Servotte
4,0.000249,0.15493,27.122513,Bibliotheekfonds,Fonds Herman Servotte,Fonds voor jonge academici in Congo


In [19]:
#Save the association rules
a_rules_df.to_csv('SQLDATA/PYTHON_OUTPUTS/A_Rules_Donors.csv', encoding='latin-1')

In [20]:
#Check the top rules that have high confidence
a_rules_df[(a_rules_df['CONFIDENCE'] >= 0.8)]

Unnamed: 0,SUPPORT,CONFIDENCE,LIFT,FUND_A,FUND_B,FUND_C
163,0.00034,0.833333,4.445923,Fonds Roger Dillemans,Prof. Jacques Gruwez Fonds,Kinderkankerfonds Leuven
206,0.000272,0.8,5.658618,Mark Waer Fonds,Fonds Tom Debackere,Zuster Jeanne Devos Fonds
235,0.000249,0.846154,5.985077,Mark Waer Fonds,Onderzoek naar kanker bij zwangerschap,Zuster Jeanne Devos Fonds


# Sequence Mining

Unlike Association rule mining, Sequence mining takes into account of the sequences of the donations. <br>

The Support and Confidence measures in Sequence mining takes into account of fund sequences. <br>
Consider the following data:<br>
[<br>
['A', 'B', 'E', 'A', 'B', 'C', 'D', 'A'],<br>
['A', 'F', 'D', 'B', 'A', 'A', 'E'],<br>
['A', 'C', 'B', 'A', 'B']<br>
['B', 'A']<br>
]<br>

For the above data, in association rules, the Support(A,B) is 4. <br>
But for sequence mining, the Support (A,B) is 3. This is because in the last list, 'B' appears before 'A'. So 'A' & 'B' are not in sequence for the last list, meaning that 'A' doesn't appear before 'B'. <br>  

Similarly, in association rules, the Confidence ({A,B} => {D}) is 2/4. <br>
But for sequence mining, the Confidence ({A,B} => {D}) is 1/3. This is because in the second list 'D' appears in between 'A' & 'B', so 'A','B', & 'D' are in sequence for only in the first list.

We use the <b>PrefixSpan</b> algorithm to compute the sequences between funds. <br>

For running the PrefixSpan algorithm, data must be in a list of list format just like in Apriori algorithm. <br>

Unlike Apriori, we do not drop duplicates in PrefixSpan algorithm. <br>

The first step is to sort the donations for each donors according to the date of donations. Then convert this to list.

In [21]:
seq_data = pd.read_csv('sqldata/question3.csv',dtype = str)
list_of_funds = sorted(list(set(seq_data['CREDIT_DESCRIPTION'])))
seq = seq_data[['PARTNER','ZLUF_DOCDATE','CREDIT_DESCRIPTION']]
print(seq.shape)
seq.head()

(88735, 3)


Unnamed: 0,PARTNER,ZLUF_DOCDATE,CREDIT_DESCRIPTION
0,200000007,2015-06-16 00:00:00,Farmaleuven Fonds
1,200000007,2015-04-22 00:00:00,Farmaleuven Fonds
2,200000007,2014-11-26 00:00:00,Farmaleuven Fonds
3,200000007,2013-06-17 00:00:00,Farmaleuven Fonds
4,200000010,2014-06-17 00:00:00,Science@Leuven Fund


In [22]:
seq = seq.sort_values(['PARTNER','ZLUF_DOCDATE'],ascending=[True,True]) #Sort by user id first and then donation date
seq.reset_index(drop=True,inplace=True)
print(seq.shape)
seq.head()

(88735, 3)


Unnamed: 0,PARTNER,ZLUF_DOCDATE,CREDIT_DESCRIPTION
0,200000007,2013-06-17 00:00:00,Farmaleuven Fonds
1,200000007,2014-11-26 00:00:00,Farmaleuven Fonds
2,200000007,2015-04-22 00:00:00,Farmaleuven Fonds
3,200000007,2015-06-16 00:00:00,Farmaleuven Fonds
4,200000010,2009-12-22 00:00:00,Science@Leuven Fund


Now the donations (CREDIT_DESCRIPTION) are ordered in sequence for each donor (PARTNER) according to donation data (ZLUF_DOCDATE). <br>

Now convert this into list of lists.

In [23]:
seq_list = seq.groupby(['PARTNER'])['CREDIT_DESCRIPTION'].apply(list)
seq_lol = []   #list of lists
num_fund_donors = len(seq_list)
for i in range(num_fund_donors):
    seq_lol.append(seq_list[i])
seq_lol[:3]

[['Farmaleuven Fonds',
  'Farmaleuven Fonds',
  'Farmaleuven Fonds',
  'Farmaleuven Fonds'],
 ['Science@Leuven Fund',
  'Science@Leuven Fund',
  'Science@Leuven Fund',
  'Science@Leuven Fund',
  'Science@Leuven Fund',
  'Science@Leuven Fund',
  'Science@Leuven Fund',
  'Science@Leuven Fund'],
 ['KU Leuven Erfgoedfonds']]

### PrefixSpan

Here too set the minimum support to 10 donors. <br>
The PrefixSpan algorithm using prefixspan package doesn't have the option to set the number of items in the sequences. We'll have to set it manually.<br>
Unlike Apriori, we do not have the option to calculate the confidence for the mined sequences. <br>
Hence the 'cal_conf_sequence' function below will calculate the confidence for the mined sequences. <br>

In [24]:
ps = PrefixSpan(seq_lol)
sup_10_freq_seq = ps.frequent(10,closed=False,generator=False)
print(sup_10_freq_seq[:5])
len(sup_10_freq_seq)

[(16, ['Farmaleuven Fonds']), (126, ['Science@Leuven Fund']), (60, ['Science@Leuven Fund', 'Science@Leuven Fund']), (40, ['Science@Leuven Fund', 'Science@Leuven Fund', 'Science@Leuven Fund']), (32, ['Science@Leuven Fund', 'Science@Leuven Fund', 'Science@Leuven Fund', 'Science@Leuven Fund'])]


2870

There are 2870 mined sequences. But we need to filter out the sequences having length 3.

In [25]:
#Confidence for sequence
def cal_conf_sequence(seq_lol,fund_a,fund_b,fund_c):
    num_lists = len(seq_lol)
    count_num = 0  #(fund_a---fund_b---fund_c)
    count_den = 0  #(fund_a---fund_b)
    for i in range(num_lists):
        len_list = len(seq_lol[i])
        a_found = 0
        b_found = 0
        c_found = 0
        num_found = 0
        den_found = 0
        for j in range(len_list):
            if((seq_lol[i][j]==fund_a) & (a_found==0)):
                a_found = 1
                continue
            if((seq_lol[i][j]==fund_b) & (a_found==1) & (b_found==0)):
                b_found = 1
            if((a_found==1) & (b_found==1) & (den_found==0)):
                count_den = count_den + 1
                den_found = 1
                continue
            if((seq_lol[i][j]==fund_c) & (a_found==1) & (b_found==1) & (c_found==0)):
                c_found = 1
            if((a_found==1) & (b_found==1) & (c_found==1) & ((num_found==0))):
                count_num = count_num + 1
                num_found = 1
    conf_ab_c = round((count_num/count_den),4)
    return(conf_ab_c)

In [26]:
#Here we select rules with 3 funds
num_seq = len(sup_10_freq_seq)
sequence_df = pd.DataFrame(columns = ('SUPPORT','FUND_A','FUND_B','FUND_C'))
for i in range(num_seq):
    if(len(sup_10_freq_seq[i][1])==3): 
        sequence_df = sequence_df.append({'SUPPORT': sup_10_freq_seq[i][0],
                                          'FUND_A':sup_10_freq_seq[i][1][0],
                                          'FUND_B':sup_10_freq_seq[i][1][1],
                                          'FUND_C':sup_10_freq_seq[i][1][2]}, 
                                         ignore_index=True)
sequence_df = sequence_df.sort_values(['SUPPORT'],ascending=[False])
sequence_df.reset_index(drop=True,inplace=True)
sequence_df['CONFIDENCE'] = 0.0

for idx, row in sequence_df.iterrows():
    sequence_df.loc[idx,'CONFIDENCE'] = cal_conf_sequence(seq_lol,row['FUND_A'],row['FUND_B'],row['FUND_C'])
sequence_df['CONFIDENCE']
sequence_df = sequence_df[['SUPPORT','CONFIDENCE','FUND_A','FUND_B','FUND_C']]
sequence_df.to_csv('SQLDATA/PYTHON_OUTPUTS/sequence_mining_all_3.csv', encoding='latin-1')
print(num_seq)
print(sequence_df.shape)
sequence_df.head()

2870
(484, 5)


Unnamed: 0,SUPPORT,CONFIDENCE,FUND_A,FUND_B,FUND_C
0,651,0.538,Zuster Jeanne Devos Fonds,Zuster Jeanne Devos Fonds,Zuster Jeanne Devos Fonds
1,415,0.6139,Fonds Kiwanis Club Leuven,Fonds Kiwanis Club Leuven,Fonds Kiwanis Club Leuven
2,341,0.3698,Kinderkankerfonds Leuven,Kinderkankerfonds Leuven,Kinderkankerfonds Leuven
3,311,0.5791,Bibliotheekfonds,Bibliotheekfonds,Bibliotheekfonds
4,273,0.5833,Fonds Sofhea Cruh,Fonds Sofhea Cruh,Fonds Sofhea Cruh


Now we have 484 sequences having number of items = 3. <br>
But there is a problem here, we see that FUND_A == FUND_B == FUND_C. This represents recurring donation. But, this is not what we are looking for. We are trying to mine sequences for 3 different funds. <br>
We will get the unique fund sequences from the calculation below. <br>

In [27]:
unique_sequence_df = sequence_df[(sequence_df['FUND_A']!=sequence_df['FUND_B']) & 
                                 (sequence_df['FUND_A']!=sequence_df['FUND_C']) &
                                 (sequence_df['FUND_B']!=sequence_df['FUND_C'])]
unique_sequence_df = unique_sequence_df.sort_values(['CONFIDENCE'],ascending=[False])
unique_sequence_df.reset_index(drop=True,inplace=True)
unique_sequence_df.to_csv('SQLDATA/PYTHON_OUTPUTS/sequence_mining_unique_3.csv', encoding='latin-1') #Save the result
print(unique_sequence_df.shape)
unique_sequence_df.head()

(157, 5)


Unnamed: 0,SUPPORT,CONFIDENCE,FUND_A,FUND_B,FUND_C
0,11,0.7333,Fonds Herman Servotte,SOLFA Fonds,Fonds Roger Dillemans
1,12,0.6667,Fonds Universitaire Parochie,Fonds Myny-Vanderpoorten,Fonds Roger Dillemans
2,16,0.64,Fonds Herman Servotte,Fonds voor jonge academici in Congo,Fonds Roger Dillemans
3,15,0.6,Fonds Herman Servotte,Fonds voor jonge academici in Congo,Fonds Universitaire Parochie
4,10,0.5882,Fonds Herman Servotte,Fonds Sofhea Cruh,Fonds Roger Dillemans


# Fund Recommendations

The goal of a Recommender System is to generate meaningful recommendations to a collection of donors for funds that might interest them. <br>

Here we do not consider the number of times a donor made a donation to a particular fund. We only consider the total amount a donor donated to a given fund. <br>
The ratings of a donor for a particular fund is based on how much donation the donor has made to a particular fund when compared to how much the donor is capable of donating. <br>

Rating scale of 0 to 1 is used: <br>
-  1 indicates that the donor would give to this fund to the max of his/her capability.<br>
-  Close to 0 indicates the opposite. <br>
-  Eg: If the total donation of a donor for 3 different funds A,B, & C are:
        € 100 for Fund A, € 400 for Fund B, & € 1000 for Fund C, then
        The rating for Fund A = 100/1000 = 0.1
        The rating for Fund B = 400/1000 = 0.4
        The rating for Fund C = 1000/1000 = 1.0

For building a more reliable recommender system we need to filter the data so that:<br> 
-  Each donor has donated to at least 2 different funds <br>
-  Each fund is donated by atleast 5 different donors <br>

This means that we are recommending the donations to only a few proportion of donors, but this is a necessary step in order to build a good recommender system. Another reason why this is necessary is that this will reduce the sparsity of donor x fund matrix meaning that the training of recommender system would happen much faster. <br>

A Recommender System is classified based on the type of filtering used in the model. There are three main types of filtering: content-based filtering, collaborative filtering, and hybrid filtering. <br>

For recommending funds, it was found that <b>User-based Collaborative filtering </b> produced the best recommendations. <br>
Collaborative filtering recommends funds by identifying other donors with similar taste; it uses their opinion to recommend funds to the active donor. <br>
User-based collaborative filtering technique calculates the similarity between donors by comparing their ratings on the same fund; then uses a weighted average of the ratings for these funds from similar donors to recommend funds. <br>

We also calculate Fund Similarity matrix, which shows the similarity of the funds as perceived by all the donors. This matrix can be used to recommend funds for donors who have only donated once. This is similar to what we see in amazon's website where it says 'users who have bought this item also bought this and this items'.

In [28]:
fund_rec_data = funds[['PARTNER','CREDIT_DESCRIPTION','ZZAMNT_PAYED_NORM']]
print(fund_rec_data.shape)
fund_rec_data.head()

(88733, 3)


Unnamed: 0,PARTNER,CREDIT_DESCRIPTION,ZZAMNT_PAYED_NORM
0,200000007,Farmaleuven Fonds,1250.0
1,200000007,Farmaleuven Fonds,4399.1
2,200000007,Farmaleuven Fonds,5750.0
3,200000007,Farmaleuven Fonds,11500.0
4,200000010,Science@Leuven Fund,500.0


In [29]:
#Sum up all the donoations made by a donor for a particular fund
col_fil_data = fund_rec_data[['PARTNER','CREDIT_DESCRIPTION','ZZAMNT_PAYED_NORM']].groupby(['PARTNER','CREDIT_DESCRIPTION']).sum()
col_fil_data.reset_index(inplace=True)
col_fil_data = col_fil_data.rename(columns={"ZZAMNT_PAYED_NORM":"T_ZZAMNT_PAYED_FUND"})
print(col_fil_data.shape)
col_fil_data.head()

(50026, 3)


Unnamed: 0,PARTNER,CREDIT_DESCRIPTION,T_ZZAMNT_PAYED_FUND
0,200000007,Farmaleuven Fonds,22899.1
1,200000010,Science@Leuven Fund,4600.0
2,200000024,KU Leuven Erfgoedfonds,20000.0
3,200000035,Fonds Roger Dillemans,27000.0
4,200000041,KU Leuven Erfgoedfonds,1000.0


T_ZZAMNT_PAYED_FUND is the total donation made by the donor for that particular fund

In [30]:
#Calculate the normalized donations made by each donor for a fund. 
minima = col_fil_data.groupby('PARTNER')['T_ZZAMNT_PAYED_FUND'].min()
col_fil_data['min_T_ZPF'] = col_fil_data['PARTNER'].map(minima)
maxima = col_fil_data.groupby('PARTNER')['T_ZZAMNT_PAYED_FUND'].max()
col_fil_data['max_T_ZPF'] = col_fil_data['PARTNER'].map(maxima)
col_fil_data['WEIGHTED_RATING'] = (col_fil_data['T_ZZAMNT_PAYED_FUND']/col_fil_data['max_T_ZPF']).round(5)
min_wr = col_fil_data.groupby('PARTNER')['WEIGHTED_RATING'].min()
col_fil_data['min_WR'] = col_fil_data['PARTNER'].map(min_wr)
print(col_fil_data.shape)
col_fil_data.head(15)
#WEIGHTED_RATING is used as ratings to train the recommender system
#min_WR indicates the minimum WEIGHTED_RATING for a donor

(50026, 7)


Unnamed: 0,PARTNER,CREDIT_DESCRIPTION,T_ZZAMNT_PAYED_FUND,min_T_ZPF,max_T_ZPF,WEIGHTED_RATING,min_WR
0,200000007,Farmaleuven Fonds,22899.1,22899.1,22899.1,1.0,1.0
1,200000010,Science@Leuven Fund,4600.0,4600.0,4600.0,1.0,1.0
2,200000024,KU Leuven Erfgoedfonds,20000.0,20000.0,20000.0,1.0,1.0
3,200000035,Fonds Roger Dillemans,27000.0,27000.0,27000.0,1.0,1.0
4,200000041,KU Leuven Erfgoedfonds,1000.0,1000.0,1000.0,1.0,1.0
5,200000054,Marc Vervenne Fonds,125.0,125.0,125.0,1.0,1.0
6,200000058,ALBERT Fonds voor betere levenskwaliteit,300.0,50.0,100000.0,0.003,0.0005
7,200000058,Fonds Herman Servotte,250.0,50.0,100000.0,0.0025,0.0005
8,200000058,Fonds Myny-Vanderpoorten,50.0,50.0,100000.0,0.0005,0.0005
9,200000058,Fonds Roger Dillemans,400.0,50.0,100000.0,0.004,0.0005


In [31]:
#Here we filter the data so that: 
    #each donor has donated to at least 2 diff funds
    #each fund is donated by atleast 5 diff donors

#Need to iterate this cell a few times until the data converges!!!!!!
for i in range(10):
    don_counts_n = pd.DataFrame(col_fil_data[['PARTNER','CREDIT_DESCRIPTION']].groupby('PARTNER').count())
    don_counts_n['PARTNER'] = don_counts_n.index
    don_counts_n.reset_index(drop=True,inplace=True)
    don_counts_n = don_counts_n.rename(columns={"CREDIT_DESCRIPTION":"DON_COUNT_BY_DONORS_TO_DIFF_FUNDS"})
    donlist_donated_to_atl_n_diff_funds = list(set(don_counts_n['PARTNER'][don_counts_n['DON_COUNT_BY_DONORS_TO_DIFF_FUNDS']>=2]))
    #print(len(donlist_donated_to_atl_n_diff_funds)) #There are 3778 diff donors who donated by at least 2 different funds

    fund_counts_n = pd.DataFrame(col_fil_data[['PARTNER','CREDIT_DESCRIPTION']].groupby('CREDIT_DESCRIPTION').count())
    fund_counts_n['CREDIT_DESCRIPTION'] = fund_counts_n.index
    fund_counts_n.reset_index(drop=True,inplace=True)
    fund_counts_n = fund_counts_n.rename(columns={'PARTNER':'FUND_COUNT_BY_DONORS'})
    fundlist_donated_by_atl_n_diff_donors = list(set(fund_counts_n['CREDIT_DESCRIPTION'][fund_counts_n['FUND_COUNT_BY_DONORS']>=5]))
    #print(len(fundlist_donated_by_atl_n_diff_donors)) #There are 138 funds which are donated by at least 5 different donors

    col_fil_data = col_fil_data[(col_fil_data['PARTNER'].isin(donlist_donated_to_atl_n_diff_funds)) & 
                                (col_fil_data['CREDIT_DESCRIPTION'].isin(fundlist_donated_by_atl_n_diff_donors))]
    col_fil_data.reset_index(drop=True,inplace=True)
    #print(col_fil_data.shape)

The sparsity of DONORxFUND matrix is reduced from 44116*221 to 3778*138!!!!!!!! Actually it is 3701* 108! <br>
That's a reduction by 24.4 times! <br>
Now the matrix is only 0.7485 or 74.85% sparse instead of 99.49%! <br>

Now we have 3778 diff donors who donated by at least 2 different funds. <br>
And 138 funds which are donated by at least 5 different donors. <br>

In [32]:
col_fil_mat = pd.crosstab(index=col_fil_data.iloc[:,0], columns=col_fil_data.iloc[:,1],
                          values=col_fil_data.iloc[:,5], aggfunc="sum")
col_fil_mat = col_fil_mat.fillna(0.0) #For finding correlation it's important to keep NaN's as NaN's!
print(col_fil_mat.shape)
col_fil_mat.head()

(3701, 108)


CREDIT_DESCRIPTION,ALBERT Fonds voor betere levenskwaliteit,Arjan Fonds,Arne Loosveldt Fonds,Bart Verbeeck Fonds,Bibliotheekfonds,Bieke Wittebolsfonds,Bone4Kids Fonds,Carpe Diem Fonds voor Diabetesonderzoek,Damiaan Fonds,Dominique Everaert Fonds,...,SOLFA Fonds,Science@Leuven Fund,Sequoia Fonds,Stefanie's Rozen Fonds,Vuylsteke-Flipts Fonds,WILL Fonds,Wilfried Martens Fonds,Zalmfonds,Zuster Jeanne Devos Fonds,project Magenta
PARTNER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200000058,0.003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
200000340,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
200000546,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
200000611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
200000612,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Fund Similarity Matrix 

Useful for recommending funds to donors who have donated to only one fund to LUF. <br>
Find 5 most similar funds to a given fund and put it in a matrix. <br>

In [33]:
item_item_sim_mat = col_fil_mat.corr(method='pearson')#Basically Item-to-Item similarity matrix
print(item_item_sim_mat.shape)
item_item_sim_mat.head()

(108, 108)


CREDIT_DESCRIPTION,ALBERT Fonds voor betere levenskwaliteit,Arjan Fonds,Arne Loosveldt Fonds,Bart Verbeeck Fonds,Bibliotheekfonds,Bieke Wittebolsfonds,Bone4Kids Fonds,Carpe Diem Fonds voor Diabetesonderzoek,Damiaan Fonds,Dominique Everaert Fonds,...,SOLFA Fonds,Science@Leuven Fund,Sequoia Fonds,Stefanie's Rozen Fonds,Vuylsteke-Flipts Fonds,WILL Fonds,Wilfried Martens Fonds,Zalmfonds,Zuster Jeanne Devos Fonds,project Magenta
CREDIT_DESCRIPTION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ALBERT Fonds voor betere levenskwaliteit,1.0,-0.003861,-0.006388,-0.00937,0.019929,-0.007983,-0.015228,-0.010195,-0.003186,0.023638,...,-0.024557,-0.011185,-0.004155,-0.002725,-0.005721,-0.003673,-0.009766,-0.003038,-0.030551,-0.004812
Arjan Fonds,-0.003861,1.0,0.02797,-0.004081,0.006569,-0.003334,-0.006361,-0.004258,-0.001331,-0.002316,...,-0.010259,-0.004672,-0.001736,-0.004215,-0.00239,-0.001587,-0.004173,-0.001269,-0.017872,-0.00201
Arne Loosveldt Fonds,-0.006388,0.02797,1.0,-0.006751,-0.012692,-0.005473,-0.010524,-0.007045,-0.002201,-0.003832,...,-0.016921,0.003685,-0.002871,-0.006973,-0.003954,-0.002626,-0.006904,-0.002099,-0.022986,-0.003325
Bart Verbeeck Fonds,-0.00937,-0.004081,-0.006751,1.0,0.001623,-0.008436,0.001917,0.085452,-0.003367,-0.00586,...,-0.020539,0.018106,-0.004391,-0.010663,-0.006046,-0.00345,-0.010559,-0.003211,-0.025861,-0.005085
Bibliotheekfonds,0.019929,0.006569,-0.012692,0.001623,1.0,-0.023294,-0.035549,-0.014264,-0.009716,-0.016913,...,-0.024259,-0.021493,0.023123,0.003401,-0.00535,-0.010855,-0.017193,-0.009265,-0.053979,-0.006775


In [34]:
item_item_sim_mat['Mark Waer Fonds'].nlargest(4)
#'Fonds Cultureel en Historisch Erfgoed' & 'Fonds Transplantatie Onderzoek Mark Waer' are somewhat similar to 'Mark Waer Fonds'

CREDIT_DESCRIPTION
Mark Waer Fonds                             1.000000
Fonds Cultureel en Historisch Erfgoed       0.124247
Fonds Transplantatie Onderzoek Mark Waer    0.107176
Marc Vervenne Fonds                         0.094454
Name: Mark Waer Fonds, dtype: float64

In [35]:
#Fund similarity matrix is computed here
fund_cols = ['MOST_SIM_FUND_1','MOST_SIM_FUND_2','MOST_SIM_FUND_3','MOST_SIM_FUND_4','MOST_SIM_FUND_5']
top_5_similar_funds_df = pd.DataFrame(columns = fund_cols)
item_item_cols = list(item_item_sim_mat.columns.values)
for col in item_item_cols:
    df = pd.DataFrame([[item_item_sim_mat[col].nlargest(6).index[1],item_item_sim_mat[col].nlargest(6).index[2],
                        item_item_sim_mat[col].nlargest(6).index[3],item_item_sim_mat[col].nlargest(6).index[4],
                        item_item_sim_mat[col].nlargest(6).index[5]]],columns=fund_cols,index=[col])
    top_5_similar_funds_df = top_5_similar_funds_df.append(df)
top_5_similar_funds_df.to_csv('SQLDATA/PYTHON_OUTPUTS/top_5_similar_funds_df.csv', encoding='latin-1') #Save the matrix
print(top_5_similar_funds_df.shape)
top_5_similar_funds_df.head(10)

(108, 5)


Unnamed: 0,MOST_SIM_FUND_1,MOST_SIM_FUND_2,MOST_SIM_FUND_3,MOST_SIM_FUND_4,MOST_SIM_FUND_5
ALBERT Fonds voor betere levenskwaliteit,Leuvens kankerinstituut,Onderzoeksfonds Palliatieve Zorg,Fonds Cultureel en Historisch Erfgoed,Dominique Everaert Fonds,Olivia Hendrickx Research Fund
Arjan Fonds,Pieter Van Loon Fonds,Nadine de Beauffort Fonds,Arne Loosveldt Fonds,Leuvens kankerinstituut,Multidisciplinair Borstcentrum Leuven
Arne Loosveldt Fonds,Rondoufonds voor Duchenneonderzoek,Arjan Fonds,Leymah Gbowee Leadership Fund,OriGENE Fonds,Marc Vervenne Fonds
Bart Verbeeck Fonds,Carpe Diem Fonds voor Diabetesonderzoek,Kinderkankerfonds Leuven,Hardelingenfonds,Science@Leuven Fund,Healthy Heart Fund
Bibliotheekfonds,Fonds Herman Servotte,Fund of Friends for Cancer Research,Fonds I. De Wever Oncologie,Fonds Em. prof. dr. A.L. Baert,Fonds voor Inflammatoire Darmziekten
Bieke Wittebolsfonds,Multiple Sclerose Fonds,project Magenta,Fonds Respiratoire Oncologie,Mertens-Berx Fonds Auto-inflammatoire z.,Fonds Transplantoux
Bone4Kids Fonds,Fonds Vergote,Fonds Jouw Gezondheid,Kiemceltumoren van het ovarium,Fonds Inès Costa,Fonds Een hart voor ALS
Carpe Diem Fonds voor Diabetesonderzoek,Bart Verbeeck Fonds,Fonds voor Hartfalen en Harttransplantat,Eline4Kids fonds,Guy Molenaers Fonds,Fund Beta-Cell Research
Damiaan Fonds,LUMOS,Leymah Gbowee Leadership Fund,Fonds Kortrijk Sociaal en Internationaal,Fonds Avicenna,Zalmfonds
Dominique Everaert Fonds,WILL Fonds,Optimalisatie radio hoofd-en halstumoren,Ped IMID fonds,J De Wever Fonds Prostaatkankerpreventie,Fonds Kind en orgaantransplantatie


In [36]:
col_fil_data[['PARTNER','CREDIT_DESCRIPTION','WEIGHTED_RATING']].groupby('PARTNER').agg(
    {'WEIGHTED_RATING':[np.size,np.mean]}).describe()

Unnamed: 0_level_0,WEIGHTED_RATING,WEIGHTED_RATING
Unnamed: 0_level_1,size,mean
count,3701.0,3701.0
mean,2.550392,0.688502
std,1.578218,0.197125
min,2.0,0.000883
25%,2.0,0.558825
50%,2.0,0.69562
75%,3.0,0.833335
max,37.0,1.0


This means that there are 3701 donors who have donated to more than 2 different funds. <br>
On an average a donor donates to 2.55 funds. <br>
The average rating of a user for a particular fund is 0.688 (i.e, he donates to a fund 68.9% of their funding capacity). <br>

In [37]:
col_fil_data[['PARTNER','CREDIT_DESCRIPTION','WEIGHTED_RATING']].groupby('CREDIT_DESCRIPTION').agg(
    {'WEIGHTED_RATING':[np.size,np.mean]}).describe()

Unnamed: 0_level_0,WEIGHTED_RATING,WEIGHTED_RATING
Unnamed: 0_level_1,size,mean
count,108.0,108.0
mean,87.398148,0.631781
std,149.229294,0.14425
min,5.0,0.144299
25%,11.75,0.529191
50%,36.5,0.64085
75%,82.5,0.707466
max,914.0,0.968254


This means that there are 108 funds which are donated by more than 5 different donors. <br>
On an average a fund is donated by 87.39 different donors. <br>
On an average a fund gets 63.17% of the capacity of a donor. <br>

In [38]:
#Iterate over all algorithms to check the best algorithm for recommending funds
benchmark = []

col_fil_data = col_fil_data[(col_fil_data['PARTNER'].isin(donlist_donated_to_atl_n_diff_funds)) & 
                            (col_fil_data['CREDIT_DESCRIPTION'].isin(fundlist_donated_by_atl_n_diff_donors))]
col_fil_data.reset_index(drop=True,inplace=True)
print(col_fil_data.shape)
reader = reader.Reader(rating_scale=(0.0, 1.0))
data = Dataset.load_from_df(col_fil_data[['PARTNER','CREDIT_DESCRIPTION','WEIGHTED_RATING']],reader)
trainingSet = data.build_full_trainset()
testSet = trainingSet.build_anti_testset()

#Allowed values are cosine, msd, pearson, pearson_baseline.
sim_options = {'name': 'pearson_baseline','user_based': True}
recommender_algorithms = [SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(sim_options=sim_options), 
                          KNNBasic(sim_options=sim_options), KNNWithMeans(sim_options=sim_options), SVD(), SVDpp(),
                          KNNWithZScore(sim_options=sim_options), BaselineOnly(), CoClustering()] 
for algorithm in recommender_algorithms:
    #Perform cross validation
    results = cross_validate(algorithm, data, measures = ['RMSE','MAE'], cv = 5, verbose = False)
    
    #Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')


(9439, 7)
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_b

Unnamed: 0_level_0,test_rmse,test_mae,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
KNNBaseline,0.352684,0.308027,0.707385,0.304449
KNNBasic,0.359617,0.313691,0.727048,0.301951
BaselineOnly,0.365887,0.334638,0.012665,0.010148
SVDpp,0.366513,0.33068,0.836127,0.023534
SVD,0.373213,0.33432,0.43355,0.011218
SlopeOne,0.473213,0.378507,0.03307,0.012777
NormalPredictor,0.488442,0.393933,0.010236,0.010183
KNNWithMeans,0.49847,0.407,0.730409,0.277987
KNNWithZScore,0.506267,0.411908,0.804962,0.269847
NMF,0.525233,0.425809,0.655639,0.006973


KNNBaseline is the best model to produce recommendations, so use this model for recommending funds.<br>
Tune the KNNBaseline using either User-based filtering or Item-based filtering. <br>

In [39]:
#User_to_User similarity comparison
k_list = [10,20,30,40,50]
sim_options_list = [{'name': 'cosine','user_based': True},
                    {'name': 'msd','user_based': True},
                    {'name': 'pearson','user_based': True},
                    {'name': 'pearson_baseline','user_based': True}]
benchmark = []
for ki in k_list:
    for sim_options in sim_options_list:
        #Perform cross validation
        algorithm = KNNBaseline(k=ki,sim_options=sim_options)
        results = cross_validate(algorithm, data, measures = ['RMSE','MAE'], cv = 5, verbose = False)

        #Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]+"_"+str(ki)+"_"+sim_options['name']], index=['Algorithm']))
        benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd 

  sim = construction_func[name](*args)


Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine simil

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,test_mae,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
KNNBaseline_30_pearson_baseline,0.349747,0.305252,0.705457,0.257579
KNNBaseline_50_pearson_baseline,0.350989,0.30654,0.706157,0.297247
KNNBaseline_20_pearson,0.351469,0.309171,2.233294,0.277236
KNNBaseline_20_pearson_baseline,0.351582,0.306419,0.709139,0.310194
KNNBaseline_30_pearson,0.352726,0.309113,2.067002,0.257057
KNNBaseline_10_pearson_baseline,0.352824,0.307756,0.691547,0.23718
KNNBaseline_40_pearson_baseline,0.352975,0.307884,0.689642,0.263462
KNNBaseline_10_pearson,0.353605,0.309791,2.033332,0.241439
KNNBaseline_50_pearson,0.353712,0.310508,2.055911,0.268883
KNNBaseline_40_pearson,0.353732,0.310088,2.03586,0.268785


In [40]:
#Item_to_Item similarity comparison
k_list = [10,20,30,40,50]
sim_options_list = [{'name': 'cosine','user_based': False},
                    {'name': 'msd','user_based': False},
                    {'name': 'pearson','user_based': False},
                    {'name': 'pearson_baseline','user_based': False}]
benchmark = []
for ki in k_list:
    for sim_options in sim_options_list:
        #Perform cross validation
        algorithm = KNNBaseline(k=ki,sim_options=sim_options)
        results = cross_validate(algorithm, data, measures = ['RMSE','MAE'], cv = 5, verbose = False)

        #Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]+"_"+str(ki)+"_"+sim_options['name']], index=['Algorithm']))
        benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd 

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pears

Unnamed: 0_level_0,test_rmse,test_mae,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
KNNBaseline_40_pearson_baseline,0.402198,0.339551,0.017494,0.01417
KNNBaseline_30_pearson_baseline,0.406902,0.344063,0.021492,0.01609
KNNBaseline_10_pearson_baseline,0.407793,0.345676,0.017122,0.016954
KNNBaseline_20_pearson_baseline,0.407937,0.344705,0.01563,0.01562
KNNBaseline_50_pearson_baseline,0.408017,0.344805,0.017498,0.016289
KNNBaseline_10_pearson,0.434502,0.362405,0.015419,0.018467
KNNBaseline_30_pearson,0.435422,0.362778,0.016497,0.013953
KNNBaseline_20_pearson,0.436574,0.364587,0.017353,0.019849
KNNBaseline_50_pearson,0.439128,0.365213,0.015532,0.026262
KNNBaseline_40_pearson,0.439424,0.366834,0.015622,0.015628


Clearly KNNBaseline with k=30 with pearson_baseline similarity and user_based similarity computation produced the best result.

In [41]:
#Get the top 5 recommendations for each donors. These are the fund recommendations which the donors haven't donated before.
sim_options = {'name': 'pearson_baseline','user_based': True}
knn = KNNBaseline(k=30, sim_options=sim_options)
knn.train(trainingSet)
predictions = knn.test(testSet)

def get_top5_recommendations(predictions, topN = 5):
     
    top_recs = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_recs[uid].append((iid, est))
     
    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key = lambda x: x[1], reverse = True)
        top_recs[uid] = user_ratings[:topN]
     
    return top_recs

top5_recommendations = get_top5_recommendations(predictions)



Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [42]:
#Get the donor x fund_recommendation matrix.
#RF_i represents the predicted rating of the donors for the ith recommended fund.
donor_recommender_cols = ['FUND_1','FUND_2','FUND_3','FUND_4','FUND_5','RF_1','RF_2','RF_3','RF_4','RF_5']
donor_recommender_df = pd.DataFrame(columns = donor_recommender_cols)
for k,v in top5_recommendations.items():
    df = pd.DataFrame([[str(v[0][0]),str(v[1][0]),str(v[2][0]),str(v[3][0]),str(v[4][0]),
                        v[0][1],v[1][1],v[2][1],v[3][1],v[4][1]]],columns = donor_recommender_cols,index=[str(k)])
    donor_recommender_df = donor_recommender_df.append(df)
    
donor_recommender_df['PARTNER'] = donor_recommender_df.index
donor_recommender_df.reset_index(drop=True,inplace=True)
donor_recommender_df = donor_recommender_df[['PARTNER']+donor_recommender_cols]
print(donor_recommender_df.shape)
donor_recommender_df.head()

(3701, 11)


Unnamed: 0,PARTNER,FUND_1,FUND_2,FUND_3,FUND_4,FUND_5,RF_1,RF_2,RF_3,RF_4,RF_5
0,200000058,Fonds Lenaerts-Grimonprez,Fonds Transplantoux,Herman Van der Wee Fonds,Fonds I. De Wever Oncologie,Science@Leuven Fund,1.0,1.0,0.935482,0.881722,0.771073
1,200000340,Fonds Sofhea Cruh,Fonds Kind en orgaantransplantatie,Herman Van der Wee Fonds,Martine Goublomme Fund for NET Research,Kinderkankerfonds Leuven,1.0,0.847492,0.837757,0.836099,0.827295
2,200000546,SOLFA Fonds,Kinderkankerfonds Leuven,KU Leuven Erfgoedfonds,Bibliotheekfonds,Fonds Roger Dillemans,1.0,1.0,1.0,0.878832,0.845214
3,200000611,Fonds Sofhea Cruh,Initiatieven Theologie en Religie,Herman Van der Wee Fonds,Martine Goublomme Fund for NET Research,Hans en Els Vandamme Fonds,0.969495,0.825479,0.815346,0.813688,0.803096
4,200000612,Initiatieven Theologie en Religie,Herman Van der Wee Fonds,Martine Goublomme Fund for NET Research,Hans en Els Vandamme Fonds,Fonds voor Inflammatoire Darmziekten,0.820252,0.810118,0.808461,0.797869,0.795805


In [43]:
#Some donors in the database are dead. But we have used this data to train the recommender system. 
donor_stats = col_fil_data[['PARTNER','min_T_ZPF','max_T_ZPF']]
donor_stats = donor_stats.drop_duplicates()
donor_stats.reset_index(drop=True,inplace=True)

donor_recommender_df = pd.merge(donor_stats,donor_recommender_df,on='PARTNER',how='inner')
print(donor_recommender_df.shape)

#To find the list of donors who are dead
but000 = pd.read_csv('sqldata/20190102_BUT000.csv',dtype = str,sep=';')
but000 = but000[but000['ZZOVERLEDEN'].isna()] #Only produces alive business partners
alive_donors = but000[['PARTNER']]

donor_recommender_df = pd.merge(donor_recommender_df,alive_donors,on='PARTNER',how='inner')
donor_recommender_df.to_csv('SQLDATA/PYTHON_OUTPUTS/donor_recommender_df.csv', encoding='latin-1') #Save the recommendations
print(donor_recommender_df.shape) #It seems like 130 people in the list have passed away
donor_recommender_df.head()

(3701, 13)
(3571, 13)


Unnamed: 0,PARTNER,min_T_ZPF,max_T_ZPF,FUND_1,FUND_2,FUND_3,FUND_4,FUND_5,RF_1,RF_2,RF_3,RF_4,RF_5
0,200000058,50.0,100000.0,Fonds Lenaerts-Grimonprez,Fonds Transplantoux,Herman Van der Wee Fonds,Fonds I. De Wever Oncologie,Science@Leuven Fund,1.0,1.0,0.935482,0.881722,0.771073
1,200000340,40.0,50.0,Fonds Sofhea Cruh,Fonds Kind en orgaantransplantatie,Herman Van der Wee Fonds,Martine Goublomme Fund for NET Research,Kinderkankerfonds Leuven,1.0,0.847492,0.837757,0.836099,0.827295
2,200000546,15.0,100.0,SOLFA Fonds,Kinderkankerfonds Leuven,KU Leuven Erfgoedfonds,Bibliotheekfonds,Fonds Roger Dillemans,1.0,1.0,1.0,0.878832,0.845214
3,200000611,27.5,50.0,Fonds Sofhea Cruh,Initiatieven Theologie en Religie,Herman Van der Wee Fonds,Martine Goublomme Fund for NET Research,Hans en Els Vandamme Fonds,0.969495,0.825479,0.815346,0.813688,0.803096
4,200000612,30.0,87.0,Initiatieven Theologie en Religie,Herman Van der Wee Fonds,Martine Goublomme Fund for NET Research,Hans en Els Vandamme Fonds,Fonds voor Inflammatoire Darmziekten,0.820252,0.810118,0.808461,0.797869,0.795805


It seems like 130 people in the list have passed away.

Let's calculate how much money LUF would receive from these fund recommendations over the next few years. <br>
Suppose if a donor donates to one of the 5 recommended funds. <br>

In [44]:
min_donation = donor_recommender_df['min_T_ZPF'].sum()
print('Minimum donation = '+str(min_donation))
max_donation = donor_recommender_df['max_T_ZPF'].sum()
print('Maximum donation = '+str(max_donation))
expected_donation = (donor_recommender_df['max_T_ZPF']*0.689).sum().round(2) 
#It was found that on an average a donor donates about 68.9% of their maximum capacity. (See above)
print('Expected donation = '+str(expected_donation))

Minimum donation = 524815.25
Maximum donation = 8344210.205
Expected donation = 5749160.83


So if a donor donates to either one of the five recommended funds, then the expected donation LUF would receive from the fund recommendations is <b>5,674,062.94</b> Euro from 3,571 donors over the next few years. <br>