In [4]:
# Home assignment from Wirecard.
# received 16.01 
# Notebook for prototyping

In [283]:
#Import packages
import pandas as pd
import numpy as np
import scipy as sp
from scipy.special import gamma
from scipy.special import gammaln as lnG
from scipy import optimize


import datetime as dt

# Task 1

In [31]:
#load the data
df = pd.read_csv("all_transactions.csv",names=['ID','date'])
print(df.shape)
df.head()

(6919, 2)


Unnamed: 0,ID,date
0,1,19970101
1,1,19970118
2,1,19970802
3,1,19971212
4,2,19970101


## Prepare the dataset

In [32]:
# Remove duplications
df_unique = df.drop_duplicates()
print('Number of unique samples: {}'.format(len(df_unique)))
print('Number of unique customers: {}'.format(len(df_unique.ID.drop_duplicates())))

Number of unique samples: 6696
Number of unique customers: 2357


In [33]:
# train-validation split
split_date_b = 19970101 # Jan 1997
split_date_e = 19971001 # Oct 1997
#query = 'date >= {} and date < {}'.format(split_date_b,split_date_e)
#query = (df_unique.date >= split_date_b) & df_unique.date < split_date_e
df_train = df_unique[(df_unique.date >= split_date_b) & (df_unique.date < split_date_e)] # training part
#df_train = df_unique.query(query) # training part
df_val = df_unique[df_unique.date >= split_date_e] # rest is our validation part
print('Training   data size: {}'.format(df_train.shape))
print('Validation data size: {}'.format(df_val.shape))

Training   data size: (4814, 2)
Validation data size: (1882, 2)


In [34]:
# save the .csv file
df_train.reset_index(drop=True).to_csv('cal_period_transactions.csv')

# Task 2

In [35]:
# read the .csv file 
df = pd.read_csv("cal_period_transactions.csv",index_col='Unnamed: 0')
df.shape

(4814, 2)

## Parameters calculation 

### Convert to a proper datetime format

In [36]:
df['datetime'] = pd.to_datetime(df['date'],format='%Y%m%d')

### x: number of transactions done by customer

In [37]:
df['x'] = df.groupby('ID')['ID'].transform(lambda s: s.count() - 1)
df.head()

Unnamed: 0,ID,date,datetime,x
0,1,19970101,1997-01-01,2
1,1,19970118,1997-01-18,2
2,1,19970802,1997-08-02,2
3,2,19970101,1997-01-01,1
4,2,19970113,1997-01-13,1


### tx: duration in weeks between customer's last and first transaction

In [38]:
# create temp df with first and last date columns
df_temp = df.datetime.groupby(df['ID']).agg(['first','last'])

In [39]:
# find the difference between the transactions 
df_temp['tx_days'] = (df_temp['last'] - df_temp['first'])
# convert it to weeks
df_temp['tx'] = df_temp['tx_days']/np.timedelta64(1,'W')
# round to 2 decimals as in the example
df_temp['tx'] = df_temp['tx'].round(2)

In [40]:
df_temp.head()

Unnamed: 0_level_0,first,last,tx_days,tx
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1997-01-01,1997-08-02,213 days,30.43
2,1997-01-01,1997-01-13,12 days,1.71
3,1997-01-01,1997-01-01,0 days,0.0
4,1997-01-01,1997-01-01,0 days,0.0
5,1997-01-01,1997-01-01,0 days,0.0


### T: Duration in weeks between end of calibration period and the first customer's transaction 

In [41]:
df_temp['T_days'] = dt.datetime.strptime(str(split_date_e), '%Y%m%d') - df_temp['first'] - np.timedelta64(1,'D')# -1 to take the last day into account

In [42]:
df_temp['T'] = (df_temp['T_days'] / np.timedelta64(1,'W')).round(2)

In [43]:
df_temp.head(5)

Unnamed: 0_level_0,first,last,tx_days,tx,T_days,T
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1997-01-01,1997-08-02,213 days,30.43,272 days,38.86
2,1997-01-01,1997-01-13,12 days,1.71,272 days,38.86
3,1997-01-01,1997-01-01,0 days,0.0,272 days,38.86
4,1997-01-01,1997-01-01,0 days,0.0,272 days,38.86
5,1997-01-01,1997-01-01,0 days,0.0,272 days,38.86


### Merge into the initial DF

In [44]:
df = pd.merge(df,df_temp.drop(columns=['first','last','tx_days','T_days']),on=['ID'])

In [45]:
df.head()

Unnamed: 0,ID,date,datetime,x,tx,T
0,1,19970101,1997-01-01,2,30.43,38.86
1,1,19970118,1997-01-18,2,30.43,38.86
2,1,19970802,1997-08-02,2,30.43,38.86
3,2,19970101,1997-01-01,1,1.71,38.86
4,2,19970113,1997-01-13,1,1.71,38.86


### Prepare the output df

In [46]:
# only unique customers should be stored
df_out = df.drop_duplicates(subset=['ID'])

In [47]:
df_out.head()

Unnamed: 0,ID,date,datetime,x,tx,T
0,1,19970101,1997-01-01,2,30.43,38.86
3,2,19970101,1997-01-01,1,1.71,38.86
5,3,19970101,1997-01-01,0,0.0,38.86
6,4,19970101,1997-01-01,0,0.0,38.86
7,5,19970101,1997-01-01,0,0.0,38.86


In [48]:
# prepare the output df
df_out = df_out.reset_index().drop(columns=['date','datetime','index'])
# rename ID column
df_out.rename(columns={'ID': 'Customer ID'},inplace=True)
# Save into .csv file
df_out.to_csv('summary_customers.csv')

In [49]:
df_out.head()

Unnamed: 0,Customer ID,x,tx,T
0,1,2,30.43,38.86
1,2,1,1.71,38.86
2,3,0,0.0,38.86
3,4,0,0.0,38.86
4,5,0,0.0,38.86


In [50]:
# clean the memory
del df_temp

To be omplemented
* check whether dates are in chronological order, i.e. avoid negative values 

In [51]:
df_test = pd.read_csv('summary_customers.csv')

# Task 3

The aim of this assignment is to fit a model to the data, so that predictions about the customers’ purchase behavior in future can be made. The model is defined by four parameters namely r, α, a, b and they are always greater than 0. For fitting the model (i.e. finding the parameters r, α, a, b), Maximum Likelihood Estimation (MLE) can be used.

In [73]:
# load the .csv file
df = pd.read_csv('summary_customers.csv',index_col='Unnamed: 0')
df.head()

Unnamed: 0,Customer ID,x,tx,T
0,1,2,30.43,38.86
1,2,1,1.71,38.86
2,3,0,0.0,38.86
3,4,0,0.0,38.86
4,5,0,0.0,38.86


In [161]:
#implementing loss function

r,alpha,a,b = 0.1,0.1,0.1,2

def delta(x):
    return (x>0).astype(int)
#    if(x>0): return 1
#    else: return 0

def loss(r,alpha,a,b,
        x,tx,T):
    lnA1 = lnG(r+x) + r*np.log(alpha) - lnG(r)
    lnA2 = lnG(a+b) + lnG(b+x) - lnG(b) - lnG(a+b+x)
    lnA3 = -(r+x)*np.log(alpha+T)
    lnA4 = np.log(alpha) - np.log(b+x-1) - (r+x)*np.log(alpha + tx)
    
    out = lnA1 + lnA2 + np.log(np.exp(lnA3) + delta(x)*np.exp(lnA4))

def obj_fi(df,r,alpha,a,b):
    lnA1 = lnG(r+df.x) + r*np.log(alpha) - lnG(r)
    print('1: ',lnA1.isnull().values.any())
    lnA2 = lnG(a+b) + lnG(b+df.x) - lnG(b) - lnG(a+b+df.x)
    print('2: ',lnA2.isnull().values.any())
    lnA3 = -(r+df.x)*np.log(alpha+df['T'])
    print('3: ',lnA3.isnull().values.any())
    lnA4 = np.log(alpha) - np.log(b+df.x-1) - (r+df.x)*np.log(alpha + df.tx)
    print('4: ',lnA4.isnull().values.any())
    out = (lnA1 + lnA2 + np.log(np.exp(lnA3) + delta(df.x)*np.exp(lnA4))).isnull()
    print('l: ',(np.log(np.exp(lnA3) + delta(df.x)*np.exp(lnA4))).isnull().values.any())
    print('NANS:',df[out],len(df[out]))
    return lnA1 + lnA2 + np.log(np.exp(lnA3) + delta(df.x)*np.exp(lnA4))

def obj_f(df,r,alpha,a,b):
    N = len(df)
    ll = obj_fi(df,r,alpha,a,b)
    
    return ll.sum() * (-1) / N

In [162]:
print(obj_f(df,r,alpha,a,b))

1:  False
2:  False
3:  False
4:  False
l:  False
NANS: Empty DataFrame
Columns: [Customer ID, x, tx, T]
Index: [] 0
4.382198327848188


In [405]:
# Implementing in the form of class
class CustomFunction:
    def __init__(self,num_iter=5, verbose=False):
        self.num_iter = num_iter
        self.verbose = verbose
        
        #initialize parameters
        self.__initialize_par()
        
    def __obj_f(self, params, df):
        #update parameters
        self.__update_params(params)
        #normalise alpha
        self.alpha*=self.norm
        # protection from negative parameters
        if self.r <=0 or self.b <=0 or self.alpha <= 0 or self.a <= 0:
            return np.inf
        N = len(df)
        obj_i = self.__loss(df)
        out = obj_i.sum() * (-1) / N
        #print(' NLL: ',out,' Pars: r = ',self.r,' alpha = ',self.alpha,' a = ',self.a,' b = ',self.b)
        
        return out
    
    def __loss(self, df):
        lnA1 = lnG(self.r+df.x) + self.r*np.log(self.alpha) - lnG(self.r)
        lnA2 = lnG(self.a+self.b) + lnG(self.b+df.x) - lnG(self.b) - lnG(self.a+self.b+df.x)
        lnA3 = -(self.r+df.x)*np.log(self.alpha+df['T'])
        lnA4 = np.log(self.alpha) - np.log(self.b+df.x-1) - (self.r+df.x)*np.log(self.alpha + df.tx)
        deltaA4 = self.__deltaA4(df.x,np.exp(lnA4))
        out = lnA1 + lnA2 + np.log(np.exp(lnA3) + deltaA4)
#         if self.b < 1.0000001:
#             print(' dA4: ',deltaA4)
        return out
    
    def __deltaA4(self,x,A4):
        """function to change NaN -> 0 at A4"""
        A4[x <= 0] = 0
        return A4
    
    def __update_params(self, new_params):
        """ Update model parameters."""
        self.r = new_params[0].copy()
        self.alpha = new_params[1].copy()
        self.a = new_params[2].copy()
        self.b = new_params[3].copy()
    
    def __initialize_par(self):
        # initialise with Gaus(1,0.05)
        self.r, self.alpha, self.a, self.b = np.random.normal(1, 0.05,4)
    
    def set_r(r):
        if r <= 0: 
            raise ValueError('r should be positive')
        self.r = r
        
    def set_alpha(alpha):
        if alpha <= 0: 
            raise ValueError('alpha should be positive')
        self.alpha = alpha
    
    def set_a(a):
        if a <= 0: 
            raise ValueError('a should be positive')
        self.a = a
        
    def set_b(b):
        if b <= 0: 
            raise ValueError('b should be positive')
        self.b = b
    
    def set_parameters(r, alpha, a, b):
        self.set_r(r), self.set_alpha(alpha), self.set_a(a), self.set_b(b)
    
    def fit(self, df, minimizer = 'Nelder-Mead'):
        
        norm_df = self.__normalise_data(df)

        for i in range(1,self.num_iter+1):
            print('Step ',i)
            initial_pars = [self.r,self.alpha,self.a,self.b]
            print('Initialisation: r = ',self.r, ' alpha = ',self.alpha, ' a = ',self.a, ' b = ',self.b)
            self.res = sp.optimize.minimize(self.__obj_f,x0 = initial_pars,args=(norm_df), method=minimizer) 
            self.print_final_results()
    
    def __normalise_data(self,df):
        """function to normnalise parameters tx and T
        """
        df_temp = df.copy()
        self.__norm_sf(df)
        df_temp['tx']*=self.norm
        df_temp['T']*=self.norm
        return df_temp
    
    def __norm_sf(self,df):
        """function to compute the normalisation factor
        """
        max_T = df['T'].max()
        self.norm = 10./ max_T
    
    def print_final_results(self):
        """function print final results
        """
        print('*********Fit Success: ',self.res.success,'*********')
        print('Fit status',self.res.status)
        print('NLL: ',self.res.fun)
        print('Optimized parameters: r = ',self.r, ' alpha = ',self.alpha, ' a = ',self.a, ' b = ',self.b)
        if self.verbose:
            print(self.res.message)


In [406]:
#np.random.seed(1)

model = CustomFunction(num_iter=5)
model.fit(df,minimizer='L-BFGS-B')
model.print_final_results()

Step  1
Initialisation: r =  1.006091063549572  alpha =  1.056474195395596  a =  1.0599458939950754  b =  1.0092578208741971
*********Fit Success:  True *********
NLL:  2.6702617559153983
Optimized parameters: r =  0.10114228876708076  alpha =  0.47115812418405345  a =  0.35961769442661784  b =  1.2010818402140624
b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
Step  2
Initialisation: r =  0.10114228876708076  alpha =  0.47115812418405345  a =  0.35961769442661784  b =  1.2010818402140624
*********Fit Success:  True *********
NLL:  2.9180056382264246
Optimized parameters: r =  0.10114228876708076  alpha =  0.12124501394340027  a =  0.35961769442661784  b =  1.2010818502140623
b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
Step  3
Initialisation: r =  0.10114228876708076  alpha =  0.12124501394340027  a =  0.35961769442661784  b =  1.2010818502140623
*********Fit Success:  True *********
NLL:  3.142724857520198
Optimized parameters: r =  0.10114228876708076  alpha =  0.0312004667

In [402]:
np.nan*0

nan