In [36]:
import pandas as pd
import numpy as np
df = pd.read_csv('KAG_conversion_data.csv')
df.head()

Unnamed: 0,ad_id,xyz_campaign_id,fb_campaign_id,age,gender,interest,Impressions,Clicks,Spent,Total_Conversion,Approved_Conversion
0,708746,916,103916,30-34,M,15,7350,1,1.43,2,1
1,708749,916,103917,30-34,M,16,17861,2,1.82,2,0
2,708771,916,103920,30-34,M,20,693,0,0.0,1,0
3,708815,916,103928,30-34,M,28,4259,1,1.25,1,0
4,708818,916,103928,30-34,M,28,4133,1,1.29,1,1


### Insert here elements for a unique ID


In [37]:
uniqueID_list =['ad_id','xyz_campaign_id','fb_campaign_id','age','gender','interest']

### Turn all elements into numeric features and check if a linear model could work

In [38]:
def numeric_variables(df,x):
    variables = list(df[x].unique())
    variables_count = list(range(len(df[x].unique())))
    audiences= dict(zip(variables,variables_count))
    df[f'{x}_n'] = df[x].map(audiences)

In [39]:
df.interest.unique()

array([ 15,  16,  20,  28,  29,  27,  31,   7,  30,  24,  21,  32,  18,
        63,  65,  25,  10,  19,  26,  36,  23,  64,  22,   2,  66, 100,
       101, 102, 103, 105, 107, 110, 111, 112, 113, 108, 109, 114, 104,
       106], dtype=int64)

In [40]:
df.xyz_campaign_id.unique()

array([ 916,  936, 1178], dtype=int64)

In [41]:
numeric_variables(df,'age')
numeric_variables(df,'gender')


In [42]:
df.columns

Index(['ad_id', 'xyz_campaign_id', 'fb_campaign_id', 'age', 'gender',
       'interest', 'Impressions', 'Clicks', 'Spent', 'Total_Conversion',
       'Approved_Conversion', 'age_n', 'gender_n'],
      dtype='object')

In [43]:
X = df[['interest','age_n', 'gender_n','Impressions','Clicks','Spent']]
y= df['Approved_Conversion']

In [44]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
import matplotlib.pylab as plt

#the shape of your x train needs to be the same as the sahpe of your xtest / 

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=42)

lr = LinearRegression().fit(Xtrain, ytrain)

print("Coefficients:", lr.coef_)
print("Intercept   :", lr.intercept_)

print("train score :", lr.score(Xtrain, ytrain))
print("test score  :", lr.score(Xtest, ytest))

Coefficients: [-1.13626673e-03 -1.00026842e-01 -7.53365709e-02  7.66385469e-06
 -6.15278939e-03 -1.08854321e-02]
Intercept   : 0.4659855873612042
train score : 0.4736447790411031
test score  : 0.6859448511864739


In [45]:
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(max_depth=2, random_state=0,
                              n_estimators=100)
regr.fit(X, y)  
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)
print(regr.score(Xtest, ytest))

0.6669017852052879


### Linear predictor don't look like a valuable option, reason is that there are no values with which a linear relation could be established. Moreover, impressions and clicks are somehow interdpendent with conversions and for this reason should not be used. Let's try with a Logistic approach

In [46]:
def index_a(df,x,m):
    df['y'] = df[x].shift(1)

    df['a'] = (df['y'] != df[x]).astype(int)

    df.reset_index(inplace=True)

    transition_points = list(df[df['a'] == 1].index)

    cycle_list = []
    for i in range(len(transition_points)):
        try:
            cycle = range(0, (transition_points[i+1] - transition_points[i]))

        except IndexError:
            cycle = range(0, len(df)-len(cycle_list)) #think of a better way to find this number without hardcoding it!

        cycle_list += cycle

    df['f'] = cycle_list
    df['ID_final']= df[x]+'-'+df['f'].astype(str)
    df[m]= 1
    #df.drop(['y','a','f'], axis=1, inplace=True)# why is it not dropping columns?

def new_df_db(df,x):
    y = df.loc[df.index.repeat(df[x])] #how can I insert it f'{x}_base_df' 
    return y



def unique_ID(df,x):
    df['uniqueID'] = df[x].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)

### Apply functions to reshape the dataframe into a single user based one

In [47]:
unique_ID(df,uniqueID_list)

In [49]:
Visits_tab = new_df_db(df,'Clicks')
conversion_tab = new_df_db(df,'Total_Conversion')
purchase_tab = new_df_db(df,'Approved_Conversion')

In [50]:
#need a reference column for the process to work
index_a(Visits_tab,'uniqueID','Clicks')
index_a(purchase_tab,'uniqueID','purchase')
index_a(conversion_tab,'uniqueID','addtocart')

Unnamed: 0,index,ad_id,xyz_campaign_id,fb_campaign_id,age,gender,interest,Impressions,Clicks,Spent,age_n,gender_n,uniqueID,ID_final
0,0,708746,916,103916,30-34,M,15,7350,1,1.43,0,0,708746_916_103916_30-34_M_15,708746_916_103916_30-34_M_15-0
1,1,708749,916,103917,30-34,M,16,17861,1,1.82,0,0,708749_916_103917_30-34_M_16,708749_916_103917_30-34_M_16-0
2,1,708749,916,103917,30-34,M,16,17861,1,1.82,0,0,708749_916_103917_30-34_M_16,708749_916_103917_30-34_M_16-1
3,3,708815,916,103928,30-34,M,28,4259,1,1.25,0,0,708815_916_103928_30-34_M_28,708815_916_103928_30-34_M_28-0
4,4,708818,916,103928,30-34,M,28,4133,1,1.29,0,0,708818_916_103928_30-34_M_28,708818_916_103928_30-34_M_28-0


### Drop not needed columns from dataframes created and remerge them

In [None]:
Visits_tab = Visits_tab.drop(['Approved_Conversion', 'Total_Conversion','y','a','f'],axis=1)
purchase_tab = purchase_tab[['ID_final','Approved_Conversion']]
conversion_tab =conversion_tab[['ID_final','Total_Conversion']]
Visits_tab.head()

In [51]:
df_d =pd.merge(Visits_tab,purchase_tab, how='left', on='ID_final')
df_final=pd.merge(df_d,conversion_tab,how='left', on='ID_final')

In [52]:
df_final.fillna(0,inplace=True)
df_final.head()

Unnamed: 0,index,ad_id,xyz_campaign_id,fb_campaign_id,age,gender,interest,Impressions,Clicks,Spent,age_n,gender_n,uniqueID,ID_final,Approved_Conversion,Total_Conversion
0,0,708746,916,103916,30-34,M,15,7350,1,1.43,0,0,708746_916_103916_30-34_M_15,708746_916_103916_30-34_M_15-0,1.0,2.0
1,1,708749,916,103917,30-34,M,16,17861,1,1.82,0,0,708749_916_103917_30-34_M_16,708749_916_103917_30-34_M_16-0,0.0,2.0
2,1,708749,916,103917,30-34,M,16,17861,1,1.82,0,0,708749_916_103917_30-34_M_16,708749_916_103917_30-34_M_16-1,0.0,2.0
3,3,708815,916,103928,30-34,M,28,4259,1,1.25,0,0,708815_916_103928_30-34_M_28,708815_916_103928_30-34_M_28-0,0.0,1.0
4,4,708818,916,103928,30-34,M,28,4133,1,1.29,0,0,708818_916_103928_30-34_M_28,708818_916_103928_30-34_M_28-0,1.0,1.0


### Apply a simple Logistic model

In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pylab as plt
from sklearn.preprocessing import MinMaxScaler
#the shape of your x train needs to be the same as the sahpe of your xtest / 
X = df_final[['age_n','gender_n','interest']]
y = df_final['Total_Conversion']
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=50)
lr=LogisticRegression()
lr.fit(Xtrain, ytrain)
#scaler = MinMaxScaler()
#scaler.fit(Xtrain) #applying the fit method (which is basically applying)
#Xtrain_scaled_new = scaler.transform(Xtrain)
#Xtrain_scaled_new

print("train score :", lr.score(Xtrain, ytrain))
print("test score  :", lr.score(Xtest, ytest))



train score : 0.9200642839674388
test score  : 0.9216097254244393


### Sweet, That's something we can work with ;)