In [1]:
import numpy as np, matplotlib as mpl, matplotlib.pyplot as plt, pandas as pd
import seaborn as sns, math, os, warnings

In [2]:
# Load the raw synthetic uplift dataset. `df_data` is kept as read from disk;
# every later preprocessing step works on the `df_model` copy.
# NOTE(review): the info() output further down lists an 'Unnamed: 0' column,
# which looks like a row index that was written out together with the CSV -
# confirm whether it should be dropped (or consumed via index_col=0) so that it
# is not carried along as a model feature.
df_data = pd.read_csv('uplift_synthetic_data_100trials.csv')
df_model = df_data.copy()

In [3]:
# Dataset overview: a preview of the first rows, the column dtypes with
# non-null counts, and a summary of the object-typed columns.

# Only the last expression of a cell is rendered automatically, so the preview
# has to be displayed explicitly; as a bare statement its value was discarded.
display(df_model.head())
df_model.info()

df_model.describe(include=object)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 43 columns):
 #   Column                      Non-Null Count    Dtype  
---  ------                      --------------    -----  
 0   Unnamed: 0                  1000000 non-null  int64  
 1   trial_id                    1000000 non-null  int64  
 2   treatment_group_key         1000000 non-null  object 
 3   conversion                  1000000 non-null  int64  
 4   control_conversion_prob     1000000 non-null  float64
 5   treatment1_conversion_prob  1000000 non-null  float64
 6   treatment1_true_effect      1000000 non-null  float64
 7   x1_informative              1000000 non-null  float64
 8   x2_informative              1000000 non-null  float64
 9   x3_informative              1000000 non-null  float64
 10  x4_informative              1000000 non-null  float64
 11  x5_informative              1000000 non-null  float64
 12  x6_informative              1000000 non-null  float64
 13

Unnamed: 0,treatment_group_key
count,1000000
unique,2
top,control
freq,500000


In [4]:
# For every object-typed column, print each distinct value followed by the
# number of rows that hold it.
object_cols = [col for col in df_model.columns if df_model[col].dtype == "object"]
for obj in object_cols:
    print('\n', obj)
    for unique in df_model[obj].unique():
        # Series.sum() counts the matches in vectorised code; the builtin sum()
        # iterates the boolean Series element by element in Python, which is
        # needlessly slow on a frame of this size. The printed text is the same.
        print("{} {}".format(unique, (df_model[obj] == unique).sum()))


 treatment_group_key
control 500000
treatment1 500000


## Предобработка данных

In [5]:
# Rename the outcome column to `target` and encode the treatment arm as 0/1.
df_model = df_model.rename(columns={'conversion': 'target'})
# `.map` plus an explicit cast is used instead of `.replace`: turning an object
# column into integers through `.replace` relies on implicit downcasting, which
# pandas 2.2 deprecated. Without that downcast the column would stay
# non-numeric and be one-hot encoded by `pd.get_dummies` in the next cell.
# The identity entries (0 -> 0, 1 -> 1) keep the cell safe to re-run; any other
# label maps to NaN and makes the integer cast fail loudly.
df_model['treatment_group_key'] = (
    df_model['treatment_group_key']
    .map({'control': 0, 'treatment1': 1, 0: 0, 1: 1})
    .astype('int64')
)

In [6]:
# One-hot encode whatever object/categorical columns remain in the frame.
# NOTE(review): describe(include=object) above reported treatment_group_key as
# the only object column, and the previous cell already encodes it as integers,
# so this call is expected to leave the columns unchanged - confirm that it is
# still needed.
df_model = pd.get_dummies(df_model)

In [7]:
# Working frames that are handed to the uplift pipeline further down.
df_model_control = df_model.copy()
# Boolean-mask selection already returns a new frame, so no extra copy is made.
# NOTE(review): treatment_group_key is encoded as 0/1 above, so `>= 0` appears
# to keep every row and this frame holds the same data as df_model_control -
# confirm whether a narrower filter was intended.
df_model_treatment = df_model.loc[df_model.treatment_group_key >= 0].reset_index(drop=True)

In [8]:
def declare_tc(df:pd.DataFrame):
    """Add a four-way ``target_class`` label to ``df`` and return that same frame.

    The label combines the treatment flag with the outcome:

    * 0 (CN) - ``treatment_group_key == 0`` and ``target == 0``
    * 1 (CR) - ``treatment_group_key == 0`` and ``target != 0``
    * 2 (TN) - ``treatment_group_key != 0`` and ``target == 0``
    * 3 (TR) - ``treatment_group_key != 0`` and ``target != 0``

    The column is written onto ``df`` itself; no copy is made.
    """
    treated = df.treatment_group_key != 0
    responded = df.target != 0
    # Two binary flags span 0..3: the treatment bit weighs 2, the response bit 1.
    df['target_class'] = 2 * treated.astype('int64') + responded.astype('int64')
    return df

In [9]:
# Attach the four-way target_class label to both working frames. declare_tc
# writes the column onto the frame it receives and returns that same object, so
# each assignment simply rebinds the name to the frame it already referred to.
df_model_control = declare_tc(df_model_control)
df_model_treatment = declare_tc(df_model_treatment)

## Uplift Modeling

In [10]:
# Functions for Uplift
from sklearn.model_selection import train_test_split
import xgboost as xgb
def uplift_split(df_model:pd.DataFrame, test_size:float=0.3, random_state:int=42):
    """Split a labelled frame into train and test partitions.

    The feature matrix is every column of ``df_model`` except ``target`` and
    ``target_class`` (so ``treatment_group_key`` is still part of it); the
    label vector is ``target_class``. The split is stratified on
    ``treatment_group_key``.

    Parameters
    ----------
    df_model : pd.DataFrame
        Frame containing ``target``, ``target_class`` and
        ``treatment_group_key`` columns.
    test_size : float, default 0.3
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split``.
    """
    features = df_model.drop(['target', 'target_class'], axis=1)
    labels = df_model.target_class
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=df_model['treatment_group_key'],
    )
    return X_train, X_test, y_train, y_test


def uplift_model(X_train:pd.DataFrame,
                 X_test:pd.DataFrame,
                 y_train:pd.DataFrame,
                 y_test:pd.DataFrame):
    """Fit a default ``XGBClassifier`` on the class labels and score the test rows.

    ``treatment_group_key`` is dropped from the features before fitting and
    predicting, but stays in the returned frame.

    Returns a copy of ``X_test`` extended with the columns ``proba_CN``,
    ``proba_CR``, ``proba_TN``, ``proba_TR`` (the four ``predict_proba``
    columns, read positionally), ``uplift_score`` and ``target_class``
    (taken from ``y_test``).
    """
    # Work on a copy so the caller's test frame is left untouched.
    scored = pd.DataFrame(X_test).copy()

    train_features = X_train.drop('treatment_group_key', axis=1)
    classifier = xgb.XGBClassifier().fit(train_features, y_train)

    test_features = X_test.drop('treatment_group_key', axis=1)
    class_proba = classifier.predict_proba(test_features)

    # Probability columns are taken in positional order 0..3.
    for position, column in enumerate(('proba_CN', 'proba_CR', 'proba_TN', 'proba_TR')):
        scored[column] = class_proba[:, position]

    # Score = P(CN | control) + P(TR | treated) - P(TN | treated) - P(CR | control),
    # each term normalised within its own arm.
    scored['uplift_score'] = scored.eval('\
    proba_CN/(proba_CN+proba_CR) \
    + proba_TR/(proba_TN+proba_TR) \
    - proba_TN/(proba_TN+proba_TR) \
    - proba_CR/(proba_CN+proba_CR)')

    # Keep the true label next to the scores for later evaluation.
    scored['target_class'] = y_test
    return scored


def uplift(df_model:pd.DataFrame):
    """Run the train/test split and the uplift model back to back.

    Returns whatever ``uplift_model`` returns for the split produced by
    ``uplift_split(df_model)``.
    """
    # uplift_split returns (X_train, X_test, y_train, y_test), which is exactly
    # the positional order uplift_model expects.
    return uplift_model(*uplift_split(df_model))

In [11]:
# Run the uplift pipeline on both working frames.
# NOTE(review): df_model_treatment is built with a `>= 0` filter on a 0/1
# column, so both frames appear to hold the same rows; with the fixed
# random_state inside uplift_split the two runs would then train on the same
# split - confirm that two separate runs are intended.
# NOTE(review): uplift_split only removes `target` and `target_class`, so the
# columns listed by info() above - e.g. control_conversion_prob,
# treatment1_conversion_prob, treatment1_true_effect, trial_id - are passed to
# the model as features. They look like ground truth / bookkeeping of the
# synthetic generator; confirm they should be model inputs.

control_uplift = uplift(df_model_control)
treatment_uplift = uplift(df_model_treatment)