In [2]:
# !pip install lightgbm
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score

import bytedtqs
from pytqs import tqs

mpl.rcParams['figure.dpi'] = 300
sns.set_style("whitegrid")
pd.options.display.max_columns = None
%config Completer.use_jedi = False

## Data Cleaning

In [3]:
appId = "eL38NWjpAMXlC0ifCoo71wk27zE6CS9YQismEcwVH5oNKbJf"
appKey = "VeeOKtXLxjGWE2OyIGkuqWRT33isyxtHH1xlxakeFzId6SMB"
user_id = 'wuweiwei.1012'

sql_txt = """ 

set tqs.query.result.max.lines = 1000000000;

select * from dm_eo_test.new_fm_new_user limit 1000000000
"""

client = tqs.TQSClient(appId=appId, appKey=appKey)
job = client.execute_query(username = user_id , sql = sql_txt)
data = pd.read_csv(job['resultUrl'])

df_raw = data

[2021-11-04 18:10:59,277] - [INFO] - tqs_task_id: 474237484, engine: , status: Created, progress : Submitting
[2021-11-04 18:11:01,304] - [INFO] - tqs_task_id: 474237484, execute id: tqs_474237484-1636020659537
[2021-11-04 18:11:01,305] - [INFO] - application tracking url: http://presto-history.byted.org/proxy/20211104_101059_00320_mnj82_default
[2021-11-04 18:11:01,306] - [INFO] - tqs_task_id: 474237484, engine: Presto, status: Processing, progress : Accepted
[2021-11-04 18:11:03,334] - [INFO] - tqs_task_id: 474237484, engine: Presto, status: Processing, progress : Accepted
[2021-11-04 18:11:05,385] - [INFO] - tqs_task_id: 474237484, engine: Presto, status: Completed, progress : Accepted


In [4]:
df_raw.head()

Unnamed: 0,user_id,install_date,new_dt,gap_days,enter_group,occupation,first_milestone_name,city_level,age,edu,gender,channel_group,study_mins,aim_module_user,study_dur_next
0,4934727355,2021-11-01,2021-11-01,0,0,unknown,,一线,31-40,,male,Apple Store,0,0,0.0
1,4934727355,2021-11-01,2021-11-02,1,0,unknown,,一线,31-40,,male,Apple Store,0,0,0.0
2,4934727355,2021-11-01,2021-11-03,2,0,unknown,,一线,31-40,,male,Apple Store,0,0,
3,86207305934,2021-11-01,2021-11-01,0,0,unknown,,,31-40,,male,Apple Store,0,0,0.0
4,86207305934,2021-11-01,2021-11-02,1,0,unknown,,,31-40,,male,Apple Store,0,0,0.0


In [5]:
# fill na
category_cols = []

for c in df_raw.columns:
    if df_raw[c].dtype=='O':
        df_raw[c] = df_raw[c].fillna('unknown')
        category_cols.append(c)
    else:
        df_raw[c] = df_raw[c].fillna(0)

In [6]:
df_raw.groupby(['aim_module_user']).size()

aim_module_user
0    123570
1      1735
dtype: int64

In [20]:
# add cum status
df_sorted = df_raw\
    .sort_values(by=['user_id', 'gap_days'])
df_sorted['cum_var'] = np.where(df_sorted\
    .groupby(['user_id'])['aim_module_user'].cumsum() > 0, 1, 0)
# df_sorted = df_raw

In [21]:
# select training data
# _gap_days = 3
_x = [
    'occupation',
    'first_milestone_name',
    'edu',
    'channel_group',
    'gender',
    'age',
    'city_level',
    'cum_var',
    'study_mins',
    'enter_group'
]
categorial_x = [
    'age',
    'first_milestone_name',
    'edu',
    'occupation',
    'city_level',
    'gender',
    'channel_group',
]
_y = 'study_dur_next'

df_train = df_sorted.reset_index()

## Standardizing

In [25]:
macth_summary = pd.DataFrame(columns = ["gap_days", "auc", "test", "control", "uplift", "test_st", "control_st"])

for _gap_days in range(0,5):
    df_train = df_sorted[lambda x: (x['gap_days']==_gap_days)].reset_index()
    
    # x & y
    train_x = df_train[_x]
    train_y = np.where(df_train[_y] > 0, 1, 0)

    # --------------- SEP ---------------
    # label encoding
    label_encoders = {}
    for c in categorial_x:
        encoder = LabelEncoder()
        train_x[c] = encoder.fit_transform(df_train[c].values)
        label_encoders[c] = encoder

    # --------------- SEP ---------------
    # counter-factual data
    train_x_cf_treated = train_x.assign(cum_var=lambda x: 1)
    train_x_cf_untreated = train_x.assign(cum_var=lambda x: 0)
    
    # training classifier
    mod = LGBMClassifier()
    mod.fit(train_x, train_y)
    
    y_pred_train = mod.predict(train_x)

    # predict counter-factual results
    pred_cf_treated = mod.predict_proba(train_x_cf_treated)[:,1]
    pred_cf_untreated = mod.predict_proba(train_x_cf_untreated)[:,1]
    
    # counter-factual mean diff
    mean_t = pred_cf_treated.mean()
    mean_u = pred_cf_untreated.mean()
    
    # actual mean diff
    df_stats = df_train.groupby(['cum_var']).apply(lambda x: pd.Series({
        'user_cnt': x.shape[0],
        'retention_rate': (x[_y]>0).mean()
    })).reset_index()
    
    macth_summary = macth_summary.append({"gap_days": _gap_days , \
                                          "auc": "{0:0.4f}". format(accuracy_score(train_y, y_pred_train)), \
                                          "test": "{:.2%}".format(mean_t), \
                                          "control": "{:.2%}".format(mean_u), \
                                          "uplift": "{:.2%}".format(mean_t - mean_u),\
                                          "test_st": "{:.2%}".format(df_stats.iloc[1,2]),\
                                          "control_st": "{:.2%}".format(df_stats.iloc[0,2])\
                                         }, ignore_index = True)


macth_summary

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

Unnamed: 0,gap_days,auc,test,control,uplift,test_st,control_st
0,0,0.8998,18.69%,17.52%,1.17%,35.19%,16.04%
1,1,0.9341,15.29%,12.17%,3.13%,31.56%,11.00%
2,2,0.9478,11.74%,9.91%,1.84%,26.60%,8.68%
3,3,0.9532,12.52%,8.37%,4.14%,24.42%,7.24%
4,4,0.9666,10.41%,6.53%,3.87%,21.51%,5.59%


In [29]:
    df_train = df_sorted.reset_index()
    
    # x & y
    train_x = df_train[_x]
    train_y = np.where(df_train[_y] > 0, 1, 0)

    # --------------- SEP ---------------
    # label encoding
    label_encoders = {}
    for c in categorial_x:
        encoder = LabelEncoder()
        train_x[c] = encoder.fit_transform(df_train[c].values)
        label_encoders[c] = encoder

    # --------------- SEP ---------------
    # counter-factual data
    train_x_cf_treated = train_x.assign(cum_var=lambda x: 1)
    train_x_cf_untreated = train_x.assign(cum_var=lambda x: 0)
    
    # training classifier
    mod = LGBMClassifier()
    mod.fit(train_x, train_y)
    
    y_pred_train = mod.predict(train_x)

    # predict counter-factual results
    pred_cf_treated = mod.predict_proba(train_x_cf_treated)[:,1]
    pred_cf_untreated = mod.predict_proba(train_x_cf_untreated)[:,1]
    
    # counter-factual mean diff
    mean_t = pred_cf_treated.mean()
    mean_u = pred_cf_untreated.mean()
    
    # actual mean diff
    df_stats = df_train.groupby(['cum_var']).apply(lambda x: pd.Series({
        'user_cnt': x.shape[0],
        'retention_rate': (x[_y]>0).mean()
    })).reset_index()
    
    macth_summary = macth_summary.append({"gap_days": 6 , \
                                          "auc": "{0:0.4f}". format(accuracy_score(train_y, y_pred_train)), \
                                          "test": "{:.2%}".format(mean_t), \
                                          "control": "{:.2%}".format(mean_u), \
                                          "uplift": "{:.2%}".format(mean_t - mean_u),\
                                          "test_st": "{:.2%}".format(df_stats.iloc[1,2]),\
                                          "control_st": "{:.2%}".format(df_stats.iloc[0,2])\
                                         }, ignore_index = True)


macth_summary

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Unnamed: 0,gap_days,auc,test,control,uplift,test_st,control_st
0,0,0.8998,18.69%,17.52%,1.17%,35.19%,16.04%
1,1,0.9341,15.29%,12.17%,3.13%,31.56%,11.00%
2,2,0.9478,11.74%,9.91%,1.84%,26.60%,8.68%
3,3,0.9532,12.52%,8.37%,4.14%,24.42%,7.24%
4,4,0.9666,10.41%,6.53%,3.87%,21.51%,5.59%
5,6,0.9295,12.01%,9.56%,2.46%,24.24%,8.40%


## IP Weighting

In [27]:
# x & y
train_x = df_train[_x]
train_y = df_train[_y]

# --------------- SEP ---------------
# one-hot encoding
encoder = OneHotEncoder(sparse=False)  

encoded_df = pd.DataFrame(
    encoder.fit_transform(train_x[categorial_x]), 
    columns=encoder.get_feature_names(categorial_x)
)

# combine onehot features
train_x = pd.concat([
        train_x.drop(columns=categorial_x),
        encoded_df
    ],
    axis=1,
)

def ip(y, X):
    mod = LogisticRegression(fit_intercept=True, C=5)
    mod.fit(X, y)
    weights = np.zeros(X.shape[0])
    weights[y == 1] = \
        mod.predict_proba(X.loc[y == 1])[:,1]
    weights[y == 0] = \
        mod.predict_proba(X.loc[y == 0])[:,0]
    return weights

# standardized weights
denoms = ip(
    train_x['cum_var'], 
    train_x.drop(columns=['cum_var'])
)

weights = 1 / denoms

treat_mask = (train_x['cum_var']==1)
s_weights = np.zeros(train_x.shape[0])
s_weights[treat_mask] = \
    treat_mask.mean() * weights[treat_mask]
s_weights[~treat_mask] = \
    (1 - treat_mask).mean() * weights[~treat_mask]

# fit gee model
train_x_a_1 = pd.DataFrame({
        'a':train_x['cum_var'], 
        'one':np.ones(train_x.shape[0])
    })
gee = sm.GEE(
    train_y,
    train_x_a_1,
    groups=train_x_a_1.index.tolist(),
    weights=s_weights
)

gee.fit().summary().tables[1]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
a,-165.6434,148.026,-1.119,0.263,-455.770,124.483
one,171.5397,148.025,1.159,0.247,-118.585,461.664
