In [20]:
# !pip install lightgbm
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score

mpl.rcParams['figure.dpi'] = 300
sns.set_style("whitegrid")
pd.options.display.max_columns = None
%config Completer.use_jedi = False

## Data Cleaning

In [21]:
# read data
df_raw = pd.read_csv(
    '/home/tiger/archived-data/aeolus-data/20210413/19/42189423-开言背单词-提升效果-查询17.csv',
    encoding='gbk'
)

In [22]:
df_raw.head()

Unnamed: 0,user_id,install_date,enter_group,occupation,first_milestone_name,city_level,age,edu,gender,channel_group,study_mins,order_yn,aim_module_user
0,2850371729103624,2021-03-27,1,白领上班族,A1高,一线,18-23,high,male,OV,2,0,1
1,3237397476679687,2021-03-20,0,unknown,,四线,31-40,high,female,华为小米,0,0,0
2,88396189668840,2021-03-20,0,白领上班族,A1高,四线,41-50,high,male,华为小米,0,0,0
3,1249480583421064,2021-03-20,0,中小学生,A1中,三线,41-50,high,female,其他,0,0,0
4,1953174379308254,2021-04-07,1,白领上班族,,,24-30,high,male,Apple Store,6,0,0


In [23]:
# fill na
category_cols = []

for c in df_raw.columns:
    if df_raw[c].dtype=='O':
        df_raw[c] = df_raw[c].fillna('unknown')
        category_cols.append(c)
    else:
        df_raw[c] = df_raw[c].fillna(0)

In [24]:
# add cum status
# df_sorted = df_raw\
#     .sort_values(by=['user_id', 'gap_days'])
# df_sorted['cum_finish'] = np.where(df_sorted\
#     .groupby(['user_id'])['advise_finish_yn'].cumsum() > 0, 1, 0)
df_sorted = df_raw[df_raw.study_mins > 0]

In [25]:
df_sorted.groupby(['aim_module_user']).size()

aim_module_user
0    65681
1    28281
dtype: int64

In [42]:
# select training data
# _gap_days = 3
_x = [
    'occupation',
    'first_milestone_name',
    'edu',
    'channel_group',
    'gender',
    'age',
    'city_level',
    'aim_module_user',
    'study_mins',
    'enter_group'
]
categorial_x = [
    'age',
    'first_milestone_name',
    'edu',
    'occupation',
    'city_level',
    'gender',
    'channel_group',
]
_y = 'order_yn'

df_train = df_sorted[lambda x: (x['enter_group']==0)].reset_index()
# df_train = df_sorted.reset_index()

In [43]:
df_train.head()

Unnamed: 0,index,user_id,install_date,enter_group,occupation,first_milestone_name,city_level,age,edu,gender,channel_group,study_mins,order_yn,aim_module_user
0,11,3765168084888824,2021-04-03,0,中小学生,unknown,五线,18-23,high,male,其他,1,0,1
1,27,2955928128461463,2021-04-05,0,白领上班族,unknown,unknown,18-23,college,male,Apple Store,4,0,0
2,33,4416077646530990,2021-03-30,0,中小学生,A1初,四线,31-40,high,female,OV,2,0,1
3,36,2129091693127591,2021-03-25,0,中小学生,A2中,四线,50-,high,female,华为小米,56,0,1
4,37,3554056924052248,2021-03-20,0,白领上班族,unknown,unknown,24-30,high,male,Google-FB,7,0,0


## Standardizing

In [44]:
# x & y
train_x = df_train[_x]
train_y = df_train[_y]

# --------------- SEP ---------------
# label encoding
label_encoders = {}
for c in categorial_x:
    encoder = LabelEncoder()
    train_x[c] = encoder.fit_transform(df_train[c].values)
    label_encoders[c] = encoder

# --------------- SEP ---------------
# counter-factual data
train_x_cf_treated = train_x.assign(aim_module_user=lambda x: 1)
train_x_cf_untreated = train_x.assign(aim_module_user=lambda x: 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [45]:
# training classifier
mod = LGBMClassifier()
mod.fit(train_x, train_y)

LGBMClassifier()

In [46]:
y_pred_train = mod.predict(train_x)
print('Training-set accuracy score: {0:0.4f}'. format(accuracy_score(train_y, y_pred_train)))

Training-set accuracy score: 0.9943


In [47]:
# predict counter-factual results
pred_cf_treated = mod.predict_proba(train_x_cf_treated)[:,1]
pred_cf_untreated = mod.predict_proba(train_x_cf_untreated)[:,1]

In [48]:
# counter-factual mean diff
mean_t = pred_cf_treated.mean()
mean_u = pred_cf_untreated.mean()
print(f"Counter-factual treated: {mean_t:.4f}, untreated: {mean_u:.4f}, lift: {mean_t - mean_u:.4f}")

Counter-factual treated: 0.0086, untreated: 0.0072, lift: 0.0014


In [49]:
# actual mean diff
df_train.groupby(['aim_module_user']).apply(lambda x: pd.Series({
    'user_cnt': x.shape[0],
    'conversion_rate': (x['order_yn']).mean()
})).reset_index()

Unnamed: 0,aim_module_user,user_cnt,conversion_rate
0,0,43820.0,0.006732
1,1,18275.0,0.010342


## IP Weighting

In [16]:
# x & y
train_x = df_train[_x]
train_y = df_train[_y]

# --------------- SEP ---------------
# one-hot encoding
encoder = OneHotEncoder(sparse=False)  

encoded_df = pd.DataFrame(
    encoder.fit_transform(train_x[categorial_x]), 
    columns=encoder.get_feature_names(categorial_x)
)

# combine onehot features
train_x = pd.concat([
        train_x.drop(columns=categorial_x),
        encoded_df
    ],
    axis=1,
)

In [17]:
def ip(y, X):
    mod = LogisticRegression(fit_intercept=True, C=5)
    mod.fit(X, y)
    weights = np.zeros(X.shape[0])
    weights[y == 1] = \
        mod.predict_proba(X.loc[y == 1])[:,1]
    weights[y == 0] = \
        mod.predict_proba(X.loc[y == 0])[:,0]
    return weights

# standardized weights
denoms = ip(
    train_x['aim_module_user'], 
    train_x.drop(columns=['aim_module_user'])
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [18]:
weights = 1 / denoms

treat_mask = (train_x['aim_module_user']==1)
s_weights = np.zeros(train_x.shape[0])
s_weights[treat_mask] = \
    treat_mask.mean() * weights[treat_mask]
s_weights[~treat_mask] = \
    (1 - treat_mask).mean() * weights[~treat_mask]

In [19]:
# fit gee model
train_x_a_1 = pd.DataFrame({
        'a':train_x['aim_module_user'], 
        'one':np.ones(train_x.shape[0])
    })
gee = sm.GEE(
    train_y,
    train_x_a_1,
    groups=train_x_a_1.index.tolist(),
    weights=s_weights
)

gee.fit().summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
a,0.0415,0.003,12.447,0.000,0.035,0.048
one,0.0031,0.003,1.044,0.296,-0.003,0.009
