In [1]:
# !pip install lightgbm
import os
import warnings

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import statsmodels.api as sm
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

import bytedtqs
from pytqs import tqs

# Render figures at 300 dpi.
mpl.rcParams['figure.dpi'] = 300
# Use seaborn's "whitegrid" style for all plots.
sns.set_style("whitegrid")
# Do not truncate columns when displaying wide DataFrames.
pd.options.display.max_columns = None
# Turn off jedi-based tab completion in IPython.
%config Completer.use_jedi = False

## Data Cleaning

In [2]:
# TQS credentials are read from the environment so that secrets never live in
# the notebook or its version history. Export TQS_APP_ID / TQS_APP_KEY before
# starting the kernel.
# NOTE(review): the key pair that used to be hardcoded in this cell should be
# treated as leaked and rotated.
appId = os.environ.get("TQS_APP_ID")
appKey = os.environ.get("TQS_APP_KEY")
# The query username is not a secret; the previous value stays as the default.
user_id = os.environ.get("TQS_USER_ID", 'wuweiwei.1012')

if not (appId and appKey):
    warnings.warn(
        "TQS_APP_ID / TQS_APP_KEY are not set; "
        "the TQS client below will be created without credentials."
    )

In [3]:
# Query text: lift the TQS result-line cap, then select the whole table.
sql_txt = """ 

set tqs.query.result.max.lines = 1000000000;

select * from dm_eo_test.recite_word_vip limit 1000000000
"""

# Submit the query through TQS and load the CSV found at the job's result URL.
client = tqs.TQSClient(appKey=appKey, appId=appId)
job = client.execute_query(sql=sql_txt, username=user_id)
result_url = job['resultUrl']
data = pd.read_csv(result_url)

[2021-04-13 21:23:01,469] - [INFO] - tqs_task_id: 246377698, engine: Hive, status: Created, progress : Submitting
[2021-04-13 21:23:03,500] - [INFO] - tqs_task_id: 246377698, execute id: tqs_246377698-1618320181857
[2021-04-13 21:23:03,501] - [INFO] - application tracking url: http://presto-history.byted.org/proxy/20210413_132301_14671_default
[2021-04-13 21:23:03,501] - [INFO] - tqs_task_id: 246377698, engine: Presto, status: Processing, progress : Accepted
[2021-04-13 21:23:05,539] - [INFO] - tqs_task_id: 246377698, engine: Presto, status: Processing, progress : Accepted
[2021-04-13 21:23:07,575] - [INFO] - tqs_task_id: 246377698, engine: Presto, status: Processing, progress : Accepted
[2021-04-13 21:23:09,600] - [INFO] - tqs_task_id: 246377698, engine: Presto, status: Processing, progress : Accepted
[2021-04-13 21:23:11,634] - [INFO] - tqs_task_id: 246377698, engine: Presto, status: Processing, progress : Accepted
[2021-04-13 21:23:13,663] - [INFO] - tqs_task_id: 246377698, engine: 

In [4]:
# read data
# NOTE: this binds a second name to the same DataFrame object (no copy is
# made), so in-place edits to `df_raw` are also visible through `data`.
df_raw = data

In [5]:
# Preview the first rows of the loaded table.
df_raw.head()

Unnamed: 0,user_id,study_dt,first_milestone_name,occupation,vip_lfc_video,vip_lfc_audio,gambling_count,gambling_status_audio,gambling_status_video,team_count,team_status_audio,team_status_video,scholarship_status_video,scholarship_status_audio,scholarship_succeeded_audio,scholarship_succeeded_video,wemeet_count_all,finish_dubbing_lesson_cnt,words_review_count_all,study_duration_pre,aim_module_user,study_2w5d_yn
0,2422855878,2021-03-16,未定级,office_worker,-1,131,0,0,0,22,1,0,0,1,0,0,0,0,50,0,0,1
1,2422855878,2021-03-17,未定级,office_worker,-1,132,0,0,0,22,1,0,0,1,0,0,0,0,50,2211,0,1
2,2422855878,2021-03-18,未定级,office_worker,-1,133,0,0,0,22,1,0,0,1,0,0,0,0,50,3407,0,1
3,2422855878,2021-03-19,未定级,office_worker,-1,134,0,0,0,22,1,0,0,1,0,0,0,0,50,3407,0,1
4,2422855878,2021-03-20,未定级,office_worker,-1,135,0,0,0,22,1,0,0,1,0,0,0,0,50,7016,0,1


In [6]:
# fill na: object-typed columns get the placeholder 'unknown', every other
# column gets 0. The object-typed column names are collected in category_cols.
category_cols = [col for col in df_raw.columns if df_raw[col].dtype == 'O']

for col in df_raw.columns:
    fill_value = 'unknown' if col in category_cols else 0
    df_raw[col] = df_raw[col].fillna(fill_value)

In [7]:
# Keep only rows whose study_duration_pre exceeds 30.
# NOTE(review): despite its name, `df_sorted` is not sorted; an earlier
# sort-by-user / cumulative-finish step was disabled and only this filter
# remains.
has_prior_study = df_raw['study_duration_pre'] > 30
df_sorted = df_raw[has_prior_study]

In [8]:
# Row counts per value of aim_module_user in the unfiltered table.
df_raw.groupby(['aim_module_user']).size()

aim_module_user
0    7630195
1      32912
dtype: int64

In [9]:
# select training data
# _gap_days = 3
# Model input columns. The list order is the column order of every design
# matrix built from `_x` below, so do not reorder casually.
# 'aim_module_user' is the treatment flag: it is a model input here and is
# overwritten with 1 / 0 to build the counter-factual frames later on.
_x = [
    'occupation',
    'first_milestone_name',
    'vip_lfc_video',
    'vip_lfc_audio',
    'study_duration_pre',
    'aim_module_user',
    'gambling_count',
    'gambling_status_audio',
    'gambling_status_video',
    'team_count',
    'team_status_audio',
    'team_status_video',
    'scholarship_status_video',
    'scholarship_status_audio',
    'scholarship_succeeded_audio',
    'scholarship_succeeded_video',
    'wemeet_count_all',
    'finish_dubbing_lesson_cnt',
    'words_review_count_all',
]
# Subset of `_x` holding string categories; these need encoding before modelling.
categorial_x = [
    'first_milestone_name',
    'occupation',
]
# Binary outcome column.
_y = 'study_2w5d_yn'

# df_train = df_sorted[lambda x: (x['enter_group_yn']==1)].reset_index()
# reset_index() without drop=True keeps the previous index as an extra
# 'index' column; that column is not part of `_x`, so it never reaches a model.
df_train = df_sorted.reset_index()

In [10]:
# Down-sample the untreated arm (aim_module_user == 0) to 1% of its rows and
# keep every treated row. The fixed random_state makes the sample reproducible.
# NOTE: this cell rebinds `df_train`; re-running it without first re-running
# the cell that builds `df_train` would sample the already-sampled frame again.
df_train_0 = df_train[df_train.aim_module_user == 0].sample(frac = 0.01, replace = False, random_state = 1)
df_train_1 = df_train[df_train.aim_module_user == 1]

# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the resulting frame is the same.
df_train = pd.concat([df_train_0, df_train_1])

In [11]:
# Row counts per value of aim_module_user in the current training frame.
df_train.groupby(['aim_module_user']).size()

aim_module_user
0    33137
1    23063
dtype: int64

## Standardization (outcome-model counterfactual prediction)

In [12]:
# x & y
# Take an explicit copy: assigning encoded columns into the bare selection
# `df_train[_x]` triggers pandas' SettingWithCopyWarning, and whether the
# write lands on a view or a copy is not guaranteed.
train_x = df_train[_x].copy()
train_y = df_train[_y]

# --------------- SEP ---------------
# label encoding
# Each categorical column is replaced by integer codes; the fitted encoders
# are kept so the codes can be mapped back to the original labels.
label_encoders = {}
for c in categorial_x:
    encoder = LabelEncoder()
    train_x[c] = encoder.fit_transform(df_train[c].values)
    label_encoders[c] = encoder

# --------------- SEP ---------------
# counter-factual data
# Same covariates for every row, with the treatment flag forced to 1 / 0.
train_x_cf_treated = train_x.assign(aim_module_user=1)
train_x_cf_untreated = train_x.assign(aim_module_user=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [13]:
# training classifier
# Outcome model with the library's default hyperparameters; the treatment
# flag 'aim_module_user' is one of the input columns of train_x.
mod = LGBMClassifier()
mod.fit(train_x, train_y)

LGBMClassifier()

In [14]:
# Accuracy on the same rows the model was fitted on (no held-out data here).
y_pred_train = mod.predict(train_x)
train_accuracy = accuracy_score(train_y, y_pred_train)
print('Training-set accuracy score: {0:0.4f}'.format(train_accuracy))

Training-set accuracy score: 0.7822


In [15]:
# predict counter-factual results
# Column 1 of predict_proba, evaluated on the forced-treated frame first and
# the forced-untreated frame second.
pred_cf_treated, pred_cf_untreated = (
    mod.predict_proba(cf_frame)[:, 1]
    for cf_frame in (train_x_cf_treated, train_x_cf_untreated)
)

In [16]:
# counter-factual mean diff
# Average predicted probability under each forced treatment value, and their gap.
mean_t, mean_u = np.mean(pred_cf_treated), np.mean(pred_cf_untreated)
print(f"Counter-factual treated: {mean_t:.4f}, untreated: {mean_u:.4f}, lift: {mean_t - mean_u:.4f}")

Counter-factual treated: 0.7698, untreated: 0.5006, lift: 0.2692


In [17]:
# actual mean diff
# Observed (unadjusted) row count and outcome rate per treatment arm.
# Named aggregation replaces groupby.apply + pd.Series: it keeps user_cnt as
# an integer (apply coerced it to float) and avoids the slow per-group Python
# call, which newer pandas also warns about when it touches grouping columns.
(
    df_train
    .groupby('aim_module_user')['study_2w5d_yn']
    .agg(user_cnt='size', study_rate='mean')
    .reset_index()
)

Unnamed: 0,aim_module_user,user_cnt,study_rate
0,0,33137.0,0.443553
1,1,23063.0,0.827603


## IP Weighting

In [24]:
# x & y
train_x = df_train[_x]
train_y = df_train[_y]

# --------------- SEP ---------------
# one-hot encoding
# Use the encoder's default sparse output and densify explicitly: the keyword
# that requests dense output was renamed across scikit-learn releases
# (sparse -> sparse_output), while .toarray() works with all of them.
encoder = OneHotEncoder()
encoded_values = encoder.fit_transform(train_x[categorial_x]).toarray()

# The feature-name accessor was renamed as well; support both spellings.
if hasattr(encoder, "get_feature_names_out"):
    encoded_columns = encoder.get_feature_names_out(categorial_x)
else:
    encoded_columns = encoder.get_feature_names(categorial_x)

# Reuse train_x's index. df_train's index is not 0..n-1 after down-sampling,
# and pd.concat(axis=1) aligns on index labels, so a default RangeIndex here
# would misalign the rows and fill both halves with NaN.
encoded_df = pd.DataFrame(
    encoded_values,
    columns=encoded_columns,
    index=train_x.index,
)

# combine onehot features
train_x = pd.concat([
        train_x.drop(columns=categorial_x),
        encoded_df
    ],
    axis=1,
)

In [25]:
def ip(y, X):
    """Return each row's fitted probability of the value of ``y`` it actually has.

    A logistic regression (with intercept, C=5) of ``y`` on ``X`` is fitted.
    Rows with ``y == 1`` receive column 1 of ``predict_proba`` and rows with
    ``y == 0`` receive column 0. Rows with any other value of ``y`` keep 0.

    Parameters
    ----------
    y : binary indicator, compared against 1 and 0; used as a boolean mask
        for ``X.loc``.
    X : DataFrame of covariates with one row per entry of ``y``.

    Returns
    -------
    numpy.ndarray of length ``X.shape[0]``.
    """
    propensity_model = LogisticRegression(fit_intercept=True, C=5).fit(X, y)

    is_one = y == 1
    is_zero = y == 0

    weights = np.zeros(X.shape[0])
    # Score each arm on its own rows and pick that arm's probability column.
    for arm_mask, proba_col in ((is_one, 1), (is_zero, 0)):
        weights[arm_mask] = propensity_model.predict_proba(X.loc[arm_mask])[:, proba_col]
    return weights

# Denominators of the IP weights: for each row, the fitted probability of its
# own observed aim_module_user value given all other columns of train_x.
denoms = ip(
    train_x['aim_module_user'], 
    train_x.drop(columns=['aim_module_user'])
)

In [26]:
# Plain IP weights: inverse of the probability of the observed treatment value.
weights = 1 / denoms

# Stabilized weights: scale each arm's weights by that arm's share of the rows.
treat_mask = train_x['aim_module_user'] == 1
share_treated = treat_mask.mean()
share_untreated = (1 - treat_mask).mean()
s_weights = np.where(
    treat_mask,
    share_treated * weights,
    share_untreated * weights,
)

In [27]:
# fit gee model
# Design matrix: the treatment flag as 'a' plus a constant column 'one'.
train_x_a_1 = (
    train_x[['aim_module_user']]
    .rename(columns={'aim_module_user': 'a'})
    .assign(one=1.0)
)

# Every row forms its own group (the group labels are the row index labels);
# the stabilized IP weights are passed as observation weights.
row_groups = train_x_a_1.index.tolist()
gee = sm.GEE(train_y, train_x_a_1, groups=row_groups, weights=s_weights)

gee.fit().summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
a,0.5058,0.003,168.436,0.000,0.500,0.512
one,0.2022,0.000,1342.444,0.000,0.202,0.202
