# 数据处理&特征工程

In [None]:
import datetime
import numpy as np
import pandas as pd
import joblib
import warnings
import logging
import os
import gc
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import collections
import re
import copy
import xgboost as xgb
import lightgbm as lgb
import shap
import statsmodels.api as sm

import utils

from tqdm import tqdm
from dateutil.relativedelta import relativedelta
from joblib import Parallel, delayed
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.inspection import permutation_importance

# pd.set_option('display.max_columns', None)
# pd.set_option('max_row', 500)
warnings.filterwarnings('ignore')
tqdm.pandas(desc='pandas bar')

KeyboardInterrupt: 

In [None]:
xgb.__version__

In [None]:
lgb.__version__

## 数据

* fusion

In [None]:
# df_fusion = pd.read_csv('data/sample_label_feature_fusion_new_dream_20221120_20221211.txt', sep='\t', encoding='utf-8')

df_fusion = pd.read_csv('data/sample_label_feature_increment_fusion_new_20230813_20230903.txt', sep='\t', encoding='utf-8')

print(df_fusion.shape)
df_fusion.head()

In [None]:
[x for x in df_fusion.columns if x.endswith('.1')]

In [None]:
df_fusion.info()

In [None]:
df_fusion['obs_dt'].value_counts()

In [None]:
df_fusion['card'].value_counts()

In [None]:
df_fusion['label'].value_counts()

In [None]:
# 271663 / 271663
569717 / 569717

In [None]:
df_fusion[['card', 'label', 'uid']].groupby(by=['card', 'label']).count()

In [None]:
# Parse the observation date column into datetime values before persisting.
df_fusion['obs_dt'] = pd.to_datetime(df_fusion['obs_dt'])

# utils.save_pickle(df_fusion, 'data/other/balance/feats/df_fusion_20221120_20221211.pickle')
# NOTE(review): the file name ends in "20220903" while the source txt and the
# sibling pickles use "20230903" -- this looks like a typo. It is kept because
# the later load of this pickle uses the same spelling; rename both together.
utils.save_pickle(df_fusion, 'data/other/increment_balance_24/df_fusion_20230813_20220903.pickle')

* aspiration

In [None]:
# df_aspiration = pd.read_csv('data/sample_label_feature_aspiration_new_dream_part1_20221120_20221211.txt', sep='\t', encoding='utf-8')
# df_aspiration = pd.read_csv('data/sample_label_feature_aspiration_new_dream_part2_20221120_20221211.txt', sep='\t', encoding='utf-8')

df_aspiration = pd.read_csv('data/sample_label_feature_increment_aspiration_new_part1_20230813_20230903.txt', sep='\t', encoding='utf-8')
# df_aspiration = pd.read_csv('data/sample_label_feature_increment_aspiration_new_part2_20230813_20230903.txt', sep='\t', encoding='utf-8')

print(df_aspiration.shape)
df_aspiration.head()

In [None]:
list_feats_repeat = [x for x in df_aspiration.columns if x.endswith('.1')]
list_feats_repeat

In [None]:
df_aspiration.info()

In [None]:
df_aspiration['obs_dt'].value_counts()

In [None]:
df_aspiration['card'].value_counts()

In [None]:
df_aspiration['label'].value_counts()

In [None]:
# 271663 / 271663
569717 / 569717

In [None]:
df_aspiration[['card', 'label', 'uid']].groupby(by=['card', 'label']).count()

In [None]:
df_aspiration['obs_dt'] = pd.to_datetime(df_aspiration['obs_dt'])

# utils.save_pickle(df_aspiration, 'data/other/balance/feats/df_aspiration_part1_20221120_20221211.pickle')

utils.save_pickle(df_aspiration, 'data/other/increment_balance_24/df_aspiration_part1_20230813_20230903.pickle')
# utils.save_pickle(df_aspiration, 'data/other/increment_balance_24/df_aspiration_part2_20230813_20230903.pickle')

* debit card

In [None]:
df_dc = pd.read_csv('data/sample_label_feature_increment_debit_card_20230813_20230903.txt', sep='\t', encoding='utf-8')
print(df_dc.shape)
df_dc.head()

In [None]:
list_feats_repeat = [x for x in df_dc.columns if x.endswith('.1')]
list_feats_repeat

In [None]:
df_dc.info()

In [None]:
df_dc['obs_dt'].value_counts()

In [None]:
df_dc['card'].value_counts()

In [None]:
df_dc['label'].value_counts()

In [None]:
569717 / 569717

In [None]:
df_dc[['card', 'label', 'uid']].groupby(by=['card', 'label']).count()

In [None]:
df_dc['obs_dt'] = pd.to_datetime(df_dc['obs_dt'])

utils.save_pickle(df_dc, 'data/other/increment_balance_24/df_dc_20230813_20230903.pickle')

* flow cashier

In [None]:
# df_fc = pd.read_csv('data/sample_label_feature_flow_cashier_20230730_20230820.txt', sep='\t', encoding='utf-8')

df_fc = pd.read_csv('data/sample_label_feature_increment_flow_cashier_20230813_20230903.txt', sep='\t', encoding='utf-8')

print(df_fc.shape)
df_fc.head()

In [None]:
list_feats_repeat = [x for x in df_fc.columns if x.endswith('.1')]
list_feats_repeat

In [None]:
df_fc.info()

In [None]:
df_fc['obs_dt'].value_counts()

In [None]:
df_fc['card'].value_counts()

In [None]:
df_fc['label'].value_counts()

In [None]:
# 870452 / 870452
569717 / 569717

In [None]:
df_fc[['card', 'label', 'uid']].groupby(by=['card', 'label']).count()

In [None]:
df_fc['obs_dt'] = pd.to_datetime(df_fc['obs_dt'])

# utils.save_pickle(df_fc, 'data/other/balance_25/df_fc_20230730_20230820.pickle')
utils.save_pickle(df_fc, 'data/other/increment_balance_24/df_fc_20230813_20230903.pickle')

* xxpay

In [None]:
df_xxpay = pd.read_csv('data/other/balance_34/sample_label_feature_xxpay_20240225_20240317.txt', sep='\t', encoding='utf-8')

print(df_xxpay.shape)
df_xxpay.head()

In [None]:
list_feats_repeat = [x for x in df_xxpay.columns if x.endswith('.1')]
list_feats_repeat

In [None]:
df_xxpay.info()

In [None]:
df_xxpay['obs_dt'].value_counts()

In [None]:
df_xxpay['card'].value_counts()

In [None]:
df_xxpay['label'].value_counts()

In [None]:
842647 / 842647

In [None]:
df_xxpay[['card', 'label', 'uid']].groupby(by=['card', 'label']).count()

In [None]:
df_xxpay['obs_dt'] = pd.to_datetime(df_xxpay['obs_dt'])

utils.save_pickle(df_xxpay, 'data/other/balance_34/df_xxpay_20240225_20240317.pickle')

## 特征选择

* 方差

In [None]:
# fusion
# df = utils.load_pickle('data/other/balance/feats/df_fusion_20221120_20221211.pickle')

# df = utils.load_pickle('data/other/increment_balance_24/df_fusion_20230813_20220903.pickle')

# aspiration
# df = utils.load_pickle('data/other/balance/feats/df_aspiration_part1_20221120_20221211.pickle')
# df = utils.load_pickle('data/other/balance/feats/df_aspiration_part2_20221120_20221211.pickle')

# df = utils.load_pickle('data/other/increment_balance_24/df_aspiration_part1_20230813_20230903.pickle')
# df = utils.load_pickle('data/other/increment_balance_24/df_aspiration_part2_20230813_20230903.pickle')

# debit card
# df = utils.load_pickle('data/other/increment_balance_24/df_dc_20230813_20230903.pickle')

# flow cashier
# df = utils.load_pickle('data/other/balance_25/df_fc_20230730_20230820.pickle')

# df = utils.load_pickle('data/other/increment_balance_24/df_fc_20230813_20230903.pickle')

# xxpay
df = utils.load_pickle('data/other/balance_34/df_xxpay_20240225_20240317.pickle')

print(df.shape)
df.head()

In [None]:
df_des = utils.df_des(df)
print(df_des.shape)
df_des.head()

In [None]:
# df_des.to_csv('data/df_des_fusion_20221120_20221211.csv', encoding='utf-8')
# df_des.to_csv('data/df_des_aspiration_part1_20221120_20221211.csv', encoding='utf-8')
# df_des.to_csv('data/df_des_aspiration_part2_20221120_20221211.csv', encoding='utf-8')
# df_des.to_csv('data/df_des_fc_20230730_20230820.csv', encoding='utf-8')

# df_des.to_csv('data/other/increment_balance_24/df_des_fusion_20230813_20230903.csv', encoding='utf-8')
# df_des.to_csv('data/other/increment_balance_24/df_des_asp1_20230813_20230903.csv', encoding='utf-8')
# df_des.to_csv('data/other/increment_balance_24/df_des_asp2_20230813_20230903.csv', encoding='utf-8')
# df_des.to_csv('data/other/increment_balance_24/df_des_dc_20230813_20230903.csv', encoding='utf-8')
# df_des.to_csv('data/other/increment_balance_24/df_des_fc_20230813_20230903.csv', encoding='utf-8')

df_des.to_csv('data/other/balance_34/df_des_xxapy_20240225_20240317.csv', encoding='utf-8')

In [None]:
list_feats_id_dt_card_y = ['uid', 'obs_dt', 'card', 'label']
list_feats_x = [x for x in df.columns if x not in list_feats_id_dt_card_y]
print(len(list_feats_x))
list_feats_x[:10]

In [None]:
# utils.save_pickle(list_feats_x, 'data/other/balance/feats/list_feats/list_feats_x_fusion_20221120_20221211.pickle')
# utils.save_pickle(list_feats_x, 'data/other/balance/feats/list_feats/list_feats_x_aspiration_part1_20221120_20221211.pickle')
# utils.save_pickle(list_feats_x, 'data/other/balance/feats/list_feats/list_feats_x_aspiration_part2_20221120_20221211.pickle')
# utils.save_pickle(list_feats_x, 'data/other/balance_25/list_feats/list_feats_x_fc_20230730_20230820.pickle')

# utils.save_pickle(list_feats_x, 'data/other/increment_balance_24/list_feats/list_feats_x_fusion_20230813_20230903.pickle')
# utils.save_pickle(list_feats_x, 'data/other/increment_balance_24/list_feats/list_feats_x_asp1_20230813_20230903.pickle')
# utils.save_pickle(list_feats_x, 'data/other/increment_balance_24/list_feats/list_feats_x_asp2_20230813_20230903.pickle')
# utils.save_pickle(list_feats_x, 'data/other/increment_balance_24/list_feats/list_feats_x_dc_20230813_20230903.pickle')
# utils.save_pickle(list_feats_x, 'data/other/increment_balance_24/list_feats/list_feats_x_fc_20230813_20230903.pickle')

utils.save_pickle(list_feats_x, 'data/other/balance_34/list_feats/list_feats_x_xxpay_20240225_20240317.pickle')

In [None]:
list_feats_x_std_0 = [x for x in df_des[df_des['std']==0].index]
print(len(list_feats_x_std_0))
list_feats_x_std_0[:10]

In [None]:
# utils.save_pickle(list_feats_x_std_0, 'data/other/balance/feats/list_feats/list_feats_x_std_0_fusion_20221120_20221211.pickle')
# utils.save_pickle(list_feats_x_std_0, 'data/other/balance/feats/list_feats/list_feats_x_std_0_aspiration_part1_20221120_20221211.pickle')
# utils.save_pickle(list_feats_x_std_0, 'data/other/balance/feats/list_feats/list_feats_x_std_0_aspiration_part2_20221120_20221211.pickle')
# utils.save_pickle(list_feats_x_std_0, 'data/other/balance_25/list_feats/list_feats_x_std_0_fc_20230730_20230820.pickle')

# utils.save_pickle(list_feats_x_std_0, 'data/other/increment_balance_24/list_feats/list_feats_x_std_0_fusion_20230813_20230903.pickle')
# utils.save_pickle(list_feats_x_std_0, 'data/other/increment_balance_24/list_feats/list_feats_x_std_0_asp1_20230813_20230903.pickle')
# utils.save_pickle(list_feats_x_std_0, 'data/other/increment_balance_24/list_feats/list_feats_x_std_0_asp2_20230813_20230903.pickle')
# utils.save_pickle(list_feats_x_std_0, 'data/other/increment_balance_24/list_feats/list_feats_x_std_0_dc_20230813_20230903.pickle')
# utils.save_pickle(list_feats_x_std_0, 'data/other/increment_balance_24/list_feats/list_feats_x_std_0_fc_20230813_20230903.pickle')

utils.save_pickle(list_feats_x_std_0, 'data/other/balance_34/list_feats/list_feats_x_std_0_xxpay_20240225_20240317.pickle')

In [None]:
list_feats_x_std = [x for x in list_feats_x if x not in list_feats_x_std_0]
print(len(list_feats_x_std))
list_feats_x_std[:10]

In [None]:
# utils.save_pickle(list_feats_x_std, 'data/other/balance/feats/list_feats/list_feats_x_std_fusion_20221120_20221211.pickle')
# utils.save_pickle(list_feats_x_std, 'data/other/balance/feats/list_feats/list_feats_x_std_aspiration_part1_20221120_20221211.pickle')
# utils.save_pickle(list_feats_x_std, 'data/other/balance/feats/list_feats/list_feats_x_std_aspiration_part2_20221120_20221211.pickle')
# utils.save_pickle(list_feats_x_std, 'data/other/balance_25/list_feats/list_feats_x_std_fc_20230730_20230820.pickle')

# utils.save_pickle(list_feats_x_std, 'data/other/increment_balance_24/list_feats/list_feats_x_std_fusion_20230813_20230903.pickle')
# utils.save_pickle(list_feats_x_std, 'data/other/increment_balance_24/list_feats/list_feats_x_std_asp1_20230813_20230903.pickle')
# utils.save_pickle(list_feats_x_std, 'data/other/increment_balance_24/list_feats/list_feats_x_std_asp2_20230813_20230903.pickle')
# utils.save_pickle(list_feats_x_std, 'data/other/increment_balance_24/list_feats/list_feats_x_std_dc_20230813_20230903.pickle')
# utils.save_pickle(list_feats_x_std, 'data/other/increment_balance_24/list_feats/list_feats_x_std_fc_20230813_20230903.pickle')

utils.save_pickle(list_feats_x_std, 'data/other/balance_34/list_feats/list_feats_x_std_xxpay_20240225_20240317.pickle')

* 多因素方差分析（假设检验，整体（先验+特征）f检验）

In [None]:
# list_feats_x_std = utils.load_pickle('data/other/balance/feats/list_feats/list_feats_x_std_fusion_20221120_20221211.pickle')
# list_feats_x_std = utils.load_pickle('data/other/balance/feats/list_feats/list_feats_x_std_aspiration_part1_20221120_20221211.pickle')
# list_feats_x_std = utils.load_pickle('data/other/balance/feats/list_feats/list_feats_x_std_aspiration_part2_20221120_20221211.pickle')
# list_feats_x_std = utils.load_pickle('data/other/balance_25/list_feats/list_feats_x_std_fc_20230730_20230820.pickle')

# list_feats_x_std = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_std_fusion_20230813_20230903.pickle')
# list_feats_x_std = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_std_asp1_20230813_20230903.pickle')
# list_feats_x_std = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_std_asp2_20230813_20230903.pickle')
# list_feats_x_std = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_std_dc_20230813_20230903.pickle')
# list_feats_x_std = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_std_fc_20230813_20230903.pickle')

list_feats_x_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_std_xxpay_20240225_20240317.pickle')

print(len(list_feats_x_std))
list_feats_x_std[:10]

In [None]:
# Card encoding
# Target item cards (new)
# dict_card = utils.load_pickle('data/dict_card_new.pickle')
# dict_card = utils.load_pickle('data/other/balance_24/dict_card_24.pickle')
dict_card = utils.load_pickle('data/other/balance_34/dict_card_34.pickle')
# Inline alternative kept for reference. Every line of the literal must stay
# commented out: leaving the entries active is a syntax error.
# dict_card = {
#     'item1-a': 10,
#     'item1-b': 11,
# }
dict_card

In [None]:
# Encode the card through dict_card and assign the result back. Calling
# replace(inplace=True) on the df['card_id'] selection is chained in-place
# mutation: it is deprecated and does not update df under pandas Copy-on-Write.
df['card_id'] = df['card'].replace(dict_card)
df.head()

In [None]:
df[['card', 'card_id', 'uid']].groupby(['card', 'card_id']).count()

In [None]:
df['card_id'].nunique()

In [None]:
# Item feature processing: one-hot encode each column listed in list_feats_ohe.
list_feats_ohe = ['card_id']
list_df_ohe_card = []
# The with-block closes the progress bar on normal exit and on interruption
# alike, so no explicit close() is needed.
with tqdm(list_feats_ohe) as t:
    for feat in t:
        df_ohe_feat = utils.one_hot_encoder(df, feat)
        list_df_ohe_card += [df_ohe_feat]

# Put the encoded blocks side by side (one block per encoded column).
df_ohe_card = pd.concat(list_df_ohe_card, axis=1)
print(df_ohe_card.shape)
df_ohe_card.head()

In [None]:
df_ohe = pd.concat([df, df_ohe_card], axis=1)
print(df_ohe.shape)
df_ohe.head()

In [None]:
def multi_anova_f(data, label, list_item_feats, user_feat):
    """Test whether one user feature adds signal on top of the item features.

    Fits an OLS regression of ``data[label]`` on an intercept, the columns in
    ``list_item_feats`` and the single column ``user_feat``, then runs an
    F-test of the hypothesis that the ``user_feat`` coefficient is zero.

    Args:
        data: DataFrame holding the label, item-feature and user-feature columns.
        label: Name of the target column.
        list_item_feats: Names of the item feature columns used as controls.
        user_feat: Name of the user feature column under test.

    Returns:
        A one-row DataFrame with columns ``feature`` (the tested column name)
        and ``p_value`` (the F-test p-value).
    """
    Y = data[label]
    X = data[list_item_feats + [user_feat]]
    # add_constant prepends the intercept, so user_feat stays the last column.
    X = sm.add_constant(X)

    result = sm.OLS(Y, X).fit()

    # Contrast selecting only the last coefficient (user_feat). Sizing it from
    # the design matrix keeps it valid even when add_constant skips the
    # intercept because a constant column is already present.
    contrast = np.zeros(X.shape[1])
    contrast[-1] = 1
    p = result.f_test(contrast).pvalue

    return pd.DataFrame({'feature': [user_feat], 'p_value': [p]})


def apply_parallel(func, data, list_feats, label, list_item_feats, n_jobs=8):
    """Run ``func`` once per user feature on joblib workers.

    Each call receives only the columns it needs (label, item features and the
    one user feature) rather than the whole frame.

    Args:
        func: Callable invoked as ``func(sub_frame, label, list_item_feats, user_feat)``.
        data: DataFrame containing the label, item-feature and user-feature columns.
        list_feats: User feature column names; one task is scheduled per name.
        label: Name of the target column.
        list_item_feats: Names of the item feature columns passed to every task.
        n_jobs: Number of joblib workers. Defaults to 8, the previously
            hard-coded value.

    Returns:
        List with one ``func`` result per entry of ``list_feats``.
    """
    columns_shared = [label] + list_item_feats
    # The with-block closes the progress bar on every exit path, including
    # KeyboardInterrupt, so no separate except/close is required.
    with tqdm(list_feats) as t:
        list_paraller = Parallel(n_jobs=n_jobs)(
            delayed(func)(data[columns_shared + [user_feat]], label, list_item_feats, user_feat)
            for user_feat in t
        )

    return list_paraller

In [None]:
list_item_feats = list(df_ohe_card.columns)
list_cols_shared = ['label'] + list_item_feats

# Results are appended one by one so that an interrupted run still leaves the
# partial list available in the session.
list_df_multi_anova_f = []
with tqdm(list_feats_x_std) as t:
    for user_feat in t:
        df_multi_anova_f_tmp = multi_anova_f(
            df_ohe[list_cols_shared + [user_feat]], 'label', list_item_feats, user_feat)
        list_df_multi_anova_f += [df_multi_anova_f_tmp]

# list_df_multi_anova_f = apply_parallel(multi_anova_f, df_ohe, list_feats_x_std, 'label', list_item_feats)

# Stack the one-row results into a single frame with a fresh 0..n-1 index.
df_multi_anova_f = pd.concat(list_df_multi_anova_f, axis=0)
df_multi_anova_f = df_multi_anova_f.reset_index(drop=True)
print(df_multi_anova_f.shape)
df_multi_anova_f.head()

In [None]:
# utils.save_pickle(df_multi_anova_f, 'data/other/increment_balance_24/list_feats/df_multi_anova_f_fusion_20230813_20230903.pickle')
# utils.save_pickle(df_multi_anova_f, 'data/other/increment_balance_24/list_feats/df_multi_anova_f_asp1_20230813_20230903.pickle')
# utils.save_pickle(df_multi_anova_f, 'data/other/increment_balance_24/list_feats/df_multi_anova_f_asp2_20230813_20230903.pickle')
# utils.save_pickle(df_multi_anova_f, 'data/other/increment_balance_24/list_feats/df_multi_anova_f_dc_20230813_20230903.pickle')
# utils.save_pickle(df_multi_anova_f, 'data/other/increment_balance_24/list_feats/df_multi_anova_f_fc_20230813_20230903.pickle')

utils.save_pickle(df_multi_anova_f, 'data/other/balance_34/list_feats/df_multi_anova_f_xxpay_20240225_20240317.pickle')

In [None]:
# df_multi_anova_f = utils.load_pickle('data/other/increment_balance_24/list_feats/df_multi_anova_f_asp1_20230813_20230903.pickle')

df_multi_anova_f = utils.load_pickle('data/other/balance_34/list_feats/df_multi_anova_f_xxpay_20240225_20240317.pickle')

print(df_multi_anova_f.shape)
df_multi_anova_f.head()

In [None]:
# Features whose F-test p-value exceeds 0.05 are treated as not significant.
# NOTE(review): a NaN p-value compares False here, so such a feature is kept
# as selected -- confirm that this is intended.
set_feats_p_gt_005 = set(df_multi_anova_f.loc[df_multi_anova_f['p_value'] > 0.05, 'feature'])
# Preserve the order of list_feats_x_std; the set gives O(1) membership tests.
list_feats_x_ht_no_select = [x for x in list_feats_x_std if x in set_feats_p_gt_005]
print(len(list_feats_x_ht_no_select))
list_feats_x_ht_no_select[:10]

In [None]:
# utils.save_pickle(list_feats_x_ht_no_select, 'data/other/balance/feats/list_feats/list_feats_x_ht_no_select_fusion_20221120_20221211.pickle')
# utils.save_pickle(list_feats_x_ht_no_select, 'data/other/balance/feats/list_feats/list_feats_x_ht_no_select_aspiration_part1_20221120_20221211.pickle')
# utils.save_pickle(list_feats_x_ht_no_select, 'data/other/balance/feats/list_feats/list_feats_x_ht_no_select_aspiration_part2_20221120_20221211.pickle')
# utils.save_pickle(list_feats_x_ht_no_select, 'data/other/balance_25/list_feats/list_feats_x_ht_no_select_fc_20230730_20230820.pickle')

# utils.save_pickle(list_feats_x_ht_no_select, 'data/other/increment_balance_24/list_feats/list_feats_x_ht_no_select_fusion_20230813_20230903.pickle')
# utils.save_pickle(list_feats_x_ht_no_select, 'data/other/increment_balance_24/list_feats/list_feats_x_ht_no_select_asp1_20230813_20230903.pickle')
# utils.save_pickle(list_feats_x_ht_no_select, 'data/other/increment_balance_24/list_feats/list_feats_x_ht_no_select_asp2_20230813_20230903.pickle')
# utils.save_pickle(list_feats_x_ht_no_select, 'data/other/increment_balance_24/list_feats/list_feats_x_ht_no_select_dc_20230813_20230903.pickle')
# utils.save_pickle(list_feats_x_ht_no_select, 'data/other/increment_balance_24/list_feats/list_feats_x_ht_no_select_fc_20230813_20230903.pickle')

utils.save_pickle(list_feats_x_ht_no_select, 'data/other/balance_34/list_feats/list_feats_x_ht_no_select_xxpay_20240225_20240317.pickle')

In [None]:
# list_feats_x_ht_no_select = utils.load_pickle('data/other/balance/feats/list_feats/list_feats_x_ht_no_select_fusion_20221120_20221211.pickle')
# list_feats_x_ht_no_select = utils.load_pickle('data/other/balance/feats/list_feats/list_feats_x_ht_no_select_aspiration_part1_20221120_20221211.pickle')
# list_feats_x_ht_no_select = utils.load_pickle('data/other/balance/feats/list_feats/list_feats_x_ht_no_select_aspiration_part2_20221120_20221211.pickle')
# list_feats_x_ht_no_select = utils.load_pickle('data/other/balance_25/list_feats/list_feats_x_ht_no_select_fc_20230730_20230820.pickle')

# list_feats_x_ht_no_select = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_ht_no_select_fusion_20230813_20230903.pickle')
# list_feats_x_ht_no_select = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_ht_no_select_asp1_20230813_20230903.pickle')
# list_feats_x_ht_no_select = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_ht_no_select_asp2_20230813_20230903.pickle')
# list_feats_x_ht_no_select = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_ht_no_select_dc_20230813_20230903.pickle')
# list_feats_x_ht_no_select = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_ht_no_select_fc_20230813_20230903.pickle')

list_feats_x_ht_no_select = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_ht_no_select_xxpay_20240225_20240317.pickle')

print(len(list_feats_x_ht_no_select))
list_feats_x_ht_no_select[:10]

In [None]:
# Keep, in original order, every feature that passed the hypothesis test.
# A set avoids rescanning the exclusion list for each candidate feature.
set_feats_x_ht_no_select = set(list_feats_x_ht_no_select)
list_feats_x_ht = [x for x in list_feats_x_std if x not in set_feats_x_ht_no_select]
print(len(list_feats_x_ht))
list_feats_x_ht[:10]

In [None]:
# utils.save_pickle(list_feats_x_ht, 'data/other/balance/feats/list_feats/list_feats_x_ht_fusion_20221120_20221211.pickle')
# utils.save_pickle(list_feats_x_ht, 'data/other/balance/feats/list_feats/list_feats_x_ht_aspiration_part1_20221120_20221211.pickle')
# utils.save_pickle(list_feats_x_ht, 'data/other/balance/feats/list_feats/list_feats_x_ht_aspiration_part2_20221120_20221211.pickle')
# utils.save_pickle(list_feats_x_ht, 'data/other/balance_25/list_feats/list_feats_x_ht_fc_20230730_20230820.pickle')

# utils.save_pickle(list_feats_x_ht, 'data/other/increment_balance_24/list_feats/list_feats_x_ht_fusion_20230813_20230903.pickle')
# utils.save_pickle(list_feats_x_ht, 'data/other/increment_balance_24/list_feats/list_feats_x_ht_asp1_20230813_20230903.pickle')
# utils.save_pickle(list_feats_x_ht, 'data/other/increment_balance_24/list_feats/list_feats_x_ht_asp2_20230813_20230903.pickle')
# utils.save_pickle(list_feats_x_ht, 'data/other/increment_balance_24/list_feats/list_feats_x_ht_dc_20230813_20230903.pickle')
# utils.save_pickle(list_feats_x_ht, 'data/other/increment_balance_24/list_feats/list_feats_x_ht_fc_20230813_20230903.pickle')

utils.save_pickle(list_feats_x_ht, 'data/other/balance_34/list_feats/list_feats_x_ht_xxpay_20240225_20240317.pickle')

* 模型
* 特征重要性
* PI方法，使用lgb进行建模预测

In [None]:
df_fusion = utils.load_pickle('data/other/increment_balance_24/df_fusion_20230813_20220903.pickle')
df_asp1 = utils.load_pickle('data/other/increment_balance_24/df_aspiration_part1_20230813_20230903.pickle')
df_asp2 = utils.load_pickle('data/other/increment_balance_24/df_aspiration_part2_20230813_20230903.pickle')
df_dc = utils.load_pickle('data/other/increment_balance_24/df_dc_20230813_20230903.pickle')
df_fc = utils.load_pickle('data/other/increment_balance_24/df_fc_20230813_20230903.pickle')

print(df_fusion.shape)
print(df_asp1.shape)
print(df_asp2.shape)
print(df_dc.shape)
print(df_fc.shape)

In [None]:
list_feats_id_dt_card_y = ['uid', 'obs_dt', 'card', 'label']

list_feats_x_fusion = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_ht_fusion_20230813_20230903.pickle')
list_feats_x_asp1 = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_ht_asp1_20230813_20230903.pickle')
list_feats_x_asp2 = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_ht_asp2_20230813_20230903.pickle')
list_feats_x_dc = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_ht_dc_20230813_20230903.pickle')
list_feats_x_fc = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_ht_fc_20230813_20230903.pickle')

print(len(list_feats_x_fusion))
print(len(list_feats_x_asp1))
print(len(list_feats_x_asp2))
print(len(list_feats_x_dc))
print(len(list_feats_x_fc))

In [None]:
# 存在特征列名重复
list_feats_repeat = [x for x in list_feats_x_asp1 if x in list_feats_x_fc]
list_feats_repeat

In [None]:
list_feats_x_fc_ = list(map(lambda x: x+'_1' if x in list_feats_repeat else x, 
                       list_feats_x_fc))
print(len(list_feats_x_fc_))
list_feats_x_fc_[:10]

In [None]:
[x for x in list_feats_x_asp1 if x in list_feats_x_fc_]

In [None]:
utils.save_pickle(list_feats_x_fc_, 'data/other/increment_balance_24/list_feats/list_feats_x_ht_fc_rename_20230813_20230903.pickle')

In [None]:
list_feats_x_fc_all = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_fc_20230813_20230903.pickle')
print(len(list_feats_x_fc_all))
list_feats_x_fc_all[:10]

In [None]:
list_feats_x_fc_all_ = list(map(lambda x: x+'_1' if x in list_feats_repeat else x, 
                                list_feats_x_fc_all))
print(len(list_feats_x_fc_all_))
list_feats_x_fc_all_[:10]

In [None]:
[x for x in list_feats_x_asp1 if x in list_feats_x_fc_all_]

In [None]:
utils.save_pickle(list_feats_x_fc_all_, 'data/other/increment_balance_24/list_feats/list_feats_x_fc_rename_20230813_20230903.pickle')

In [None]:
dict_rename = {x: x+'_1' for x in list_feats_repeat}
dict_rename

In [None]:
df_fc.rename(columns=dict_rename, inplace=True)

In [None]:
# Some negative samples are duplicated, so the sources are combined by sorting
# each one on the id columns and then concatenating by row position.
for df_source in (df_fusion, df_asp1, df_asp2, df_dc, df_fc):
    # Both operations mutate the frame in place, so the original names keep
    # pointing at the sorted, re-indexed data.
    df_source.sort_values(by=list_feats_id_dt_card_y, inplace=True)
    df_source.reset_index(drop=True, inplace=True)

In [None]:
# Validate ids: the sources are later concatenated by row position, so row i of
# every frame should carry the same uid / obs_dt / card / label.
# add_suffix returns a new frame with e.g. 'uid' -> 'uid_f', which avoids an
# in-place rename on a column-subset selection.
df_fusion_id = df_fusion[list_feats_id_dt_card_y].add_suffix('_f')
df_asp1_id = df_asp1[list_feats_id_dt_card_y].add_suffix('_ap1')
df_asp2_id = df_asp2[list_feats_id_dt_card_y].add_suffix('_ap2')
df_dc_id = df_dc[list_feats_id_dt_card_y].add_suffix('_dc')
df_fc_id = df_fc[list_feats_id_dt_card_y].add_suffix('_fc')

df_id = pd.concat([df_fusion_id, df_asp1_id, df_asp2_id, df_dc_id, df_fc_id], axis=1)
print(df_id.shape)

# For every pair of sources, print how many rows agree on each id column.
# A fully aligned pair yields the row count printed above for all four columns.
list_sources = [
    ('fusion', '_f'),
    ('aspiration part 1', '_ap1'),
    ('aspiration part 2', '_ap2'),
    ('debit card', '_dc'),
    ('flow cashier', '_fc'),
]
for i, (name_a, suffix_a) in enumerate(list_sources):
    for name_b, suffix_b in list_sources[i + 1:]:
        # Vectorised .sum() instead of Python-level sum() over every row.
        list_match = [int((df_id[x + suffix_a] == df_id[x + suffix_b]).sum())
                      for x in list_feats_id_dt_card_y]
        print(name_a + ' - ' + name_b + ':', list_match)

In [None]:
df = pd.concat([df_fusion[list_feats_id_dt_card_y+list_feats_x_fusion], 
                df_asp1[list_feats_x_asp1], 
                df_asp2[list_feats_x_asp2], 
                df_dc[list_feats_x_dc], 
                df_fc[list_feats_x_fc_]
               ], axis=1)
print(df.shape)
df.head()

In [None]:
utils.save_pickle(df, 'data/other/increment_balance_24/df_fadf_20230813_20230903.pickle')

In [None]:
df = utils.load_pickle('data/other/increment_balance_24/df_fadf_20230813_20230903.pickle')
print(df.shape)
df.head()

In [None]:
df_des = utils.df_des(df)

df_des.to_csv('data/other/increment_balance_24/df_des_fadf_20230813_20230903.csv')

In [None]:
# card编码
# 目标item卡（新）
dict_card = utils.load_pickle('data/other/balance_24/dict_card_24.pickle')
dict_card

In [None]:
# Encode the card through dict_card and assign the result back. Calling
# replace(inplace=True) on the df['card_id'] selection is chained in-place
# mutation: it is deprecated and does not update df under pandas Copy-on-Write.
df['card_id'] = df['card'].replace(dict_card)
df.head()

In [None]:
df[['card', 'card_id', 'uid']].groupby(['card', 'card_id']).count()

In [None]:
df['card_id'].nunique()

In [None]:
# Item feature processing: one-hot encode each column listed in list_feats_ohe.
list_feats_ohe = ['card_id']
list_df_ohe_card = []
# The with-block closes the progress bar on normal exit and on interruption
# alike, so no explicit close() is needed.
with tqdm(list_feats_ohe) as t:
    for feat in t:
        df_ohe_feat = utils.one_hot_encoder(df, feat)
        list_df_ohe_card += [df_ohe_feat]

# Put the encoded blocks side by side (one block per encoded column).
df_ohe_card = pd.concat(list_df_ohe_card, axis=1)
print(df_ohe_card.shape)
df_ohe_card.head()

In [None]:
df_ohe = pd.concat([df, df_ohe_card], axis=1)
print(df_ohe.shape)
df_ohe.head()

In [None]:
utils.save_pickle(df_ohe, 'data/other/increment_balance_24/df_fadf_ohe_20230813_20230903.pickle')

In [None]:
df_ohe = utils.load_pickle('data/other/increment_balance_24/df_fadf_ohe_20230813_20230903.pickle')
print(df_ohe.shape)
df_ohe.head()

In [None]:
list_feats_item = [x for x in df_ohe.columns if x.startswith('card_id_')]

list_feats_x_fusion = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_ht_fusion_20230813_20230903.pickle')
list_feats_x_asp1 = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_ht_asp1_20230813_20230903.pickle')
list_feats_x_asp2 = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_ht_asp2_20230813_20230903.pickle')
list_feats_x_dc = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_ht_dc_20230813_20230903.pickle')
list_feats_x_fc = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_ht_fc_rename_20230813_20230903.pickle')

list_feats_x = list_feats_x_fusion + list_feats_x_asp1 + list_feats_x_asp2 + list_feats_x_dc + list_feats_x_fc

print(len(list_feats_item))

print(len(list_feats_x_fusion))
print(len(list_feats_x_asp1))
print(len(list_feats_x_asp2))
print(len(list_feats_x_dc))
print(len(list_feats_x_fc))

print(len(list_feats_x))

In [None]:
# PI
df_ohe_y = df_ohe['label']
df_ohe_X = df_ohe[list_feats_item+list_feats_x]

print(df_ohe_y.shape)
print(df_ohe_X.shape)

In [None]:
%xdel df_ohe

In [None]:
df_ohe_X_train, df_ohe_X_test, df_ohe_y_train, df_ohe_y_test = \
    train_test_split(df_ohe_X, df_ohe_y, test_size=0.2, random_state=2023)
print(df_ohe_X_train.shape)
print(df_ohe_X_test.shape)
print(df_ohe_y_train.shape)
print(df_ohe_y_test.shape)

In [None]:
utils.save_pickle(df_ohe_X_train, 'data/other/increment_balance_24/pi/df_item_X_train_20230813_20230903.pickle')
utils.save_pickle(df_ohe_X_test, 'data/other/increment_balance_24/pi/df_item_X_test_20230813_20230903.pickle')
utils.save_pickle(df_ohe_y_train, 'data/other/increment_balance_24/pi/df_y_train_20230813_20230903.pickle')
utils.save_pickle(df_ohe_y_test, 'data/other/increment_balance_24/pi/df_y_test_20230813_20230903.pickle')

In [None]:
df_X_train = utils.load_pickle('data/other/increment_balance_24/pi/df_item_X_train_20230813_20230903.pickle')
df_y_train = utils.load_pickle('data/other/increment_balance_24/pi/df_y_train_20230813_20230903.pickle')

print(df_X_train.shape)
print(df_y_train.shape)

In [None]:
%%time
estimator_pi = lgb.LGBMClassifier(importance_type='gain')
estimator_pi.fit(df_X_train, df_y_train)

utils.save_pickle(estimator_pi, 'data/other/increment_balance_24/pi/estimator_pi_item_X_20230813_20230903.pickle')

In [None]:
df_X_test = utils.load_pickle('data/other/increment_balance_24/pi/df_item_X_test_20230813_20230903.pickle')
df_y_test = utils.load_pickle('data/other/increment_balance_24/pi/df_y_test_20230813_20230903.pickle')

print(df_X_test.shape)
print(df_y_test.shape)

In [None]:
estimator_pi = utils.load_pickle('data/other/increment_balance_24/pi/estimator_pi_item_X_20230813_20230903.pickle')

estimator_pi

In [None]:
%%time
# Fixed seed (same value as the train/test split) so the column shuffles, the
# saved importances and the features dropped from them are reproducible.
# NOTE(review): no `scoring` is passed, so the estimator's default scorer is
# used -- confirm that this is the metric wanted for feature selection.
pi = permutation_importance(estimator=estimator_pi, X=df_X_test, y=df_y_test, n_jobs=8, random_state=2023)

utils.save_pickle(pi, 'data/other/increment_balance_24/pi/pi_item_X_20230813_20230903.pickle')

In [None]:
list_feats_item = [x for x in df_X_test.columns if x.startswith('card_id_')]

list_feats_x_fusion = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_ht_fusion_20230813_20230903.pickle')
list_feats_x_asp1 = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_ht_asp1_20230813_20230903.pickle')
list_feats_x_asp2 = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_ht_asp2_20230813_20230903.pickle')
list_feats_x_dc = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_ht_dc_20230813_20230903.pickle')
list_feats_x_fc = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_ht_fc_rename_20230813_20230903.pickle')

list_feats_x = list_feats_x_fusion + list_feats_x_asp1 + list_feats_x_asp2 + list_feats_x_dc + list_feats_x_fc

print(len(list_feats_item))

print(len(list_feats_x_fusion))
print(len(list_feats_x_asp1))
print(len(list_feats_x_asp2))
print(len(list_feats_x_dc))
print(len(list_feats_x_fc))

print(len(list_feats_x))

In [None]:
# Take the feature names from the matrix that was permuted: importances_mean
# has one entry per column of X, so this cannot drift out of alignment the way
# a name list rebuilt from separately loaded pickles could.
df_pi = pd.DataFrame({'feature': list(df_X_test.columns), 'permutation_importance_mean': pi.importances_mean})
print(df_pi.shape)
df_pi.head()

In [None]:
df_pi.sort_values(by=['permutation_importance_mean'], ascending=[False], inplace=True)
df_pi

In [None]:
df_pi[['permutation_importance_mean']].quantile([x/10 for x in range(11)]).T

In [None]:
list_feats_x_pi_no_select = df_pi[
    (df_pi['permutation_importance_mean']<=0.0)
    &(~df_pi['feature'].isin(list_feats_item))
]['feature'].values.tolist()
print(len(list_feats_x_pi_no_select))
list_feats_x_pi_no_select[:10]

In [None]:
utils.save_pickle(list_feats_x_pi_no_select, 'data/other/increment_balance_24/list_feats/list_feats_x_pi_no_select_fadf_20230813_20230903.pickle')

In [None]:
list_feats_x_pi = [x for x in list_feats_x if x not in list_feats_x_pi_no_select]
print(len(list_feats_x_pi))
list_feats_x_pi[:10]

In [None]:
utils.save_pickle(list_feats_x_pi, 'data/other/increment_balance_24/list_feats/list_feats_x_pi_fadf_20230813_20230903.pickle')

## 全量数据拉取sql处理

In [None]:
def del_sql_coalesce(sql_file_path, special_words, sql_file_path_new):
    """Drop lines that mention unwanted names from the part of a SQL file before its first ``from`` line.

    The split point is the first line that starts with ``from ``. That line and
    everything after it are copied unchanged. Of the lines before it, a line is
    dropped when it contains `` <word>, `` (space, word, comma, space) for any
    word in ``special_words``. The line directly before the split point is
    tested against `` <word> `` (space, word, space) instead, i.e. without the
    comma.

    If no line starts with ``from ``, nothing precedes the split point and the
    file is copied unchanged.

    Words are matched literally (regex metacharacters are escaped). When
    ``special_words`` is empty nothing is removed; an empty alternation would
    otherwise match, and therefore delete, every line before the split point.

    Args:
        sql_file_path: Path of the SQL file to read.
        special_words: Iterable of names (str) whose lines are removed.
        sql_file_path_new: Path of the file to write. May be the same as
            ``sql_file_path``: the input is fully read and closed first.
    """
    with open(sql_file_path, 'r') as fi:
        list_sql = fi.readlines()

    words = [re.escape(x) for x in special_words]
    if words and list_sql:
        alternation = '|'.join(words)
        pattern = re.compile(' (?:' + alternation + '), ')
        pattern_end = re.compile(' (?:' + alternation + ') ')

        # Index of the first line starting with 'from '; 0 when there is none.
        index_front = next(
            (i for i, line in enumerate(list_sql) if line.startswith('from ')),
            0,
        )
        # The last line before the split point has no trailing comma, so it is
        # checked with the comma-less pattern.
        index_last = index_front - 1

        list_sql_valid_front = []
        for i in tqdm(range(index_front)):
            checker = pattern_end if i == index_last else pattern
            if not checker.search(list_sql[i]):
                list_sql_valid_front.append(list_sql[i])

        list_sql = list_sql_valid_front + list_sql[index_front:]

    with open(sql_file_path_new, 'w') as fo:
        fo.writelines(list_sql)
            

def del_sql_select_origin(sql_file_path, special_words, sql_file_path_new):
    """Drop lines that mention unwanted names from the part of a SQL file after its first ``from`` line.

    The split point is the first line that starts with ``from `` (line 0 when
    no such line exists). Everything up to and including that line is copied
    unchanged. A later line is dropped when it contains a tab or four spaces,
    then a word from ``special_words``, then either ``", "`` or ``" "``.

    Words are matched literally (regex metacharacters are escaped). When
    ``special_words`` is empty nothing is removed; an empty alternation would
    otherwise match any line containing a tab or four spaces followed by a
    space.

    Args:
        sql_file_path: Path of the SQL file to read.
        special_words: Iterable of names (str) whose lines are removed.
        sql_file_path_new: Path of the file to write. May be the same as
            ``sql_file_path``: the input is fully read and closed first.
    """
    with open(sql_file_path, 'r') as fi:
        list_sql = fi.readlines()

    words = [re.escape(x) for x in special_words]
    if words and list_sql:
        # One pattern for all four original variants:
        # (tab | 4 spaces) + word + (", " | " ").
        pattern = re.compile(r'(?:\t| {4})(?:' + '|'.join(words) + '),? ')

        # Index of the first line starting with 'from '; 0 when there is none.
        index_front = next(
            (i for i, line in enumerate(list_sql) if line.startswith('from ')),
            0,
        )

        list_sql_valid_back = [
            line
            for line in tqdm(list_sql[index_front + 1:])
            if not pattern.search(line)
        ]
        list_sql = list_sql[:index_front + 1] + list_sql_valid_back

    with open(sql_file_path_new, 'w') as fo:
        fo.writelines(list_sql)
            
            
def correct_end(sql_file_path, sql_file_path_new):
    """Strip the dangling comma from the line preceding each line that contains ``from ``.

    For every line containing ``from `` (anywhere in the line), the preceding
    line is inspected: if it ends with ``", "`` followed by one final character
    (normally the newline), the comma is removed while the space and the final
    character are kept.

    Args:
        sql_file_path: Path of the SQL file to read.
        sql_file_path_new: Path of the file to write. May be the same as
            ``sql_file_path``: the input is fully read and closed first.
    """
    with open(sql_file_path, 'r') as fi:
        list_sql = fi.readlines()

    # Start at 1: line 0 has no predecessor, and index -1 would wrap around to
    # the last line of the file and could strip a comma from it.
    for i in tqdm(range(1, len(list_sql))):
        if 'from ' not in list_sql[i]:
            continue
        previous = list_sql[i - 1]
        if previous[-3:-1] == ', ':
            list_sql[i - 1] = previous[:-3] + previous[-2:]

    with open(sql_file_path_new, 'w') as fo:
        fo.writelines(list_sql)

In [None]:
# Load the selected feature names into `list_feats_x`.
# The commented-out lines are alternative pickle paths for the same variable;
# only the uncommented `load_pickle` call takes effect.
# list_feats_x = utils.load_pickle('data/other/balance/feats/list_feats/list_feats_x_ht_fusion_20221120_20221211.pickle')
# list_feats_x = utils.load_pickle('data/other/balance/feats/list_feats/list_feats_x_ht_aspiration_part1_20221120_20221211.pickle')
# list_feats_x = utils.load_pickle('data/other/balance/feats/list_feats/list_feats_x_ht_aspiration_part2_20221120_20221211.pickle')
# list_feats_x = utils.load_pickle('data/other/balance_25/list_feats/list_feats_x_ht_fc_20230730_20230820.pickle')

# list_feats_x = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_pi_fadf_20230813_20230903.pickle')

list_feats_x = utils.load_pickle('data/other/balance_34/xxpay/list_feats/list_feats_x_xxpay_std.pickle')

print(len(list_feats_x))
list_feats_x[:10]

In [None]:
# Load the full feature-name list into `list_feats_x_all`.
# The commented-out lines are alternative pickle paths for the same variable;
# only the uncommented `load_pickle` call takes effect.
# list_feats_x_all = utils.load_pickle('data/other/balance/feats/list_feats/list_feats_x_fusion_20221120_20221211.pickle')
# list_feats_x_all = utils.load_pickle('data/other/balance/feats/list_feats/list_feats_x_aspiration_part1_20221120_20221211.pickle')
# list_feats_x_all = utils.load_pickle('data/other/balance/feats/list_feats/list_feats_x_aspiration_part2_20221120_20221211.pickle')
# list_feats_x_all = utils.load_pickle('data/other/balance_25/list_feats/list_feats_x_fc_20230730_20230820.pickle')

# list_feats_x_all = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_fusion_20230813_20230903.pickle')
# list_feats_x_all = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_asp1_20230813_20230903.pickle')
# list_feats_x_all = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_asp2_20230813_20230903.pickle')
# list_feats_x_all = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_dc_20230813_20230903.pickle')
# list_feats_x_all = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_fc_rename_20230813_20230903.pickle')

list_feats_x_all = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_xxpay_20240225_20240317.pickle')

print(len(list_feats_x_all))
list_feats_x_all[:10]

In [None]:
# Keep the entries of `list_feats_x_all` that also appear in `list_feats_x`,
# preserving the order of `list_feats_x_all`.
# A set makes each membership test O(1) instead of a scan over `list_feats_x`.
set_feats_x = set(list_feats_x)
list_feats_x_pi = [x for x in list_feats_x_all if x in set_feats_x]
print(len(list_feats_x_pi))

# utils.save_pickle(list_feats_x_pi, 'data/other/increment_balance_24/list_feats/list_feats_x_pi_fusion_20230813_20230903.pickle')
# utils.save_pickle(list_feats_x_pi, 'data/other/increment_balance_24/list_feats/list_feats_x_pi_asp1_20230813_20230903.pickle')
# utils.save_pickle(list_feats_x_pi, 'data/other/increment_balance_24/list_feats/list_feats_x_pi_asp2_20230813_20230903.pickle')
# utils.save_pickle(list_feats_x_pi, 'data/other/increment_balance_24/list_feats/list_feats_x_pi_dc_20230813_20230903.pickle')
# NOTE(review): the cells above currently load the xxpay lists from `balance_34`,
# but this call still writes to the `fc_rename` path under `increment_balance_24`.
# Confirm the target path before running, otherwise that pickle is overwritten.
utils.save_pickle(list_feats_x_pi, 'data/other/increment_balance_24/list_feats/list_feats_x_pi_fc_rename_20230813_20230903.pickle')

In [None]:
# Partial selection of the new features.
# NOTE(review): the feature names that belong in this list are missing from the
# notebook; only a stray quote remained, which made the cell a SyntaxError.
# Fill the names in before running this cell.
list_feats_x_asp1_change = [
]
print(len(list_feats_x_asp1_change))

# Refuse to continue with an empty selection so the pickle written below is
# never overwritten with an empty list by accident.
if not list_feats_x_asp1_change:
    raise ValueError('list_feats_x_asp1_change is empty; fill in the feature names before running this cell')

list_feats_x_pi = utils.load_pickle('data/other/increment_balance_24/list_feats/list_feats_x_ht_asp1_20230813_20230903.pickle')

# Keep the loaded names that are part of the change list, in loaded order.
set_feats_x_asp1_change = set(list_feats_x_asp1_change)
list_feats_x_asp1_change_pi = [x for x in list_feats_x_pi if x in set_feats_x_asp1_change]
print(len(list_feats_x_asp1_change_pi))

# utils.save_pickle(list_feats_x_asp1_change_pi, 'data/other/increment_balance_24/list_feats/list_feats_x_pi_asp1_change_20230813_20230903.pickle')
utils.save_pickle(list_feats_x_asp1_change_pi, 'data/other/increment_balance_24/list_feats/list_feats_x_ht_asp1_change_20230813_20230903.pickle')

In [None]:
# Candidate feature names to check (left empty in the notebook).
aaa = []
# Feature names stored in the asp1 "change" pickle.
bbb = utils.load_pickle(
    'data/other/increment_balance_24/list_feats/list_feats_x_ht_asp1_change_20230813_20230903.pickle'
)
# Names from `aaa` that are absent from `bbb`, in `aaa` order.
list_feats_x_del = [name for name in aaa if name not in bbb]
print(len(list_feats_x_del))
list_feats_x_del[:10]

In [None]:
# Names present in `list_feats_x_all` but not in `list_feats_x`, in `list_feats_x_all` order.
# Fall back to the placeholder name 'asd' so the list that is later handed to
# the SQL filter functions as `special_words` is never empty.
list_feats_x_del = [
    name for name in list_feats_x_all if name not in list_feats_x
] or ['asd']
print(len(list_feats_x_del))
list_feats_x_del[:10]

In [None]:
# Rewrite a feature SQL file so that lines naming the features in
# `list_feats_x_del` are removed. The commented-out assignments are alternative
# input/output file names; only the uncommented ones take effect.
# sql_file_path = 'sample_label_feature_fusion_new.sql'

# sql_file_path = 'sample_label_feature_aspiration_new_part1.sql'
# sql_file_path = 'sample_label_feature_aspiration_new_part1_inc.sql'
# sql_file_path = 'sample_label_feature_aspiration_new_part1_inc_ht.sql'

# sql_file_path = 'sample_label_feature_aspiration_new_part2.sql'

# sql_file_path = 'sample_label_feature_debit_card.sql'

# sql_file_path = 'sample_label_feature_flow_cashier.sql'

sql_file_path = 'sample_label_feature_xxpay.sql'

special_words = list_feats_x_del

# sql_file_path_new = 'sample_label_feature_fusion_new_.sql'
# sql_file_path_new = 'sample_label_feature_fusion_new_inc_.sql'

# sql_file_path_new = 'sample_label_feature_aspiration_new_part1_.sql'
# sql_file_path_new = 'sample_label_feature_aspiration_new_part1_inc_.sql'

# sql_file_path_new = 'sample_label_feature_aspiration_new_part2_.sql'
# sql_file_path_new = 'sample_label_feature_aspiration_new_part2_inc_.sql'
# sql_file_path_new = 'sample_label_feature_aspiration_new_part1_inc_ht_.sql'

# sql_file_path_new = 'sample_label_feature_debit_card_inc_.sql'

# sql_file_path_new = 'sample_label_feature_flow_cashier_.sql'
# sql_file_path_new = 'sample_label_feature_flow_cashier_inc_.sql'

sql_file_path_new = 'sample_label_feature_xxpay_.sql'

# Step 1: filter the lines before the first `from ` line, writing the result to the new file.
del_sql_coalesce(sql_file_path, special_words, sql_file_path_new)
# Step 2: filter the lines after the first `from ` line, rewriting the new file in place.
del_sql_select_origin(sql_file_path_new, special_words, sql_file_path_new)
# Step 3: strip a trailing comma left on the line before each `from ` line, again in place.
correct_end(sql_file_path_new, sql_file_path_new)