# Release note

全新套路，https://github.com/jiahengqi/datacastle_shixin/blob/master/runner.py

In [1]:
import pickle,os
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import trange
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [2]:
def score(data,model):
    y_pred=model.predict_proba(data[feat])[:, 1]
    return roc_auc_score(data.Label, y_pred)

In [3]:
# data_train = pd.read_csv('originalDataset/train.csv')
# data_test_a = pd.read_csv('originalDataset/test.csv')

train = pd.read_csv('originalDataset/train.csv')
train_y = train.label
test = pd.read_csv('originalDataset/test.csv')

In [4]:
featuresToDiscard = ["HYZK", "ZHIYE", "ZHICHEN", "ZHIWU", "XUELI"]
train.drop(featuresToDiscard, axis = 1, inplace = True)
test.drop(featuresToDiscard, axis = 1, inplace = True)

In [5]:
## 过滤类别较少的和较多的列
def get_numerical_serial_fea(data,feas):
    numerical_serial_fea = []
    numerical_noserial_fea = []
    for fea in feas:
        temp = data[fea].nunique()
        if temp <= 40:
            numerical_noserial_fea.append(fea)
            continue
        numerical_serial_fea.append(fea)
    return numerical_serial_fea,numerical_noserial_fea
    
serial_fea_, categorical_fea_ = get_numerical_serial_fea(train,train.columns)
categorical_fea = list(filter(lambda x: x not in ["DKLL", "label"], categorical_fea_))
serial_fea = list(filter(lambda x: x not in ["id", "CSNY"], serial_fea_))
serial_fea.append("DKLL")
print("serial features: ", serial_fea)
print("categorical features: ", categorical_fea)

serial features:  ['GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE', 'DWYJCE', 'DKFFE', 'DKYE', 'DKLL']
categorical features:  ['XINGBIE', 'DWJJLX', 'DWSSHY', 'GRZHZT']


In [6]:
cat_feat = categorical_fea #['登记机关', '行业代码', '行业门类', '企业类型']
feat = serial_fea #list(set(train.columns)-set(train.select_dtypes(object))-set(['Label', 'ID'])-set(cat_feat))
remove_col = []
for col in feat:
    if train[col].nunique() < 2:
        remove_col.append(col)
feat = list(set(feat) - set(remove_col))

In [7]:
cat_feat

['XINGBIE', 'DWJJLX', 'DWSSHY', 'GRZHZT']

In [8]:
feat

['GRZHDNGJYE',
 'GRYJCE',
 'DKFFE',
 'DKYE',
 'DWYJCE',
 'GRZHSNJZYE',
 'GRZHYE',
 'GRJCJS',
 'DKLL']

In [9]:
class Feat:
    def __init__(self, config):
        self.config = config
    
    def fit(self, x, y):
        pass

    def transform(self, x):
        pass
    
    def fit_transform(self, x, y):
        self.fit(x, y)
        self.transform(x)
        
class CatCount(Feat):
    def transform(self, x):
        for col in self.config['cat_columns']:
            df_count = x[col].value_counts()
            x[f'{col}_catcount'] = x[col].map(df_count)

class CatCountRank(Feat):
    def fit(self, x, y):
        self.fit_dict = {}
        for col in self.config['cat_columns']:
            counter =  Counter(x[col]).most_common()
            self.fit_dict[col] = {k: i for (i, (k, v)) in enumerate(counter)}

    def transform(self, x):
        for col in self.config['cat_columns']:
            x[f'{col}_countrank'] = x[col].map(self.fit_dict[col])

In [10]:
config = {}
config['cat_columns'] = cat_feat

In [12]:
categorical_fea_index = []
for i, col in enumerate(train.columns):
    if col in cat_feat:
        categorical_fea_index.append(i)

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'min_child_weight': 5,
    'num_leaves': 2 ** 5,
    'lambda_l2': 10,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 4,
    'learning_rate': 0.1,
    'seed': 2020,
    'nthread': 28,
    'n_jobs':24,
    'verbose': -1,

    ## 
    "categorical_feature": categorical_fea_index, #"name:{}".format(",".join(categorical_fea)),

    #########
#                 'silent': True,
    # 'metric': 'auc',
}

In [14]:
if not os.path.exists("trans_data"):
    os.makedirs("trans_data")

importanceThreshold = 500
    
if os.path.exists('trans_data/train_1000_10.pkl'):
    train = pickle.load(open('trans_data/train_1000_10.pkl', 'rb'))
    test = pickle.load(open('trans_data/test_1000_10.pkl', 'rb'))
else:
    d={'add':'+', 'sub':'-', 'mul':'*', 'div':'/'}
    feat0 = feat.copy()
    for i in trange(len(feat)):
        df_temp=train[feat0].copy()
        for j in range(i+1,len(feat)):
#             print('%s|%s|add'%(feat[i],feat[j]))
            df_temp['%s|%s|add'%(feat[i],feat[j])] = train[feat[i]]+train[feat[j]]
            df_temp['%s|%s|sub'%(feat[i],feat[j])] = train[feat[i]]-train[feat[j]]
            df_temp['%s|%s|mul'%(feat[i],feat[j])] = train[feat[i]]*train[feat[j]]
            df_temp['%s|%s|div'%(feat[i],feat[j])] = train[feat[i]]/train[feat[j]]
        model = LGBMClassifier(
            **params, #n_estimators=1000, learning_rate=0.08, max_depth=7, subsample=0.8, colsample_bytree=0.6, n_jobs=4
        )
        model.fit(df_temp.values, train_y)
        qq = pd.Series(model.feature_importances_, index=df_temp.columns).sort_values()
        ## 特殊处理重要程度大于100的特征。卧槽，太帅了。
        for col in set(qq.loc[qq>importanceThreshold].index)-set(feat0):
            f0, f1, f2 = col.split('|')
            train[col] = df_temp[col]
            ## 如果这个特征靠谱，就把这个特征重复做到test数据集里面。这种操作手法太6了。
            test[col] = eval("test['%s'] %s test['%s']"%(f0, d[f2], f1))
        feat0.extend(list(set(qq.loc[qq>importanceThreshold].index)-set(feat0)))
    pickle.dump(train, open('trans_data/train_1000_10.pkl','wb'))
    pickle.dump(test, open('trans_data/test_1000_10.pkl','wb'))

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))




Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))
Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))
Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))
Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))
Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))
Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))
Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))
Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))
100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:08<00:00,  1.00it/s]


In [15]:
def gen_feat(data):
    for col in cat_feat:
        data[col] = data[col].fillna('empty').astype(str)
    for col in data.columns:
        if '年' not in col and '|' not in col and data[col].isna().sum()>0:
            data['%s_na'%col] = data[col].isna().astype(int)
            
    featgen = CatCount(config)
    featgen.transform(data)
    
gen_feat(train)
gen_feat(test)

In [16]:
# train.shape

(40000, 134)

In [17]:
featgen = CatCountRank(config)
featgen.fit_transform(train, train_y)
featgen.transform(test)

In [18]:
# train[cat_feat].head()

Unnamed: 0,XINGBIE,DWJJLX,DWSSHY,GRZHZT
0,1,150,12,1
1,2,110,0,1
2,1,150,9,1
3,1,150,7,1
4,2,900,14,1


In [19]:
feat0 = list(set(train.columns)-set(train.select_dtypes(object))-set(['label','id'])-set(cat_feat))
remove_col = []
for col in feat0:
    if train[col].nunique() < 2:
        remove_col.append(col)
feat0 = list(set(feat0) - set(remove_col))
feat0

['GRYJCE|GRJCJS|mul',
 'GRYJCE|DKYE|sub',
 'GRYJCE|DKLL|add',
 'GRZHSNJZYE|GRZHYE|div',
 'DKFFE|GRZHYE|mul',
 'DKFFE|DKLL|div',
 'DWYJCE',
 'GRYJCE|GRZHSNJZYE|div',
 'GRYJCE|DKYE|mul',
 'GRZHYE|GRZHDNGJYE|add',
 'GRJCJS|DKYE|mul',
 'GRZHSNJZYE|DKYE|sub',
 'DKFFE|GRZHYE|div',
 'GRJCJS|DKYE|sub',
 'GRZHSNJZYE|DKFFE|div',
 'DKFFE|DKYE|sub',
 'GRJCJS|DKFFE|div',
 'GRZHYE|DKYE|sub',
 'GRJCJS|GRZHSNJZYE|add',
 'GRYJCE|GRZHYE|add',
 'GRJCJS|GRZHSNJZYE|mul',
 'GRZHSNJZYE|DKLL|div',
 'DKFFE|GRZHDNGJYE|div',
 'GRZHSNJZYE|DWYJCE|mul',
 'GRYJCE|DKFFE|mul',
 'DKFFE|DKYE|add',
 'GRZHSNJZYE|DKYE|add',
 'GRZHZT_catcount',
 'GRZHYE|GRZHDNGJYE|div',
 'DKLL|GRZHDNGJYE|mul',
 'DKLL|DWYJCE|div',
 'GRZHDNGJYE',
 'GRJCJS|DWYJCE|div',
 'GRJCJS|GRZHYE|mul',
 'GRJCJS|DKFFE|sub',
 'GRYJCE|DKLL|sub',
 'GRZHSNJZYE|GRZHDNGJYE|sub',
 'DKYE|GRZHDNGJYE|mul',
 'DKLL|GRZHDNGJYE|sub',
 'GRZHSNJZYE|DKYE|div',
 'DKYE|GRZHDNGJYE|div',
 'GRZHSNJZYE|GRZHYE|add',
 'DKFFE|GRZHDNGJYE|sub',
 'GRYJCE|GRZHDNGJYE|sub',
 'GRJCJS|GRZH

In [20]:
def tpr_weight_funtion(y_true, y_predict):
    '''
    这是一个通用的计算最终分数的函数。
    '''
    d = pd.DataFrame()
    d['y'] = list(y_true)
    d['prob'] = list(y_predict)
    d = d.sort_values(['prob'], ascending=[0])
    y = d.y
    PosAll = pd.Series(y).value_counts()[1]
    NegAll = pd.Series(y).value_counts()[0]
    pCumsum = d['y'].cumsum()
    nCumsum = np.arange(len(y)) - pCumsum + 1
    pCumsumPer = pCumsum / PosAll
    nCumsumPer = nCumsum / NegAll
    TR1 = pCumsumPer[abs(nCumsumPer-0.001).idxmin()]
    TR2 = pCumsumPer[abs(nCumsumPer-0.005).idxmin()]
    TR3 = pCumsumPer[abs(nCumsumPer-0.01).idxmin()]
    
    return 0.4 * TR1 + 0.3 * TR2 + 0.3 * TR3

In [21]:
def tpr_weight_funtion_lgb(y_true, y_predict):
    '''
    如果是给lgbm用的，参考https://github.com/microsoft/LightGBM/blob/c02917e493c36f3b1e349338f1087fed33126576/examples/python-guide/advanced_example.py#L154
    第一个返回值，是这个函数的可以说是名字或者是标记吧；score就是得到的分数；最后一个就是问，score是越高越好吗。
    '''
#     y_predict = pred
#     y_true = train_data #.get_label()
    
    d = pd.DataFrame()
    d['prob'] = list(y_predict)
    d['y'] = list(y_true)
    d = d.sort_values(['prob'], ascending=[0])
    y = d.y
    PosAll = pd.Series(y).value_counts()[1]
    NegAll = pd.Series(y).value_counts()[0]
    pCumsum = d['y'].cumsum()
    nCumsum = np.arange(len(y)) - pCumsum + 1
    pCumsumPer = pCumsum / PosAll
    nCumsumPer = nCumsum / NegAll
    TR1 = pCumsumPer[abs(nCumsumPer-0.001).idxmin()]
    TR2 = pCumsumPer[abs(nCumsumPer-0.005).idxmin()]
    TR3 = pCumsumPer[abs(nCumsumPer-0.01).idxmin()]
    
    return "Weighted_Score", (0.4 * TR1 + 0.3 * TR2 + 0.3 * TR3), True

######################################################################
kf = StratifiedKFold(5,True,random_state=1)
prob = np.zeros(len(train))
test_prob = np.zeros(len(test))
test_data = test[feat0].values

cv_scores = []

for idx, (train_index, valid_index) in enumerate(kf.split(train, train_y)):
    print('************************************ {} ************************************'.format(str(idx+1)))

    train_data = train.loc[train_index][feat0].values
    valid_data = train.loc[valid_index][feat0].values
    model = LGBMClassifier(
        **params,
#         n_estimators=10000, 
#         learning_rate=0.08, 
#         num_leaves=15, 
#         subsample=0.8, 
#         colsample_bytree=0.6, 
#         n_jobs=4,
#         categorical_feature = categorical_fea_index, 
    )
    model.fit(
        train_data, train_y.loc[train_index], 
        eval_set=(valid_data, train_y.loc[valid_index]), 
        early_stopping_rounds=200,
        verbose=200,
        eval_metric = tpr_weight_funtion_lgb
    )
    pred = model.predict_proba(valid_data)[:,1]
    prob[valid_index] = pred
#     prob[valid_index] = model.predict_proba(valid_data)[:, 1]
    
    cv_scores.append(tpr_weight_funtion(train_y.iloc[valid_index], pred))
    print(np.mean(cv_scores), cv_scores)
    
    test_prob += model.predict_proba(test_data)[:, 1]/kf.n_splits

************************************ 1 ************************************


Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[61]	valid_0's binary_logloss: 0.137295	valid_0's Weighted_Score: 0.408877
0.40887681159420286 [0.40887681159420286]
************************************ 2 ************************************


Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[68]	valid_0's binary_logloss: 0.141404	valid_0's Weighted_Score: 0.35308
0.3809782608695652 [0.40887681159420286, 0.35307971014492756]
************************************ 3 ************************************


Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[61]	valid_0's binary_logloss: 0.147347	valid_0's Weighted_Score: 0.347731
0.3698959730660985 [0.40887681159420286, 0.35307971014492756, 0.34773139745916515]
************************************ 4 ************************************


Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[70]	valid_0's binary_logloss: 0.140055	valid_0's Weighted_Score: 0.357713
0.36685029195928354 [0.40887681159420286, 0.35307971014492756, 0.34773139745916515, 0.35771324863883847]
************************************ 5 ************************************


Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[63]	valid_0's binary_logloss: 0.143421	valid_0's Weighted_Score: 0.38294
0.3700682553460112 [0.40887681159420286, 0.35307971014492756, 0.34773139745916515, 0.35771324863883847, 0.38294010889292196]


In [22]:
train['lgb_prob'] = prob
test['lgb_prob'] = test_prob

In [25]:
class score_forCAT(object):
    '''
    https://catboost.ai/docs/concepts/python-usages-examples.html#custom-loss-function-eval-metric
    '''
    def is_max_optimal(self):
        return True

    def evaluate(self, approxes, target, weight):

        ## weight没什么迪奥用。
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])
        
        d = pd.DataFrame()
        d['prob'] = list(approxes[0])
        d['y'] = list(target)
        d = d.sort_values(['prob'], ascending=[0])
        y = d.y
        PosAll = pd.Series(y).value_counts()[1]
        NegAll = pd.Series(y).value_counts()[0]
        pCumsum = d['y'].cumsum()
        nCumsum = np.arange(len(y)) - pCumsum + 1
        pCumsumPer = pCumsum / PosAll
        nCumsumPer = nCumsum / NegAll
        TR1 = pCumsumPer[abs(nCumsumPer-0.001).idxmin()]
        TR2 = pCumsumPer[abs(nCumsumPer-0.005).idxmin()]
        TR3 = pCumsumPer[abs(nCumsumPer-0.01).idxmin()]
        
        score = (0.4 * TR1 + 0.3 * TR2 + 0.3 * TR3)

        return score, -1
    
    def get_final_error(self, error, weight):
        # Returns final value of metric based on error and weight
        return error

kf = StratifiedKFold(5, True, random_state=1)
prob = np.zeros(len(train))
test_prob = np.zeros(len(test))

feat1=list(set(feat0 + cat_feat + ['lgb_prob']))

cv_scores = []

test_data=test[feat1].values
for idx, (train_index, valid_index) in enumerate(kf.split(train, train_y)):
#     print(list(train_y.iloc[train_index]))
    print('************************************ {} ************************************'.format(str(idx+1)))
    train_data = train.loc[train_index][feat1]
    valid_data = train.loc[valid_index][feat1]
    
#     params_CAT = {
#         'learning_rate': 0.1, 
#         'depth': 12, ## 曾经测试过15，但是一旦设置到15，就会奇慢无比
#         'l2_leaf_reg': 80, #100 #50 # 设为20，能够达到40xx的分数。 
#         'bootstrap_type': "Bernoulli", ## "Bayesian"比Bernoulli 略强点. 然后我们试试Bernoulli和subsample参数的组合吧。
#         # "bagging_temperature": 2, ## 这个参数要跟bayesian bootstrap方法组合起来用。
#         "subsample": 0.7, ## 窝槽，bernoulli+subsample加起来很吊啊。。。
#         # "sampling_frequency": "PerTree", ## 这个设了没设一样。

#         "grow_policy": "Depthwise", ## 这个有奇效。应该还有潜力可挖。
#         "min_data_in_leaf": 2, ## 这个也有效果。默认是1，我估计这样搞就太精确了，容易导致过拟合。所以把区间放大一点效果反倒好。
#         ## 上面两个，能够达到4236的分数。

#         ## 如果要用这个参数，不要弄onehot。都交给catboost吧。
#         "one_hot_max_size": 255, ## 能提高一点点。0.0002吧。

#         ## 设置的大了，性能可能会下降。原档案警告如上。
#         "fold_permutation_block": 2, ## 可能有效。可以关注一下。 设为2，达到了4250

#         ## Balanced跑了两次，最高得到了4286的结果。不太稳定。
#         "auto_class_weights": "SqrtBalanced",## 得到了4484，升级明显，多测几次，看看是不是偶然 

#         ## 这个参数酌情不设吧。因为这个的作用应该跟StratifiedKFold一样吧。
#         "allow_const_label": True,

#         'od_type': 'Iter', 
#         'od_wait': 300, 

#         ## MinEntropy和默认值比较好，其他的比较拉垮。
#         "feature_border_type": "MinEntropy", 

#         'random_seed': 11, 
#         'allow_writing_files': False, 
# #         "task_type": "GPU",
#     }
    params_CAT = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False, 
#                      "task_type": "GPU",
                     }
    model = CatBoostClassifier(
        **params_CAT, 
        #iterations=10000, learning_rate=0.08, depth=7, 
        cat_features=cat_feat, 
        eval_metric= score_forCAT(), use_best_model= True
    )
    model.fit(train_data, train_y.loc[train_index], 
              eval_set=(valid_data, train_y.loc[valid_index]), early_stopping_rounds=50, 
             verbose=500
#              eval_metric = score_forCAT()
             )
    pred = model.predict_proba(valid_data)[:,1]
    prob[valid_index] = pred
    test_prob += model.predict_proba(test_data)[:,1]/5
    
    cv_scores.append(tpr_weight_funtion(train_y.iloc[valid_index], pred))
    print(np.mean(cv_scores), cv_scores)



************************************ 1 ************************************
0:	learn: 0.2371429	test: 0.2702899	best: 0.2702899 (0)	total: 746ms	remaining: 12m 25s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.4432971014
bestIteration = 259

Shrink model to first 260 iterations.
0.44329710144927537 [0.44329710144927537]
************************************ 2 ************************************
0:	learn: 0.2759184	test: 0.2168478	best: 0.2168478 (0)	total: 733ms	remaining: 12m 11s


KeyboardInterrupt: 

In [None]:
test['label'] = test_prob
if not os.path.exists("output"):
    os.makedirs("output")
test[['id', 'label']].to_csv('output/1215_count_rank.csv', index=False)