In [1]:
import itertools
import random
import os 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import warnings
warnings.filterwarnings('ignore')
from sklearn import preprocessing 
from sklearn import metrics
from sklearn.model_selection import cross_val_score

## 查看数据

In [2]:
# Load the raw training table and the B-board test table from the local dataset folder.
train_data = pd.read_excel("./数据集/train.xlsx",index_col=False)
test_data = pd.read_excel("./数据集/test_B榜.xlsx",index_col=False)

In [3]:
# Inspect the label distribution (relative frequency of each LABEL value).
train_data["LABEL"].value_counts(normalize=True)

0    0.75
1    0.25
Name: LABEL, dtype: float64

In [4]:
# Preview the data: many cells contain '?', which appears to be another way of marking a missing value.
train_data.head(5)

Unnamed: 0,CUST_UID,LABEL,AGN_CNT_RCT_12_MON,ICO_CUR_MON_ACM_TRX_TM,NB_RCT_3_MON_LGN_TMS_AGV,AGN_CUR_YEAR_AMT,AGN_CUR_YEAR_WAG_AMT,AGN_AGR_LATEST_AGN_AMT,ICO_CUR_MON_ACM_TRX_AMT,COUNTER_CUR_YEAR_CNT_AMT,...,WTHR_OPN_ONL_ICO,EMP_NBR,REG_CPT,SHH_BCK,HLD_DMS_CCY_ACT_NBR,REG_DT,LGP_HLD_CARD_LVL,OPN_TM,NB_CTC_HLD_IDV_AIO_CARD_SITU,HLD_FGN_CCY_ACT_NBR
0,2a171d461bf24a739b804c2843ad4f6e,0,?,72,425.3,?,?,?,25879985.3,2,...,B,2,1200000002,32,12,1708.45,?,416.84,?,2
1,c58d5848d18548e297963c2d9e092699,0,2282,222,?,11776572.4,?,1964626.4,14755499.8,2,...,B,1002,5000002,32,22,199.42,F,195.87,D,2
2,74e6d8179e784a039bcb5a722014f4a4,0,?,2,?,?,?,4599822.3,2,2,...,B,2,?,2,12,1135.55,F,1122.0,,2
3,1b6a8be2c2e34de09ca00d71470ec180,1,?,?,22,?,?,?,?,2,...,A,2,500002,12,12,98.45,,92.0,C,2
4,1974b5e0440a41128a416a8cdc8a7c16,1,?,?,?,?,?,?,?,?,...,A,2,?,2,12,?,,355.87,C,2


In [5]:
# Column names, non-null counts and dtypes of the test frame.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 50 columns):
 #   Column                                   Non-Null Count  Dtype 
---  ------                                   --------------  ----- 
 0   CUST_UID                                 12000 non-null  object
 1   AGN_CNT_RCT_12_MON                       12000 non-null  object
 2   ICO_CUR_MON_ACM_TRX_TM                   12000 non-null  object
 3   NB_RCT_3_MON_LGN_TMS_AGV                 12000 non-null  object
 4   AGN_CUR_YEAR_AMT                         12000 non-null  object
 5   AGN_CUR_YEAR_WAG_AMT                     12000 non-null  object
 6   AGN_AGR_LATEST_AGN_AMT                   12000 non-null  object
 7   ICO_CUR_MON_ACM_TRX_AMT                  12000 non-null  object
 8   COUNTER_CUR_YEAR_CNT_AMT                 12000 non-null  object
 9   PUB_TO_PRV_TRX_AMT_CUR_YEAR              12000 non-null  object
 10  MON_12_EXT_SAM_TRSF_IN_AMT               12000 non-null  o

### 数据类型

In [6]:
# Read the feature description sheet and index it by field name ("字段名称"),
# so that each row describes one feature.
fea_type = (
    pd.read_excel("./数据集/特征说明.xlsx", index_col=False, header=1)
    .set_index("字段名称")
)

# Character-type features: the customer id is an identifier, not a feature, so it is removed first.
fea_type = fea_type[fea_type.index != "CUST_UID"]
is_char_type = fea_type["字符类型"] == "字符型"
char_type_fea = fea_type[is_char_type].index

In [7]:
# Numeric-type features: drop the label row and one excluded field from the
# description table, then keep the names whose declared type is numeric ("数值型").
fea_type = fea_type.loc[
    ~fea_type.index.isin(["LABEL", "CUR_MON_VAL_VLD_CUST_NED_HLD_YEAR_DAY_AVG"])
]
is_num_type = fea_type["字符类型"] == "数值型"
num_type_fea = fea_type[is_num_type].index

### 缺失分布

In [8]:
# Share of NaN cells per raw feature; only features that have at least one are shown.
raw_missing_ratio = train_data.isnull().mean()
raw_missing_ratio[raw_missing_ratio > 0]

MON_12_CUST_CNT_PTY_ID          0.570750
WTHR_OPN_ONL_ICO                0.031200
LGP_HLD_CARD_LVL                0.318000
NB_CTC_HLD_IDV_AIO_CARD_SITU    0.070025
dtype: float64

In [9]:
# Replace every NaN cell of the training frame with the constant 1.
# NOTE(review): this rebinds train_data, so all later cells see the filled frame, and the
# same constant is written into text and numeric columns alike — confirm that is intended.
train_data = train_data.fillna(1)

In [10]:
# Turn the '?' placeholders into np.nan so they are counted as missing values.
new_train_data = train_data.replace('?', np.nan)

# Missing-value share per feature after the replacement; features above 10% are
# remembered in miss_fea and printed.
missing_ratio = new_train_data.isnull().mean()
high_missing = missing_ratio[missing_ratio > 0.1]
miss_fea = high_missing.index
print(high_missing)

AGN_CNT_RCT_12_MON                  0.535200
AGN_CUR_YEAR_AMT                    0.563525
AGN_CUR_YEAR_WAG_AMT                0.699625
AGN_AGR_LATEST_AGN_AMT              0.489825
MON_12_TRX_AMT_MAX_AMT_PCTT         0.240000
CUR_YEAR_PUB_TO_PRV_TRX_PTY_CNT     0.503400
MON_6_50_UP_ENTR_ACT_CNT            0.165500
MON_6_50_UP_LVE_ACT_CNT             0.165500
MON_12_ACT_OUT_50_UP_CNT_PTY_QTY    0.146500
MON_12_ACT_IN_50_UP_CNT_PTY_QTY     0.146500
LGP_HLD_CARD_LVL                    0.100800
NB_CTC_HLD_IDV_AIO_CARD_SITU        0.100800
dtype: float64


### 重复值查看

In [11]:
# Row-wise duplicate flags for the training frame (True marks a repeat of an earlier row).
train_data.duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
39995    False
39996    False
39997    False
39998    False
39999    False
Length: 40000, dtype: bool

## 数据无量纲

In [12]:
### log

def data_log(num_type_fea,data,num=1):
    """Apply a sign-preserving log transform to the given numeric columns.

    For each column named in ``num_type_fea``:

    * if every non-missing value satisfies ``value + num - 1 >= 0`` the column
      is replaced by ``log(value + num)``;
    * otherwise each value ``v`` becomes ``log(v + num)`` when ``v >= 0`` and
      ``-log(|v| + num)`` when ``v < 0``.

    Missing values stay missing. The transform is computed on the column itself,
    so the row index of ``data`` is preserved whatever its labels are.

    Parameters
    ----------
    num_type_fea : iterable of column labels
        Columns of ``data`` to transform.
    data : pandas.DataFrame
        Frame holding the columns. It is modified in place (the listed columns
        are overwritten) and also returned.
    num : number, default 1
        Offset added before taking the logarithm.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in as ``data``.

    Raises
    ------
    KeyError
        If a listed column is absent from ``data``.
    ValueError, TypeError
        If a listed column holds values that cannot be read as numbers.
    """
    for col in num_type_fea:
        # Columns read from a sheet with placeholder strings can be object dtype even
        # when every remaining value is a number; NumPy ufuncs need a numeric dtype.
        values = pd.to_numeric(data[col])

        # Series.min() skips NaN, so missing cells cannot decide which branch is taken.
        if (values + num - 1).min() >= 0:
            data[col] = np.log(values + num)
        else:
            # For v >= 0, |v| + num equals v + num, so one magnitude serves both signs.
            magnitude = np.log(values.abs() + num)
            # Series.where keeps the original row labels, so the assignment cannot
            # misalign rows when the index is not 0..n-1.
            data[col] = magnitude.where(values >= 0, -magnitude)
    return data

### ordinal encoding (the function below is named One_Hot but applies sklearn's OrdinalEncoder)

def One_Hot(char_type_fea,train_X,test_X,usetype="train"):
    """Ordinal-encode the categorical columns of the train or the test frame.

    Despite its name, this function uses ``sklearn.preprocessing.OrdinalEncoder``:
    every category is replaced by a single code, no indicator columns are created.

    Parameters
    ----------
    char_type_fea : iterable of column labels
        Categorical columns to encode.
    train_X : pandas.DataFrame
        Training frame. With ``usetype == "train"`` its columns are overwritten in
        place; otherwise it is only used to fit the encoders.
    test_X : pandas.DataFrame
        Test frame. Only touched (in place) when ``usetype`` is not ``"train"``.
    usetype : str, default "train"
        ``"train"`` encodes and returns ``train_X``; any other value encodes
        ``test_X`` with encoders fitted on ``train_X`` (categories not seen in the
        training column are encoded as -1) and returns ``test_X``.

    Returns
    -------
    pandas.DataFrame
        ``train_X`` or ``test_X``, depending on ``usetype``.
    """
    from sklearn.preprocessing import OrdinalEncoder

    def _as_column(frame, name):
        # The encoder expects a 2-D (n_samples, 1) array.
        return frame.loc[:, name].values.reshape(-1, 1)

    if usetype != "train":
        for feature in char_type_fea:
            encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
            encoder = encoder.fit(_as_column(train_X, feature))
            test_X.loc[:, feature] = encoder.transform(_as_column(test_X, feature))
        return test_X

    for feature in char_type_fea:
        train_X.loc[:, feature] = OrdinalEncoder().fit_transform(_as_column(train_X, feature))
    return train_X

### 特征构造

def GroupbyFeature(column,train,test,newcolumn,label='LABEL',fill_value=None):
    """Add a target-mean encoding of ``column`` to both frames.

    The mean of ``label`` is computed per category of ``column`` on ``train`` only,
    and that mapping is then applied to ``train`` and ``test`` alike, so new data is
    always processed exactly like the training data.

    Note that each training row is encoded with a mean that includes its own label.

    Parameters
    ----------
    column : column label
        Categorical column to encode.
    train : pandas.DataFrame
        Training frame; must contain ``column`` and ``label``. Modified in place.
    test : pandas.DataFrame
        Test frame; must contain ``column``. Modified in place.
    newcolumn : column label
        Name of the column that receives the encoded values.
    label : column label, default 'LABEL'
        Target column whose per-category mean is used.
    fill_value : scalar or None, default None
        Value written where a category has no mapping (for example a category that
        occurs only in ``test``). ``None`` leaves such rows as NaN.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` — the same objects that were passed in.
    """
    # Learn the category -> mean(label) mapping from the training frame only.
    category_mean = train.groupby(column)[label].mean().to_dict()

    # Build the new feature on both frames with the same mapping.
    train[newcolumn] = train[column].map(category_mean)
    test[newcolumn] = test[column].map(category_mean)

    if fill_value is not None:
        train[newcolumn] = train[newcolumn].fillna(fill_value)
        test[newcolumn] = test[newcolumn].fillna(fill_value)

    return train,test


## 预测结果存储

In [13]:
def get_result(predictor,new_test_data,test_data):
    """Predict probabilities for the test set and write a submission file.

    The file is tab-separated, has no header and no index, and holds the
    ``CUST_UID`` column of ``test_data`` followed by the predicted probability
    formatted with ten decimals. It is written to
    ``./testB/<YYYY-MM-DD>/submission<HH_MM_SS>.txt`` (relative to the current
    working directory); the dated folder is created when missing.

    Parameters
    ----------
    predictor : object
        Must provide ``predict_proba(data, as_multiclass=False)``. The result is
        concatenated column-wise with the id column, so it has to be a pandas
        object aligned on the same row index as ``test_data``.
    new_test_data : pandas.DataFrame
        Pre-processed features handed to the predictor.
    test_data : pandas.DataFrame
        Original test frame; only its ``CUST_UID`` column is used.

    Returns
    -------
    None
    """
    from datetime import datetime

    # Show 10 decimals when frames are displayed. The fully qualified option name is
    # used because the bare 'precision' pattern is not accepted by recent pandas.
    pd.set_option('display.precision', 10)
    y_predproba = predictor.predict_proba(new_test_data, as_multiclass=False)
    submit = pd.concat([test_data["CUST_UID"],y_predproba],axis=1)

    # One timestamp for both the folder (date) and the file name (time of day).
    now = datetime.now()
    day_now = now.strftime('%Y-%m-%d')
    time_now = now.strftime("%H_%M_%S")

    Folder_path = "./testB/{a}/".format(a=day_now)
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(Folder_path, exist_ok=True)
    # Every call writes a new time-stamped file, so earlier submissions are kept.
    submit.to_csv(Folder_path+"submission{time}.txt".format(time=time_now)
                  ,index=False,header=False,encoding='utf-8',sep='\t',float_format='%.10f')

## 模型融合

### 已有模型预测

In [15]:
# Training data: '?' marks a missing value. Rows that are entirely empty are dropped,
# remaining gaps become 0, the id column is removed and the numeric features are log-scaled.
new_train_data = (
    train_data.replace('?', np.nan)
    .dropna(how="all")
    .fillna(0)
    .drop(columns=["CUST_UID"])
)
new_train_data = data_log(num_type_fea, new_train_data, num=1)

# Test data: NaN cells are first filled with 1 (this rebinds test_data), then the
# '?' placeholders get the same treatment as in the training data.
test_missing_ratio = test_data.isnull().mean()
test_missing_ratio[test_missing_ratio > 0]  # mid-cell expression, not displayed
test_data = test_data.fillna(1)

new_test_data = (
    test_data.replace('?', np.nan)
    .fillna(0)
    .drop(columns=["CUST_UID"])
)
new_test_data = data_log(num_type_fea, new_test_data, num=1)

In [18]:
### auto-weight
# Reload a previously trained AutoGluon predictor from disk and write a submission with it.
# NOTE(review): save_path is an absolute path on the author's Windows machine and has to be
# adapted before this cell can run anywhere else.
from autogluon.tabular import TabularDataset, TabularPredictor

save_path = r"F:\Anaconda\Kaggle\招商银行\AutogluonModels\ag-20220509_010141"
predictor = TabularPredictor.load(save_path) 
get_result(predictor,new_test_data,test_data)

In [46]:
### modal — presumably the 'multimodal' run (the load log lists a TextPredictor model); confirm
# Reload a second saved predictor and write a submission with it.
# NOTE(review): save_path is an absolute path on the author's Windows machine.
from autogluon.tabular import TabularDataset, TabularPredictor
save_path = r"F:\Anaconda\Kaggle\招商银行\AutogluonModels\ag-20220507_152028"
predictor = TabularPredictor.load(save_path) 
get_result(predictor,new_test_data,test_data)

Loading: F:\Anaconda\Kaggle\招商银行\AutogluonModels\ag-20220507_152028\predictor.pkl
Loading: F:\Anaconda\Kaggle\招商银行\AutogluonModels\ag-20220507_152028\learner.pkl
Loading: F:\Anaconda\Kaggle\招商银行\AutogluonModels\ag-20220507_152028\models\trainer.pkl
Loading: F:\Anaconda\Kaggle\招商银行\AutogluonModels\ag-20220507_152028\models\WeightedEnsemble_L2\model.pkl
Loading: F:\Anaconda\Kaggle\招商银行\AutogluonModels\ag-20220507_152028\models\CatBoost_BAG_L1\model.pkl
Loading: F:\Anaconda\Kaggle\招商银行\AutogluonModels\ag-20220507_152028\models\LightGBMLarge_BAG_L1\model.pkl
Loading: F:\Anaconda\Kaggle\招商银行\AutogluonModels\ag-20220507_152028\models\LightGBMXT_BAG_L1\model.pkl
Loading: F:\Anaconda\Kaggle\招商银行\AutogluonModels\ag-20220507_152028\models\LightGBM_BAG_L1\model.pkl
Loading: F:\Anaconda\Kaggle\招商银行\AutogluonModels\ag-20220507_152028\models\NeuralNetTorch_BAG_L1\model.pkl
Loading: F:\Anaconda\Kaggle\招商银行\AutogluonModels\ag-20220507_152028\models\TextPredictor_BAG_L1\model.pkl
Load pretrained checkp

Predicting: 100%|████████████████████████████████████████████████████████████████████| 188/188 [00:47<00:00,  3.97it/s]


Load pretrained checkpoint: F:\Anaconda\Kaggle\招商银行\AutogluonModels\ag-20220507_152028\models\TextPredictor_BAG_L1\S1F2\text_nn\model.ckpt


Predicting:   9%|██████▏                                                              | 15/188 [00:12<02:20,  1.23it/s]

### 加入A榜数据

In [31]:
# Reload the saved predictor that labels the A-board test set in the next cell.
# NOTE(review): relies on TabularPredictor having been imported by an earlier cell, and on an
# absolute path that only exists on the author's machine.
save_path = r"F:\Anaconda\Kaggle\招商银行\AutogluonModels\ag-20220509_010141"
predictor = TabularPredictor.load(save_path)

In [33]:
# Extend the training set with the A-board test set, labelled by the loaded predictor
# (pseudo-labelling).
test_A = pd.read_excel("./数据集/test_A榜.xlsx", index_col=False)

# A-board data: same missing-value handling as for the B-board test set.
test_A_missing_ratio = test_A.isnull().mean()
test_A_missing_ratio[test_A_missing_ratio > 0]  # mid-cell expression, not displayed
test_A = test_A.fillna(1)

new_test_A = (
    test_A.replace('?', np.nan)  # '?' marks a missing value
    .fillna(0)
    .drop(columns=["CUST_UID"])
)
new_test_A = data_log(num_type_fea, new_test_A, num=1)

# The predicted classes become the LABEL column of the A-board rows.
predictions = predictor.predict(new_test_A)
A_LABEL = pd.DataFrame(predictions, columns=["LABEL"])
new_train_A = pd.concat([A_LABEL, new_test_A], axis=1)

# Stack the genuine training rows on top of the pseudo-labelled rows, renumbering from 0.
new_train_B = pd.concat([new_train_data, new_train_A], axis=0, ignore_index=True)

In [95]:
# Persist the processed frames (plain training set, pseudo-label-augmented training set,
# processed test set) so later sessions can reload them instead of recomputing.
new_train_data.to_excel("./数据集/new_train_data.xlsx",index=False)
new_train_B.to_excel("./数据集/new_train_B.xlsx",index=False)
new_test_data.to_excel("./数据集/new_test_data.xlsx",index=False)

In [37]:
# Test data: NaN cells are first filled with 1 (this rebinds test_data), then the '?'
# placeholders become 0, the id column is removed and the numeric features are log-scaled.
test_missing_ratio = test_data.isnull().mean()
test_missing_ratio[test_missing_ratio > 0]  # mid-cell expression, not displayed
test_data = test_data.fillna(1)

new_test_data = (
    test_data.replace('?', np.nan)  # '?' marks a missing value
    .fillna(0)
    .drop(columns=["CUST_UID"])
)
new_test_data = data_log(num_type_fea, new_test_data, num=1)

In [45]:
# Train a new AutoGluon predictor on the pseudo-label-augmented training set (new_train_B)
# and write a submission for the B-board test set.
from autogluon.tabular import TabularDataset, TabularPredictor

label = "LABEL"
eval_metric = 'roc_auc'
# Earlier configuration, kept for reference:
# predictor = TabularPredictor(label= label).fit(train_data=train_data,
#                                                num_bag_folds=5, num_bag_sets=1, num_stack_levels=2,
#                                                ag_args_fit={'num_gpus': 0})

# NOTE(review): the recorded log for this cell reports "auto_weight currently does not use
# any sample weights", so the sample_weight setting below had no effect in that run.
predictor = TabularPredictor(label=label, eval_metric=eval_metric, verbosity=3
#                              ,sample_weight = "balance_weight"
                             ,sample_weight = "auto_weight"   
                            )\
                            .fit(
                                    new_train_B
                                    ,ag_args_fit={'num_gpus': 1}
#                                     ,hyperparameters = 'multimodal'                                  
                                    , presets='best_quality'
                                    # time budget in seconds (5 hours)
                                    , time_limit=3600*5
                                )
get_result(predictor,new_test_data,test_data)

auto_weight currently does not use any sample weights.
No path specified. Models will be saved in: "AutogluonModels/ag-20220510_020337\"
Presets specified: ['best_quality']
User Specified kwargs:
{'ag_args_fit': {'num_gpus': 1}, 'auto_stack': True}
Full kwargs:
{'_feature_generator_kwargs': None,
 '_save_bag_folds': None,
 'ag_args': None,
 'ag_args_ensemble': None,
 'ag_args_fit': {'num_gpus': 1},
 'auto_stack': True,
 'calibrate': 'auto',
 'excluded_model_types': None,
 'feature_generator': 'auto',
 'feature_prune_kwargs': None,
 'holdout_frac': None,
 'hyperparameter_tune_kwargs': None,
 'keep_only_best': False,
 'name_suffix': None,
 'num_bag_folds': None,
 'num_bag_sets': None,
 'num_stack_levels': None,
 'pseudo_data': None,
 'quantile_levels': None,
 'refit_full': False,
 'save_space': False,
 'set_best_to_refit_full': False,
 'unlabeled_data': None,
 'use_bag_holdout': False,
 'verbosity': 3}
Saving AutogluonModels/ag-20220510_020337\learner.pkl
Saving AutogluonModels/ag-202205

	Types of features in original data (raw dtype, special dtypes):
		('float', [])                      : 45 | ['AGN_CNT_RCT_12_MON', 'ICO_CUR_MON_ACM_TRX_TM', 'NB_RCT_3_MON_LGN_TMS_AGV', 'AGN_CUR_YEAR_AMT', 'AGN_CUR_YEAR_WAG_AMT', ...]
		('object', [])                     :  2 | ['WTHR_OPN_ONL_ICO', 'NB_CTC_HLD_IDV_AIO_CARD_SITU']
		('object', ['datetime_as_object']) :  2 | ['MON_12_CUST_CNT_PTY_ID', 'LGP_HLD_CARD_LVL']
	Types of features in processed data (exact raw dtype, raw dtype):
		('category', 'category') :  2 | ['WTHR_OPN_ONL_ICO', 'NB_CTC_HLD_IDV_AIO_CARD_SITU']
		('float64', 'float')     : 45 | ['AGN_CNT_RCT_12_MON', 'ICO_CUR_MON_ACM_TRX_TM', 'NB_RCT_3_MON_LGN_TMS_AGV', 'AGN_CUR_YEAR_AMT', 'AGN_CUR_YEAR_WAG_AMT', ...]
		('int64', 'int')         :  2 | ['MON_12_CUST_CNT_PTY_ID', 'LGP_HLD_CARD_LVL']
	Types of features in processed data (raw dtype, special dtypes):
		('category', [])             :  2 | ['WTHR_OPN_ONL_ICO', 'NB_CTC_HLD_IDV_AIO_CARD_SITU']
		('float', [])          

### 特征选择

In [124]:
# Training data: '?' marks a missing value. Rows that are entirely empty are dropped,
# remaining gaps become 0, the id column is removed and the numeric features are log-scaled.
new_train_data = (
    train_data.replace('?', np.nan)
    .dropna(how="all")
    .fillna(0)
    .drop(columns=["CUST_UID"])
)
new_train_data = data_log(num_type_fea, new_train_data, num=1)

# Test data: NaN cells are first filled with 1 (this rebinds test_data), then the
# '?' placeholders get the same treatment as in the training data.
test_missing_ratio = test_data.isnull().mean()
test_missing_ratio[test_missing_ratio > 0]  # mid-cell expression, not displayed
test_data = test_data.fillna(1)

new_test_data = (
    test_data.replace('?', np.nan)
    .fillna(0)
    .drop(columns=["CUST_UID"])
)
new_test_data = data_log(num_type_fea, new_test_data, num=1)

# Feature construction: one "<column>_ratio" target-mean feature per categorical column.
for col in char_type_fea:
    new_train_data, new_test_data = GroupbyFeature(
        col, new_train_data, new_test_data, col + '_ratio'
    )

In [88]:
# Compute interpretable feature importances with a default AutoGluon fit and cache them
# in an Excel file for the later cells.
from autogluon.tabular import TabularDataset, TabularPredictor
label = "LABEL"
eval_metric = 'roc_auc'
predictor = TabularPredictor(label=label, eval_metric=eval_metric, verbosity=0)\
                            .fit(
                                    new_train_data
                                    ,ag_args_fit={'num_gpus': 1}
                                )

# NOTE(review): the importances are evaluated on the same frame the predictor was fitted on;
# consider a held-out split if an unbiased ranking is needed.
fea_importance = predictor.feature_importance(new_train_data)
fea_importance .to_excel("./fea_importance.xlsx")



In [89]:
# Reload the cached importance table; the feature names were written as the unnamed
# first column of the sheet, so that column becomes the index again.
fea_importance = (
    pd.read_excel(r"./fea_importance.xlsx", index_col=False)
    .set_index("Unnamed: 0")
)

In [94]:
# Forward selection driven by the feature-importance ranking: features are added one at a
# time in the order of the importance table, a predictor is fitted on the growing subset,
# and the validation score and name of the best model are recorded after every step.
# NOTE(review): relies on TabularPredictor having been imported by an earlier cell.

# Constant for every fit, so set once outside the loop.
label = "LABEL"
eval_metric = 'roc_auc'

cols = ["LABEL"]
score_val = []
model_name = []
for col in fea_importance.index:
    cols.append(col)
    new_train_fea = new_train_data[cols]

    predictor = TabularPredictor(label=label, eval_metric=eval_metric, verbosity=0)\
                                .fit(
                                        new_train_fea
                                        ,ag_args_fit={'num_gpus': 1}
                                    )
    # Build the leaderboard once per fit and read both fields from its first row by
    # position, so the lookup does not depend on the row labels of the returned frame.
    leaderboard = predictor.leaderboard(silent=True)
    score_val.append(leaderboard["score_val"].iloc[0])
    model_name.append(leaderboard["model"].iloc[0])



In [115]:
# Store the forward-selection trace (best validation score and best model name per step)
# side by side in one sheet.
scores_frame = pd.DataFrame(score_val, columns=["score_val"])
models_frame = pd.DataFrame(model_name, columns=["model_name"])
selection_trace = pd.concat([scores_frame, models_frame], axis=1)
selection_trace.to_excel("./score_val.xlsx", index=False)

### 普通TabularPredictor

In [20]:
# Reload the cached importance table; the feature names were written as the unnamed
# first column of the sheet, so that column becomes the index again.
fea_importance = (
    pd.read_excel(r"./fea_importance.xlsx", index_col=False)
    .set_index("Unnamed: 0")
)

# Reload the forward-selection scores saved earlier.
score_val = pd.read_excel("./score_val.xlsx", index_col=False)

In [42]:
# Training data: '?' marks a missing value. Rows that are entirely empty are dropped,
# remaining gaps become 0, the id column is removed and the numeric features are log-scaled.
new_train_data = (
    train_data.replace('?', np.nan)
    .dropna(how="all")
    .fillna(0)
    .drop(columns=["CUST_UID"])
)
new_train_data = data_log(num_type_fea, new_train_data, num=1)

# Test data: NaN cells are first filled with 1 (this rebinds test_data), then the
# '?' placeholders get the same treatment as in the training data.
test_missing_ratio = test_data.isnull().mean()
test_missing_ratio[test_missing_ratio > 0]  # mid-cell expression, not displayed
test_data = test_data.fillna(1)

new_test_data = (
    test_data.replace('?', np.nan)
    .fillna(0)
    .drop(columns=["CUST_UID"])
)
new_test_data = data_log(num_type_fea, new_test_data, num=1)

# Feature construction: one "<column>_ratio" target-mean feature per categorical column.
for col in char_type_fea:
    new_train_data, new_test_data = GroupbyFeature(
        col, new_train_data, new_test_data, col + '_ratio'
    )

In [48]:
# Keep only the fea_num highest-ranked features (plus the label for training).
# Note: this rebinds new_train_data / new_test_data, so the cell is not re-runnable
# without re-running the preprocessing cell first.
fea_num = 5
test_col = fea_importance.index.tolist()[:fea_num]
train_col = test_col + ["LABEL"]
new_train_data = new_train_data[train_col]
new_test_data = new_test_data[test_col]

In [49]:
# Fit a default AutoGluon predictor on the selected feature subset, weighting samples with
# the "balance_weight" setting, then write a submission.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, i.e. the run that
# was saved with the notebook did not finish.
from autogluon.tabular import TabularDataset, TabularPredictor

label = "LABEL"
eval_metric = 'roc_auc'
# Earlier configuration, kept for reference:
# predictor = TabularPredictor(label= label).fit(train_data=train_data,
#                                                num_bag_folds=5, num_bag_sets=1, num_stack_levels=2,
#                                                ag_args_fit={'num_gpus': 0})

predictor = TabularPredictor(label=label, eval_metric=eval_metric, verbosity=0
                            ,sample_weight = "balance_weight")\
                            .fit(
                                    new_train_data
                                    ,ag_args_fit={'num_gpus': 1}
#                                     ,hyperparameters = 'multimodal',num_stack_levels=1,num_bag_folds=5                                   
#                                     , presets='best_quality'
#                                     , time_limit=3600*13
                                )

get_result(predictor,new_test_data,test_data)

KeyboardInterrupt: 

### multimodal + subset feature

In [14]:
# Reload the cached importance table; the feature names were written as the unnamed
# first column of the sheet, so that column becomes the index again.
fea_importance = (
    pd.read_excel(r"./fea_importance.xlsx", index_col=False)
    .set_index("Unnamed: 0")
)

# Reload the forward-selection scores saved earlier.
score_val = pd.read_excel("./score_val.xlsx", index_col=False)

In [51]:
# Training data: '?' marks a missing value. Rows that are entirely empty are dropped,
# remaining gaps become 0, the id column is removed and the numeric features are log-scaled.
new_train_data = (
    train_data.replace('?', np.nan)
    .dropna(how="all")
    .fillna(0)
    .drop(columns=["CUST_UID"])
)
new_train_data = data_log(num_type_fea, new_train_data, num=1)

# Test data: NaN cells are first filled with 1 (this rebinds test_data), then the
# '?' placeholders get the same treatment as in the training data.
test_missing_ratio = test_data.isnull().mean()
test_missing_ratio[test_missing_ratio > 0]  # mid-cell expression, not displayed
test_data = test_data.fillna(1)

new_test_data = (
    test_data.replace('?', np.nan)
    .fillna(0)
    .drop(columns=["CUST_UID"])
)
new_test_data = data_log(num_type_fea, new_test_data, num=1)

# Feature construction: one "<column>_ratio" target-mean feature per categorical column.
for col in char_type_fea:
    new_train_data, new_test_data = GroupbyFeature(
        col, new_train_data, new_test_data, col + '_ratio'
    )

In [31]:
# Keep only the fea_num highest-ranked features (plus the label for training).
# Note: this rebinds new_train_data / new_test_data, so the cell is not re-runnable
# without re-running the preprocessing cell first.
fea_num = 15
test_col = fea_importance.index.tolist()[:fea_num]
train_col = test_col + ["LABEL"]
new_train_data = new_train_data[train_col]
new_test_data = new_test_data[test_col]

In [32]:
# How many of the selected top features belong to the high-missing-rate group (miss_fea)?
selected_features = fea_importance.index.tolist()[:fea_num]
miss_num = sum(1 for feature in selected_features if feature in miss_fea)
print(miss_num)


1


In [None]:
# Fit AutoGluon with the 'multimodal' hyperparameter preset on the selected feature subset
# (one stacking level, 5 bagging folds, "balance_weight" sample weighting), then write a
# submission.
from autogluon.tabular import TabularDataset, TabularPredictor

label = "LABEL"
eval_metric = 'roc_auc'
# Earlier configuration, kept for reference:
# predictor = TabularPredictor(label= label).fit(train_data=train_data,
#                                                num_bag_folds=5, num_bag_sets=1, num_stack_levels=2,
#                                                ag_args_fit={'num_gpus': 0})

predictor = TabularPredictor(label=label, eval_metric=eval_metric, verbosity=3
                            ,sample_weight = "balance_weight")\
                            .fit(
                                    new_train_data
                                    ,ag_args_fit={'num_gpus': 1}
                                    ,hyperparameters = 'multimodal',num_stack_levels=1,num_bag_folds=5                                   
#                                     , presets='best_quality'
#                                     , time_limit=3600*13
                                )

get_result(predictor,new_test_data,test_data)

### KBD + subset feature

In [14]:
# Reload the cached importance table; the feature names were written as the unnamed
# first column of the sheet, so that column becomes the index again.
fea_importance = (
    pd.read_excel(r"./fea_importance.xlsx", index_col=False)
    .set_index("Unnamed: 0")
)

# Reload the forward-selection scores saved earlier.
score_val = pd.read_excel("./score_val.xlsx", index_col=False)

In [20]:
# Training data: '?' marks a missing value. Rows that are entirely empty are dropped,
# remaining gaps become 0, the id column is removed and the numeric features are log-scaled.
new_train_data = (
    train_data.replace('?', np.nan)
    .dropna(how="all")
    .fillna(0)
    .drop(columns=["CUST_UID"])
)
new_train_data = data_log(num_type_fea, new_train_data, num=1)
# new_train_B = pd.read_excel("./数据集/new_train_B.xlsx")

# Test data: NaN cells are first filled with 1 (this rebinds test_data), then the
# '?' placeholders get the same treatment as in the training data.
test_missing_ratio = test_data.isnull().mean()
test_missing_ratio[test_missing_ratio > 0]  # mid-cell expression, not displayed
test_data = test_data.fillna(1)

new_test_data = (
    test_data.replace('?', np.nan)
    .fillna(0)
    .drop(columns=["CUST_UID"])
)
new_test_data = data_log(num_type_fea, new_test_data, num=1)

# Feature construction: one "<column>_ratio" target-mean feature per categorical column.
for col in char_type_fea:
    new_train_data, new_test_data = GroupbyFeature(
        col, new_train_data, new_test_data, col + '_ratio'
    )

In [21]:
# Use every feature listed in the importance table (plus the label for training).
# Note: this rebinds new_train_data / new_test_data, so the cell is not re-runnable
# without re-running the preprocessing cell first.
fea_num = len(fea_importance)
test_col = fea_importance.index.tolist()[:fea_num]
train_col = test_col + ["LABEL"]
new_train_data = new_train_data[train_col]
new_test_data = new_test_data[test_col]

In [22]:
from sklearn.preprocessing import KBinsDiscretizer as KBD 

# Discretise every selected numeric feature into 18 ordinal bins whose edges come from a
# 1-D k-means clustering of the training values; the result goes into "<column>_KBD".
enc = KBD(
    n_bins=18
    ,encode='ordinal'
    ,strategy='kmeans'
)

for col in train_col:
    if col not in num_type_fea:
        continue
    new_col = col+"_KBD"
    # Fit on the training column once and reuse those bin edges for the test column,
    # instead of fitting the same encoder on the same data a second time.
    new_train_data[new_col] = enc.fit_transform(pd.DataFrame(new_train_data[col]))
    if col in test_col:
        new_test_data[new_col] = enc.transform(pd.DataFrame(new_test_data[col]))

In [23]:
# The raw numeric columns that were selected are dropped from both frames
# (their "_KBD" counterparts are kept).
drop_fea = [feature for feature in num_type_fea if feature in train_col]
new_train_data = new_train_data.drop(columns=drop_fea)
new_test_data = new_test_data.drop(columns=drop_fea)

In [24]:
# Fit AutoGluon with the 'best_quality' preset on the discretised features, using the
# "balance_weight" sample weighting and a 5-hour time limit.
# NOTE(review): the recorded output of this cell contains a ray RayOutOfMemoryError raised
# while the models were being fitted.
from autogluon.tabular import TabularDataset, TabularPredictor

label = "LABEL"
eval_metric = 'roc_auc'
# Earlier configuration, kept for reference:
# predictor = TabularPredictor(label= label).fit(train_data=train_data,
#                                                num_bag_folds=5, num_bag_sets=1, num_stack_levels=2,
#                                                ag_args_fit={'num_gpus': 0})

predictor = TabularPredictor(label=label, eval_metric=eval_metric, verbosity=0
                             ,sample_weight = "balance_weight"
#                              ,sample_weight = "auto_weight"   
                            )\
                            .fit(
                                    new_train_data
                                    ,ag_args_fit={'num_gpus': 1}
#                                     ,hyperparameters = 'multimodal',num_stack_levels=1,num_bag_folds=5                                  
                                    , presets='best_quality'
                                    # time budget in seconds (5 hours)
                                    , time_limit=3600*5
                                )


		[36mray::_ray_fit()[39m (pid=4680, ip=127.0.0.1)
  File "python\ray\_raylet.pyx", line 596, in ray._raylet.execute_task
  File "E:\Python\anaconda\envs\Pytorch\lib\site-packages\ray\_private\memory_monitor.py", line 158, in raise_if_low_memory
    self.error_threshold))
ray._private.memory_monitor.RayOutOfMemoryError: More than 95% of the memory on node LC-DIY2020UZBFD is used (15.9 / 15.92 GB). The top 10 memory consumers are:

PID	MEM	COMMAND
5448	1.58GiB	E:\Python\anaconda\envs\Pytorch\python.exe E:\Python\anaconda\envs\Pytorch\lib\site-packages\ray\wor
8612	1.01GiB	E:\Python\anaconda\envs\Pytorch\python.exe E:\Python\anaconda\envs\Pytorch\lib\site-packages\ray\wor
4196	0.7GiB	E:\Python\anaconda\envs\Pytorch\python.exe E:\Python\anaconda\envs\Pytorch\lib\site-packages\ray\wor
13248	0.6GiB	E:\Python\anaconda\envs\Pytorch\python.exe E:\Python\anaconda\envs\Pytorch\lib\site-packages\ray\wor
11152	0.28GiB	E:\Python\anaconda\envs\Pytorch\python.exe E:\Python\anaconda\envs\Pytorch\lib



In [25]:
# Print the summary of the fit (per-model validation score, timings, stack level).
predictor.fit_summary(verbosity=1)

*** Summary of fit() ***
Estimated performance of each model:
                      model  score_val  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       WeightedEnsemble_L3   0.940178     268.265235  4367.967899                0.008977           7.818259            3       True         21
1       WeightedEnsemble_L2   0.940120     150.778687  2779.850537                0.009973           9.518566            2       True         11
2           CatBoost_BAG_L2   0.939940     507.944576  4410.162146              303.874515        1621.085564            2       True         16
3           LightGBM_BAG_L2   0.939923     220.794470  3105.623621               16.724409         316.547039            2       True         13
4         LightGBMXT_BAG_L2   0.939903     223.776563  3124.119667               19.706502         335.043086            2       True         12
5      LightGBMLarge_BAG_L2   0.939902     225.051263  3688.021961  

{'model_types': {'KNeighborsUnif_BAG_L1': 'StackerEnsembleModel_KNN',
  'KNeighborsDist_BAG_L1': 'StackerEnsembleModel_KNN',
  'LightGBMXT_BAG_L1': 'StackerEnsembleModel_LGB',
  'LightGBM_BAG_L1': 'StackerEnsembleModel_LGB',
  'RandomForestGini_BAG_L1': 'StackerEnsembleModel_RF',
  'RandomForestEntr_BAG_L1': 'StackerEnsembleModel_RF',
  'ExtraTreesGini_BAG_L1': 'StackerEnsembleModel_XT',
  'ExtraTreesEntr_BAG_L1': 'StackerEnsembleModel_XT',
  'XGBoost_BAG_L1': 'StackerEnsembleModel_XGBoost',
  'LightGBMLarge_BAG_L1': 'StackerEnsembleModel_LGB',
  'WeightedEnsemble_L2': 'WeightedEnsembleModel',
  'LightGBMXT_BAG_L2': 'StackerEnsembleModel_LGB',
  'LightGBM_BAG_L2': 'StackerEnsembleModel_LGB',
  'RandomForestGini_BAG_L2': 'StackerEnsembleModel_RF',
  'RandomForestEntr_BAG_L2': 'StackerEnsembleModel_RF',
  'CatBoost_BAG_L2': 'StackerEnsembleModel_CatBoost',
  'ExtraTreesGini_BAG_L2': 'StackerEnsembleModel_XT',
  'ExtraTreesEntr_BAG_L2': 'StackerEnsembleModel_XT',
  'XGBoost_BAG_L2': 'Stac

In [26]:
# Score the processed test features with the fitted predictor and write the submission file.
get_result(predictor,new_test_data,test_data)