# Release Notes

上一个版本：`3.4-FE-SomeAvgAndIncrementalProportion-ChangeHP_CAT, 2.0-EDA-1-forQuarter4`

我们从一开始就使用featuretools来搞。这样就要完全重新设计特征计算的思路了。

注意，这里的代码，最好只跑一次，把结果暂存本地之后，重复使用。

# Setting working directory

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
os.chdir('/content/drive/My Drive/Colab Notebooks/MachineLearningPractice/XiamenIntlBank')

FileNotFoundError: [WinError 3] The system cannot find the path specified: '/content/drive/My Drive/Colab Notebooks/MachineLearningPractice/XiamenIntlBank'

Go to this place for original dataset: 

`'/content/drive/My Drive/Colab Notebooks/MachineLearningPractice/FinanceRiskControl/originalDataset'`

In [None]:
## 安装catboost和lightgbm。
## catboost耗费显存极大，但是飞快；lightgbm好像不是那么的方便，gpu使用不起来。
# !pip install catboost

# !pip install featuretools

# Importing libraries and load datasets

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor, CatBoostClassifier
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss, cohen_kappa_score
# import tqdm
from collections import Counter, defaultdict
import tensorflow as tf
import random
import featuretools as ft
import re, os
warnings.filterwarnings('ignore')

# Rudimentary processing

Like encoding some object type data

## Load original dataset

重新设计特征处理的方式。这种方式要分开加载各个子表。

In [2]:
def changeColName(df_origin, df_name):
    '''
    df_origin is data frame
    df_name is string, the name of the dataframe
    '''
    df = df_origin.copy()
    cols = df.columns
    newCols = []
    for col in cols:
        if col == "cust_no":
            newCol = col
        else:
            newCol = "{}_{}".format(df_name, col)
        newCols.append(newCol)
    df.columns = newCols
    return df

def changeColName_id(df_origin, df_name):
    df = df_origin.copy()
    df.rename(columns = {"cust_no": "cust_no-" + df_name}, inplace = True)
    return df

def transformTheDateToYearMonthDay(data, feas):
    ## 将日期列变成三个列：年，月，日
    for fea in feas: 
        print(fea)
        if fea in ["behavior_m3_B6"]: ## 这个列有时分秒，不只有日期。
            data[fea + "_DT"] = pd.to_datetime(data[fea],format = '%Y-%m-%d %H:%M:%S')
        else: ## 这些列，只有日期。
            data[fea + "_DT"] = pd.to_datetime(data[fea],format = '%Y-%m-%d')
        ## 
        data[fea + "_Year"] = featuretools.primitives.Year()(data[fea + "_DT"])
        ## 
        data[fea + "_Month"] = featuretools.primitives.Month()(data[fea + "_DT"])
        ## 
        data[fea + "_Day"] = featuretools.primitives.Day()(data[fea + "_DT"])
        ### 
        data.drop([fea, fea + "_DT"], axis = 1, inplace = True)
        
def transformTheDateToDaysFrom20000101(data, feas):
    ## 将日期转换为距离新千年第一天的距离
    startdate = datetime.datetime.strptime(
        "2000-01-01 00:00:00",
        '%Y-%m-%d %H:%M:%S'
    )
#     (data_train["behavior_m3_B6_DT"] - data_train[fea + "_DT"]).dt.days
    for fea in feas:   
        print(fea)
        data[fea] = data[fea].astype(str)
        if fea in ["behavior_m3_B6"]: ## 这个列有时分秒，不只有日期。
            data[fea] = pd.to_datetime(data[fea],format = '%Y-%m-%d %H:%M:%S')
            data[fea] = data[fea].apply(lambda x: x-startdate).dt.days
        else: ## 这些列，只有日期。
            data[fea] = pd.to_datetime(data[fea],format='%Y-%m-%d')
            data[fea] = data[fea].apply(lambda x: x-startdate).dt.days

In [3]:
## 这些数据，看样子都是数值型的

aum_m1 = changeColName(
    pd.read_csv('originalDataset/x_train/aum_train/aum_m7.csv'), 
    "aum_m1"
)
aum_m2 = changeColName(
    pd.read_csv('originalDataset/x_train/aum_train/aum_m8.csv'), 
    "aum_m2"
)
aum_m3 = changeColName(
    pd.read_csv('originalDataset/x_train/aum_train/aum_m9.csv'), 
    "aum_m3"
)
##########
aum_m = pd.merge(
    aum_m1, aum_m2, 
    how='outer',on='cust_no'
)

aum_m = pd.merge(
    aum_m, aum_m3, 
    how='outer',on='cust_no'
)
################
# aum_m = changeColName_id(aum_m, "aum_m")

In [4]:
## 这些东西，大多数也是数值类型的
## 每个季度的第一个月，第二个月，没有behavior_m(Y)数据，那就删掉这两个列。

behavior_m1 = changeColName(
    pd.read_csv('originalDataset/x_train/behavior_train/behavior_m7.csv'), 
    "behavior_m1"
)
behavior_m2 = changeColName(
    pd.read_csv('originalDataset/x_train/behavior_train/behavior_m8.csv'), 
    "behavior_m2"
)
behavior_m3 = changeColName(
    pd.read_csv('originalDataset/x_train/behavior_train/behavior_m9.csv'), 
    "behavior_m3"
)
##########
behavior_m = pd.merge(
    behavior_m1, behavior_m2, 
    how='outer',on='cust_no'
)

behavior_m = pd.merge(
    behavior_m, behavior_m3, 
    how='outer',on='cust_no'
)
################ ["behavior_m3_B6"]
transformTheDateToDaysFrom20000101(behavior_m3, ["behavior_m3_B6"])

# behavior_m3.head()
##################
# behavior_m = changeColName_id(behavior_m, "behavior_m")

behavior_m3_B6


In [5]:
## 这些东西，大多数也是数值类型的
## 每个季度的第一个月，第二个月，没有behavior_m(Y)数据，那就删掉这两个列。

big_event = changeColName(
    pd.read_csv('originalDataset/x_train/big_event_train/big_event_Q3.csv'),
    "big_event",
)

## 这边删掉了E11这个列。注意。别的表不见得这个列都是空值。
### 时间列：[col for col in big_event.columns if (col not in ["cust_no", "E11"] and "E11" not in col and "E15" not in col and "E17" not in col)]
transformTheDateToDaysFrom20000101(
    big_event, 
    [col for col in big_event.columns if (col not in ["cust_no", "E11"] and "E11" not in col and "E15" not in col and "E17" not in col)]
)
##############
# big_event = changeColName_id(big_event, "big_event")


big_event_E1
big_event_E2
big_event_E3
big_event_E4
big_event_E5
big_event_E6
big_event_E7
big_event_E8
big_event_E9
big_event_E10
big_event_E12
big_event_E13
big_event_E14
big_event_E16
big_event_E18


In [6]:
cunkuan_m1 = changeColName(
    pd.read_csv('originalDataset/x_train/cunkuan_train/cunkuan_m7.csv'), 
    "cunkuan_m1"
)
cunkuan_m2 = changeColName(
    pd.read_csv('originalDataset/x_train/cunkuan_train/cunkuan_m8.csv'), 
    "cunkuan_m2"
)
cunkuan_m3 = changeColName(
    pd.read_csv('originalDataset/x_train/cunkuan_train/cunkuan_m9.csv'), 
    "cunkuan_m3"
)
##########
cunkuan_m = pd.merge(
    cunkuan_m1, cunkuan_m2, 
    how='outer',on='cust_no'
)

cunkuan_m = pd.merge(
    cunkuan_m, cunkuan_m3, 
    how='outer',on='cust_no'
)
############
# cunkuan_m = changeColName_id(cunkuan_m, "cunkuan_m")

`cust_info_qX`, 这个表里面有大量的分类信息，必须要同时结合训练集和测试集一起处理才行。

In [7]:
## cust info, 还是需要结合test测试集。
cust_info_q3 = pd.read_csv('originalDataset/x_train/cust_info_q3.csv')
cust_info_q4 = pd.read_csv('originalDataset/x_train/cust_info_q4.csv')
cust_info_q1 = pd.read_csv('originalDataset/x_test/cust_info_q1.csv')

In [8]:
numerical_fea = list(cust_info_q3.select_dtypes(exclude=['object']).columns)
object_fea = list(filter(lambda x: x not in numerical_fea,list(cust_info_q3.columns)))
object_fea.remove("cust_no")
object_fea.remove("I3")

In [9]:
def mapTheValue(data, fea, dic):
    """
    data_train is the dataset. 
    fea is the target feature. 
    dic is the mapping dictionary. 
    """
    data[fea] = data[fea].apply(lambda x: dic.get(x, -1)) 


## 这两个特征，是暗含了顺序
for dt in [cust_info_q3]: 
    mapTheValue(
        dt, "I3", {
            "普通客户": 0, 
            "黄金": 1,
            "白金": 2,
            "钻石": 3,
        }
    )
    
for col in tqdm.tqdm(object_fea): 
    le = LabelEncoder()
    le.fit(list(cust_info_q3[col].astype(str).values) + list(cust_info_q4[col].astype(str).values) + list(cust_info_q1[col].astype(str).values))
    cust_info_q3[col] = le.transform(list(cust_info_q3[col].astype(str).values))
    
###################
# cust_info_q4 = changeColName_id(cust_info_q4, "cust_info")

100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:11<00:00,  1.63s/it]


# Use Featuretools to merge tables and generate new features

利用featuretools来组合诸多表格。这些表格中，以cust_info为最顶层的数据表。

跑这些东西耗时太长，最好只跑一次，结果存本地就行了。

In [10]:
def generateFeaturesWithFeaturetools(
    entity_id_core, dataframe_core, index_core, 
    etyIds, dfs, dfIndexs, 
):
    '''
    一些教程：https://www.jiqizhixin.com/articles/2018-06-21-2
    ft.dfs()的文档：https://featuretools.alteryx.com/en/stable/generated/featuretools.dfs.html?highlight=dfs
    '''
    
    if not os.path.exists("preprocessedData/featuretools-agg-q3-scheme1"):
        os.makedirs("preprocessedData/featuretools-agg-q3-scheme1")
        
    for etyId, df, dfIndex in zip(etyIds, dfs, dfIndexs):
        ## 创建两个表格。其中第一个表是以cust_info_qx来创建，我们便要以此为据，来为每一个用户创建新特征。
        es = ft.EntitySet(id = 'cust')
        ### 注意这里哦，xxx_q4
        es = es.entity_from_dataframe(entity_id = entity_id_core, dataframe = dataframe_core, 
                                  index = index_core)
        es = es.entity_from_dataframe(entity_id = etyId, dataframe = df, 
                                  make_index = True, ## 如果是第一次跑这个代码框，那就要跑这一行，因为这一行会给dataframe增加一列，所以运行过一次再运行，是会报错的。
                                  index = dfIndex)
        ## 增加表之间的关系
        es = es.add_relationship(
            ft.Relationship(
                es['cust_info']['cust_no'],
                es[etyId]['cust_no']
            )
        )
        ## 最后计算新特征. 注意，这里的agg所有的取值，已经是ft.dfs函数支持的所有了。
        for agg in ["sum", "max", "mode", "min", "count", "num_unique", "mean", "percent_true", "std", "skew", ]: #
            print(f"Doing {etyId} {agg}...")
            features, feature_names = ft.dfs(
                entityset = es, 
                target_entity = 'cust_info', 
                agg_primitives = [agg],
                verbose=1, 
                max_depth = 2, 
            )
            ## 命名的关键信息：etyId, agg
            features = features.reset_index()
            features.to_csv(f"preprocessedData/featuretools-agg-q3-scheme1/{etyId}-{agg}.csv", index=False)
            
generateFeaturesWithFeaturetools(
    "cust_info", cust_info_q3, "cust_no", 
    ["aum_m", "behavior_m", "cunkuan_m", "big_event", ], # 
    [aum_m, behavior_m, cunkuan_m, big_event, ], #
    ["aum_id", "behavior_id", "cunkuan_id", "bigEvent_id", ], # 
)

Doing aum_m sum...
Built 44 features
Elapsed: 00:12 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████
Doing aum_m max...
Built 44 features
Elapsed: 00:13 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████
Doing aum_m mode...
Built 20 features
Elapsed: 00:06 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████
Doing aum_m min...
Built 44 features
Elapsed: 00:13 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████
Doing aum_m count...
Built 21 features
Elapsed: 00:08 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████
Doing aum_m num_unique...
Built 20 features
Elapsed: 00:06 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████
Doing aum_m mean...
Built 44 features
Elapsed: 0

因为`entity_from_dataframe`有`time_index`这个参数，所以之前做的时间类型的转换，可以酌情不用做了。

https://featuretools.alteryx.com/en/stable/generated/featuretools.EntitySet.entity_from_dataframe.html?highlight=entity_from_dataframe

# Combine the features generated by `featuretools`

这部分代码，本notebook不需要用，但是留在这边做备份。

In [102]:
counter = 0
newFeatures = pd.DataFrame()
for etyId in ["aum_m", "behavior_m", "cunkuan_m", "big_event", ]:
    for agg in ["sum", "max", "mode", "min", "count", "num_unique", "mean", "percent_true", "std", "skew", ]: 
        counter += 1
        print(f"preprocessedData/{etyId}-{agg}.csv", counter)
        features = pd.read_csv(f"preprocessedData/{etyId}-{agg}.csv")
        ## 把原始的Ixx列都删掉，然后和原列整合
        features.drop(
            [col for col in features.columns if re.search("^I\d+$", col) != None] + ["cust_no"], 
            axis = 1, 
            inplace=True
        )
        newFeatures = pd.concat([newFeatures, features], axis=1)
print(newFeatures.shape)

preprocessedData/aum_m-sum.csv 1
preprocessedData/aum_m-max.csv 2
preprocessedData/aum_m-mode.csv 3
preprocessedData/aum_m-min.csv 4
preprocessedData/aum_m-count.csv 5
preprocessedData/aum_m-num_unique.csv 6
preprocessedData/aum_m-mean.csv 7
preprocessedData/aum_m-percent_true.csv 8
preprocessedData/aum_m-std.csv 9
preprocessedData/aum_m-skew.csv 10
preprocessedData/behavior_m-sum.csv 11
preprocessedData/behavior_m-max.csv 12
preprocessedData/behavior_m-mode.csv 13
preprocessedData/behavior_m-min.csv 14
preprocessedData/behavior_m-count.csv 15
preprocessedData/behavior_m-num_unique.csv 16
preprocessedData/behavior_m-mean.csv 17
preprocessedData/behavior_m-percent_true.csv 18
preprocessedData/behavior_m-std.csv 19
preprocessedData/behavior_m-skew.csv 20
preprocessedData/cunkuan_m-sum.csv 21
preprocessedData/cunkuan_m-max.csv 22
preprocessedData/cunkuan_m-mode.csv 23
preprocessedData/cunkuan_m-min.csv 24
preprocessedData/cunkuan_m-count.csv 25
preprocessedData/cunkuan_m-num_unique.csv 26