In [476]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import TfidfVectorizer

import seaborn as sns
import matplotlib.pyplot as plt

In [477]:
train = pd.read_csv('data_train.csv')
test = pd.read_csv('data_test.csv')

In [478]:
train

Unnamed: 0,id,time,customer_id,age_group,gender,zipcode_customer,merchant_id,zipcode_merchant,type,amount,fraud
0,0,18,'C1734879586','1','F','28007','M348934600','28007','es_transportation',24.03,0
1,1,125,'C1896147467','2','F','28007','M1823072687','28007','es_transportation',41.08,0
2,2,7,'C1590346257','1','F','28007','M348934600','28007','es_transportation',37.59,0
3,3,66,'C16891369','3','M','28007','M348934600','28007','es_transportation',51.59,0
4,4,140,'C635222317','3','F','28007','M1823072687','28007','es_transportation',20.17,0
...,...,...,...,...,...,...,...,...,...,...,...
296503,296503,25,'C1200336063','2','M','28007','M1823072687','28007','es_transportation',39.51,0
296504,296504,164,'C1632035340','6','M','28007','M348934600','28007','es_transportation',5.44,0
296505,296505,59,'C521436442','3','F','28007','M348934600','28007','es_transportation',19.65,0
296506,296506,96,'C620542168','2','M','28007','M348934600','28007','es_transportation',31.03,0


In [479]:
test

Unnamed: 0,id,time,customer_id,age_group,gender,zipcode_customer,merchant_id,zipcode_merchant,type,amount
0,0,168,'C969635352','2','F','28007','M1823072687','28007','es_transportation',16.11
1,1,60,'C1067668919','2','F','28007','M348934600','28007','es_transportation',9.80
2,2,170,'C1347371891','2','M','28007','M1823072687','28007','es_transportation',30.81
3,3,124,'C1100053869','2','M','28007','M1823072687','28007','es_transportation',54.82
4,4,159,'C1110919755','1','F','28007','M348934600','28007','es_transportation',41.02
...,...,...,...,...,...,...,...,...,...,...
298130,298130,43,'C572709519','4','F','28007','M1823072687','28007','es_transportation',45.80
298131,298131,30,'C1740186506','3','M','28007','M348934600','28007','es_transportation',12.62
298132,298132,25,'C1594422820','4','F','28007','M348934600','28007','es_transportation',22.91
298133,298133,7,'C918296398','2','F','28007','M348934600','28007','es_transportation',19.54


In [480]:
#无缺失值
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 296508 entries, 0 to 296507
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                296508 non-null  int64  
 1   time              296508 non-null  int64  
 2   customer_id       296508 non-null  object 
 3   age_group         296508 non-null  object 
 4   gender            296508 non-null  object 
 5   zipcode_customer  296508 non-null  object 
 6   merchant_id       296508 non-null  object 
 7   zipcode_merchant  296508 non-null  object 
 8   type              296508 non-null  object 
 9   amount            296508 non-null  float64
 10  fraud             296508 non-null  int64  
dtypes: float64(1), int64(3), object(7)
memory usage: 24.9+ MB


In [481]:
#可去掉‘zipcode_customer’和‘zipcode_merchant’特征
features = train.columns
for feature in features:
    print(feature, train[feature].nunique())

id 296508
time 180
customer_id 2056
age_group 8
gender 4
zipcode_customer 1
merchant_id 50
zipcode_merchant 1
type 15
amount 17683
fraud 2


### 一、初步特征工程
原始特征处理以及增加基于原始特征的统计特征

In [482]:
#1.去掉‘zipcode_customer’和‘zipcode_merchant’特征
train.drop(['zipcode_customer','zipcode_merchant'], axis=1, inplace=True)
test.drop(['zipcode_customer','zipcode_merchant'], axis=1, inplace=True)

In [483]:
features = test.columns   
for feature in features:
    print(feature, test[feature].nunique())

id 298135
time 180
customer_id 2056
age_group 8
gender 4
merchant_id 50
type 15
amount 18084


In [484]:
df_all = pd.concat([train, test])

In [485]:
#训练集的customer_id数量 + 测试集的customer_id数量 = 全集customer_id数量
#在训练集和验证集上，customer_id没有重复值
features = df_all.columns   
for feature in features:
    print(feature, df_all[feature].nunique())

id 298135
time 180
customer_id 4112
age_group 8
gender 4
merchant_id 50
type 15
amount 23772
fraud 2


探寻customer_id和merchant_id之间的关系

In [486]:
#同一个customer_id，可能不会发生欺诈行为，也可能会发生欺诈行为
#2.预测为每笔交易的欺诈行为，考虑把train和test进行合并，获得customer_id统一变换
train.groupby(['customer_id','fraud']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id,time,age_group,gender,merchant_id,type,amount
customer_id,fraud,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
'C1000148617',0,130,130,130,130,130,130,130
'C1000148617',1,1,1,1,1,1,1,1
'C1004109477',0,162,162,162,162,162,162,162
'C1005126300',0,178,178,178,178,178,178,178
'C1005495267',0,177,177,177,177,177,177,177
...,...,...,...,...,...,...,...,...
'C99729647',0,175,175,175,175,175,175,175
'C998690782',0,169,169,169,169,169,169,169
'C998690782',1,2,2,2,2,2,2,2
'C999723254',0,121,121,121,121,121,121,121


In [487]:
#3.把object特征转化为数值特征
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_all['customer_id2'] = le.fit_transform(df_all['customer_id'])
df_all['merchant_id2'] = le.fit_transform(df_all['merchant_id'])
df_all['gender'] = le.fit_transform(df_all['gender'])
df_all['type'] = le.fit_transform(df_all['type'])

In [488]:
df_all['age_group'].value_counts()

'2'    187310
'3'    147131
'4'    109025
'5'     62642
'1'     58131
'6'     26774
'0'      2452
'U'      1178
Name: age_group, dtype: int64

In [489]:
df_all['age_group'].value_counts().index.tolist()

["'2'", "'3'", "'4'", "'5'", "'1'", "'6'", "'0'", "'U'"]

In [490]:
#4.把object特征转化为数值特征
dict1 = {"'2'":2, "'3'":3, "'4'":4, "'5'":5, "'1'":1, "'6'":6, "'0'":0, "'U'":7}
df_all['age_group'] = df_all['age_group'].map(dict1)

In [491]:
#通过后面模型验证和feature_importance，取_count特征足以，另外merchant_id_count无特别大的作用；
#5.只取类别特征gender, type, age_group的_count特征，无需取_mean和_std相关特征；
cat_features = ['age_group', 'gender', 'type'] 
for col in cat_features:
    temp = df_all.groupby(col, as_index=False)[col].agg({col+'_count': 'count'}) #,col+'_mean':'mean',col+'_std':'std'})
    df_all = pd.merge(df_all, temp, on=col, how='left')

In [492]:
#从后面模型验证可看到，merchant_id和customer_id在模型中，起到非常重要作用，构建此两个特征的交叉统计特征
#6.交叉统计特征：mer_cust_count','mer_cust_mean','mer_cust_std'
temp1 = df_all.groupby('merchant_id2', as_index=False)['customer_id2'].agg({'mer_cust_count': 'count','mer_cust_mean':'mean','mer_cust_std':'std'})
df_all = pd.merge(df_all, temp1, on='merchant_id2', how='left')

In [493]:
#df_all.drop('merchant_id',axis=1, inplace=True)
#df_all.drop('customer_id',axis=1, inplace=True)
df_all

Unnamed: 0,id,time,customer_id,age_group,gender,merchant_id,type,amount,fraud,customer_id2,merchant_id2,age_group_count,gender_count,type_count,mer_cust_count,mer_cust_mean,mer_cust_std
0,0,18,'C1734879586',1,1,'M348934600',12,24.03,0.0,1603,30,58131,324565,505119,205426,2067.895719,1191.492350
1,1,125,'C1896147467',2,1,'M1823072687',12,41.08,0.0,1936,18,187310,324565,505119,299693,2021.446050,1180.380772
2,2,7,'C1590346257',1,1,'M348934600',12,37.59,0.0,1283,30,58131,324565,505119,205426,2067.895719,1191.492350
3,3,66,'C16891369',3,2,'M348934600',12,51.59,0.0,1491,30,147131,268385,505119,205426,2067.895719,1191.492350
4,4,140,'C635222317',3,1,'M1823072687',12,20.17,0.0,3337,18,147131,324565,505119,299693,2021.446050,1180.380772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
594638,298130,43,'C572709519',4,1,'M1823072687',12,45.80,,3194,18,109025,324565,505119,299693,2021.446050,1180.380772
594639,298131,30,'C1740186506',3,2,'M348934600',12,12.62,,1617,30,147131,268385,505119,205426,2067.895719,1191.492350
594640,298132,25,'C1594422820',4,1,'M348934600',12,22.91,,1293,30,109025,324565,505119,205426,2067.895719,1191.492350
594641,298133,7,'C918296398',2,1,'M348934600',12,19.54,,3949,30,187310,324565,505119,205426,2067.895719,1191.492350


In [None]:
#df_train = df_all[df_all['fraud'].notnull()]
#df_test = df_all[df_all['fraud'].isnull()]
#df_train

### 二、特征工程进阶
构建基于customer_id和merchant_id的tfidf特征

In [None]:
#各类_count特征间相关性比较高
colormap = plt.cm.RdBu
plt.figure(figsize=(15,15))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(df_train.drop('id', axis=1).astype(float).corr(),linewidths=0.1,vmax=1.0, 
            square=True, cmap=colormap, linecolor='white', annot=True)

In [654]:
#1.基于customer_id,把merchant_id连接起来
def get_merchant(df):
    df_file = df.groupby('customer_id')
    file_merch = {}
    for file_id,file_group in df_file:
        result = file_group.sort_values(['id'], ascending=True)
        merch_seq = ' '.join(result['merchant_id'])
        file_merch[file_id] = merch_seq 
    return file_merch

In [655]:
data1 = get_merchant(df_all)
#data1

In [656]:
#2.获得有customer_id和基于customer_id的merchant_id连接起来的字符串dataframe
df_merch = pd.DataFrame.from_dict(data1, orient='index' , columns=['merchant_seq'])
df_merch = df_merch.reset_index().rename(columns={'index':'customer_id'})
df_merch

Unnamed: 0,customer_id,merchant_seq
0,'C1000148617','M1823072687' 'M348934600' 'M1823072687' 'M348...
1,'C100045114','M348934600' 'M1823072687' 'M85975013' 'M18230...
2,'C1000699316','M1823072687' 'M85975013' 'M1823072687' 'M3489...
3,'C1001065306','M840466850' 'M2122776122' 'M692898500' 'M1737...
4,'C1002658784','M348934600' 'M348934600' 'M1823072687' 'M1823...
...,...,...
4107,'C99729647','M1823072687' 'M348934600' 'M348934600' 'M3489...
4108,'C998690782','M348934600' 'M1823072687' 'M348934600' 'M3489...
4109,'C998987490','M1823072687' 'M1823072687' 'M1053599405' 'M18...
4110,'C999393223','M348934600' 'M348934600' 'M348934600' 'M34893...


In [497]:
#3.使用1-3元tfidf语法
vec = TfidfVectorizer(ngram_range=(1,3),min_df=0.1, max_df=0.8)

In [498]:
merch_features = vec.fit_transform(df_merch['merchant_seq'])
merch_features

<4112x170 sparse matrix of type '<class 'numpy.float64'>'
	with 191586 stored elements in Compressed Sparse Row format>

In [499]:
#4.获得基于customer_id的merchant_id的tdidf特征dataframe
df_merchs = pd.DataFrame(merch_features.toarray(), columns = vec.get_feature_names())
df_merchs

Unnamed: 0,m1053599405,m1053599405 m1823072687,m1053599405 m1823072687 m1823072687,m1053599405 m1823072687 m348934600,m1053599405 m348934600,m1053599405 m348934600 m1823072687,m1053599405 m348934600 m348934600,m1198415165,m1198415165 m348934600,m1313686961,...,m85975013 m1823072687 m1823072687,m85975013 m1823072687 m348934600,m85975013 m1823072687 m85975013,m85975013 m348934600,m85975013 m348934600 m1823072687,m85975013 m348934600 m348934600,m85975013 m85975013,m85975013 m85975013 m1823072687,m97925176,m980657600
0,0.024141,0.028986,0.000000,0.049456,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.051450,0.000000,0.047545,0.025170,0.000000,0.036221,0.041815,0.0,0.000000,0.000000
1,0.174191,0.052288,0.000000,0.000000,0.069550,0.000000,0.097371,0.304698,0.298962,0.000000,...,0.000000,0.000000,0.000000,0.090808,0.055713,0.065340,0.000000,0.0,0.207436,0.000000
2,0.057151,0.000000,0.000000,0.000000,0.091276,0.116103,0.000000,0.099970,0.130784,0.000000,...,0.121804,0.072901,0.000000,0.178763,0.146233,0.000000,0.000000,0.0,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.174640,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.288023
4,0.065501,0.000000,0.000000,0.000000,0.052306,0.000000,0.073229,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.102440,0.083799,0.000000,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4107,0.045528,0.027333,0.031966,0.000000,0.036356,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.048516,0.029037,0.000000,0.071204,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
4108,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.026812,0.000000,0.021916,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
4109,0.085039,0.085089,0.099513,0.000000,0.022636,0.000000,0.000000,0.000000,0.000000,0.032391,...,0.135930,0.000000,0.027914,0.014777,0.000000,0.021266,0.000000,0.0,0.000000,0.000000
4110,0.047541,0.028541,0.000000,0.048698,0.037964,0.000000,0.053150,0.083160,0.108793,0.000000,...,0.000000,0.000000,0.000000,0.123920,0.030411,0.106998,0.000000,0.0,0.113230,0.000000


In [505]:
#5.把customer_id加进来，用于后面与df_all列表连接
df_merch1 = df_merch.merge(df_merchs, left_index=True, right_index=True)
df_merch1

Unnamed: 0,customer_id,merchant_seq,m1053599405,m1053599405 m1823072687,m1053599405 m1823072687 m1823072687,m1053599405 m1823072687 m348934600,m1053599405 m348934600,m1053599405 m348934600 m1823072687,m1053599405 m348934600 m348934600,m1198415165,...,m85975013 m1823072687 m1823072687,m85975013 m1823072687 m348934600,m85975013 m1823072687 m85975013,m85975013 m348934600,m85975013 m348934600 m1823072687,m85975013 m348934600 m348934600,m85975013 m85975013,m85975013 m85975013 m1823072687,m97925176,m980657600
0,'C1000148617','M1823072687' 'M348934600' 'M1823072687' 'M348...,0.024141,0.028986,0.000000,0.049456,0.000000,0.000000,0.000000,0.000000,...,0.051450,0.000000,0.047545,0.025170,0.000000,0.036221,0.041815,0.0,0.000000,0.000000
1,'C100045114','M348934600' 'M1823072687' 'M85975013' 'M18230...,0.174191,0.052288,0.000000,0.000000,0.069550,0.000000,0.097371,0.304698,...,0.000000,0.000000,0.000000,0.090808,0.055713,0.065340,0.000000,0.0,0.207436,0.000000
2,'C1000699316','M1823072687' 'M85975013' 'M1823072687' 'M3489...,0.057151,0.000000,0.000000,0.000000,0.091276,0.116103,0.000000,0.099970,...,0.121804,0.072901,0.000000,0.178763,0.146233,0.000000,0.000000,0.0,0.000000,0.000000
3,'C1001065306','M840466850' 'M2122776122' 'M692898500' 'M1737...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.174640,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.288023
4,'C1002658784','M348934600' 'M348934600' 'M1823072687' 'M1823...,0.065501,0.000000,0.000000,0.000000,0.052306,0.000000,0.073229,0.000000,...,0.000000,0.000000,0.000000,0.102440,0.083799,0.000000,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4107,'C99729647','M1823072687' 'M348934600' 'M348934600' 'M3489...,0.045528,0.027333,0.031966,0.000000,0.036356,0.000000,0.000000,0.000000,...,0.048516,0.029037,0.000000,0.071204,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
4108,'C998690782','M348934600' 'M1823072687' 'M348934600' 'M3489...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.026812,0.000000,0.021916,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
4109,'C998987490','M1823072687' 'M1823072687' 'M1053599405' 'M18...,0.085039,0.085089,0.099513,0.000000,0.022636,0.000000,0.000000,0.000000,...,0.135930,0.000000,0.027914,0.014777,0.000000,0.021266,0.000000,0.0,0.000000,0.000000
4110,'C999393223','M348934600' 'M348934600' 'M348934600' 'M34893...,0.047541,0.028541,0.000000,0.048698,0.037964,0.000000,0.053150,0.083160,...,0.000000,0.000000,0.000000,0.123920,0.030411,0.106998,0.000000,0.0,0.113230,0.000000


In [506]:
#6.df_merch1与df_all左联
df2_all = df_all.merge(df_merch1, on='customer_id', how='left')
df2_all

Unnamed: 0,id,time,customer_id,age_group,gender,merchant_id,type,amount,fraud,customer_id2,...,m85975013 m1823072687 m1823072687,m85975013 m1823072687 m348934600,m85975013 m1823072687 m85975013,m85975013 m348934600,m85975013 m348934600 m1823072687,m85975013 m348934600 m348934600,m85975013 m85975013,m85975013 m85975013 m1823072687,m97925176,m980657600
0,0,18,'C1734879586',1,1,'M348934600',12,24.03,0.0,1603,...,0.179368,0.061345,0.000000,0.100284,0.092289,0.036079,0.041651,0.047172,0.000000,0.0
1,1,125,'C1896147467',2,1,'M1823072687',12,41.08,0.0,1936,...,0.071062,0.042532,0.000000,0.104293,0.042657,0.050028,0.000000,0.000000,0.079413,0.0
2,2,7,'C1590346257',1,1,'M348934600',12,37.59,0.0,1283,...,0.183121,0.039855,0.030768,0.081441,0.059959,0.046879,0.027060,0.030647,0.000000,0.0
3,3,66,'C16891369',3,2,'M348934600',12,51.59,0.0,1491,...,0.028522,0.000000,0.000000,0.111625,0.102726,0.000000,0.000000,0.000000,0.000000,0.0
4,4,140,'C635222317',3,1,'M1823072687',12,20.17,0.0,3337,...,0.052175,0.062455,0.000000,0.068066,0.020880,0.048976,0.056540,0.032017,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
594638,298130,43,'C572709519',4,1,'M1823072687',12,45.80,,3194,...,0.054851,0.000000,0.033792,0.000000,0.000000,0.000000,0.029720,0.033659,0.000000,0.0
594639,298131,30,'C1740186506',3,2,'M348934600',12,12.62,,1617,...,0.084959,0.000000,0.000000,0.138542,0.169997,0.000000,0.046033,0.000000,0.126590,0.0
594640,298132,25,'C1594422820',4,1,'M348934600',12,22.91,,1293,...,0.078036,0.062274,0.000000,0.025451,0.031229,0.000000,0.000000,0.000000,0.000000,0.0
594641,298133,7,'C918296398',2,1,'M348934600',12,19.54,,3949,...,0.027888,0.066766,0.000000,0.054573,0.000000,0.078534,0.000000,0.000000,0.000000,0.0


In [507]:
#7.因已有'customer_id2','merchant_id2'数值特征，删除'customer_id','merchant_id'字符特征，删除‘merchant_seq’特征
df2_all.drop(['customer_id','merchant_id','merchant_seq'],axis=1,inplace=True)

In [508]:
#8.分开df_all，得到训练集和验证集
df_train = df2_all[df2_all['fraud'].notnull()]
df_test = df2_all[df2_all['fraud'].isnull()]
df_train
#df_test

Unnamed: 0,id,time,age_group,gender,type,amount,fraud,customer_id2,merchant_id2,age_group_count,...,m85975013 m1823072687 m1823072687,m85975013 m1823072687 m348934600,m85975013 m1823072687 m85975013,m85975013 m348934600,m85975013 m348934600 m1823072687,m85975013 m348934600 m348934600,m85975013 m85975013,m85975013 m85975013 m1823072687,m97925176,m980657600
0,0,18,1,1,12,24.03,0.0,1603,30,58131,...,0.179368,0.061345,0.000000,0.100284,0.092289,0.036079,0.041651,0.047172,0.000000,0.000000
1,1,125,2,1,12,41.08,0.0,1936,18,187310,...,0.071062,0.042532,0.000000,0.104293,0.042657,0.050028,0.000000,0.000000,0.079413,0.000000
2,2,7,1,1,12,37.59,0.0,1283,30,58131,...,0.183121,0.039855,0.030768,0.081441,0.059959,0.046879,0.027060,0.030647,0.000000,0.000000
3,3,66,3,2,12,51.59,0.0,1491,30,147131,...,0.028522,0.000000,0.000000,0.111625,0.102726,0.000000,0.000000,0.000000,0.000000,0.000000
4,4,140,3,1,12,20.17,0.0,3337,18,147131,...,0.052175,0.062455,0.000000,0.068066,0.020880,0.048976,0.056540,0.032017,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296503,296503,25,2,2,12,39.51,0.0,447,18,187310,...,0.085961,0.000000,0.022696,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
296504,296504,164,6,2,12,5.44,0.0,1366,30,26774,...,0.043339,0.000000,0.000000,0.169616,0.104063,0.061022,0.000000,0.000000,0.000000,0.000000
296505,296505,59,3,1,12,19.65,0.0,3102,30,147131,...,0.061605,0.036872,0.000000,0.060276,0.073961,0.000000,0.000000,0.000000,0.000000,0.000000
296506,296506,96,2,2,12,31.03,0.0,3312,30,187310,...,0.000000,0.000000,0.000000,0.011441,0.000000,0.000000,0.000000,0.000000,0.052269,0.000000


### 三、其他特征工程优化
创建基于customer_id的统计特征

In [739]:
df_merch_n = df_merch.copy()
#df_merch_n

In [740]:
#1.构造新的特征，基于customer_id的聚合统计
def get_features(df):
    #按照customer_id分组，提取统计特征
    df_file = df.groupby('customer_id')
        
    #提取多个特征，统计特征api, tid, index
    features = ['time', 'age_group', 'gender','type','amount','merchant_id2']
    #针对file_id构造不同的特征，一个file_id只有一行数据
    for f in features:
        df_merch_n[f+'_count_cu'] = df_file[f].count().values
        df_merch_n[f+'_nunique'] = df_file[f].nunique().values
        df_merch_n[f+'_max'] = df_file[f].max().values
        df_merch_n[f+'_min'] = df_file[f].min().values
        df_merch_n[f+'_mean'] = df_file[f].mean().values
        df_merch_n[f+'_std'] = df_file[f].std().values
        df_merch_n[f+'_ptp'] = df_file[f].max().values - df_file[f].min().values
    return df_merch_n

In [741]:
df_merch3 = get_features(df_all)
df_merch3

Unnamed: 0,customer_id,merchant_seq,time_count_cu,time_nunique,time_max,time_min,time_mean,time_std,time_ptp,age_group_count_cu,...,amount_mean,amount_std,amount_ptp,merchant_id2_count_cu,merchant_id2_nunique,merchant_id2_max,merchant_id2_min,merchant_id2_mean,merchant_id2_std,merchant_id2_ptp
0,'C1000148617','M1823072687' 'M348934600' 'M1823072687' 'M348...,131,128,179,30,107.786260,40.826652,149,131,...,35.091908,36.863536,322.84,131,14,45,0,21.595420,7.547571,45
1,'C100045114','M348934600' 'M1823072687' 'M85975013' 'M18230...,109,91,179,0,124.532110,43.704938,179,109,...,90.336239,381.812802,3901.56,109,22,48,0,24.807339,11.076733,48
2,'C1000699316','M1823072687' 'M85975013' 'M1823072687' 'M3489...,94,93,173,0,51.436170,35.752001,173,94,...,38.154894,41.544414,325.35,94,10,45,0,23.255319,8.869346,45
3,'C1001065306','M840466850' 'M2122776122' 'M692898500' 'M1737...,30,21,155,21,91.833333,43.885496,134,30,...,204.754667,306.240251,1259.89,30,11,49,2,30.733333,13.434216,47
4,'C1002658784','M348934600' 'M348934600' 'M1823072687' 'M1823...,131,128,179,22,107.702290,42.194366,157,131,...,34.253282,27.820062,196.71,131,17,45,0,26.007634,8.434814,45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4107,'C99729647','M1823072687' 'M348934600' 'M348934600' 'M3489...,175,169,178,0,89.891429,52.325514,178,175,...,32.555771,30.641718,241.41,175,14,45,0,22.560000,8.082591,45
4108,'C998690782','M348934600' 'M1823072687' 'M348934600' 'M3489...,171,168,179,0,89.005848,52.749073,179,171,...,61.211637,421.233339,5526.96,171,11,45,9,25.959064,7.212209,36
4109,'C998987490','M1823072687' 'M1823072687' 'M1053599405' 'M18...,177,170,179,0,90.107345,53.182520,179,177,...,32.273107,30.122381,219.45,177,8,45,0,20.514124,8.598432,45
4110,'C999393223','M348934600' 'M348934600' 'M348934600' 'M34893...,142,130,179,6,104.992958,45.159056,173,142,...,36.018944,48.619220,476.31,142,16,48,0,28.183099,8.717489,48


In [703]:
features = df_merch3.columns
for feature in features:
    if df_merch3[feature].nunique()==1:
        print(feature, df_merch3[feature].nunique())

age_group_nunique 1
age_group_std 1
age_group_ptp 1
gender_nunique 1
gender_std 1
gender_ptp 1


In [742]:
#2.移除一些只有一个值的特征
remov= ['age_group_nunique', 'age_group_std','age_group_ptp','gender_nunique', 'gender_std', 'gender_ptp','merchant_seq']
df_merch3.drop(remov, axis=1, inplace=True)

In [743]:
#3.df_merch3与df_all左联，获得基于customer_id统计特征的全数据dataframe
df3_all = df_all.merge(df_merch3, on='customer_id', how='left')
#df3_all

In [706]:
#4.获得基于customer_id的统计特征和tdidf特征
df_merch4 = df_merch3.merge(df_merch1, on='customer_id', how='left')
df_merch4

Unnamed: 0,customer_id,time_count_cu,time_nunique,time_max,time_min,time_mean,time_std,time_ptp,age_group_count_cu,age_group_max,...,m85975013 m1823072687 m348934600,m85975013 m1823072687 m85975013,m85975013 m348934600,m85975013 m348934600 m1823072687,m85975013 m348934600 m348934600,m85975013 m85975013,m85975013 m85975013 m1823072687,m97925176,m980657600,merchant_seq_len
0,'C1000148617',131,128,179,30,107.786260,40.826652,149,131,5,...,0.000000,0.047545,0.025170,0.000000,0.036221,0.041815,0.0,0.000000,0.000000,1795
1,'C100045114',109,91,179,0,124.532110,43.704938,179,109,4,...,0.000000,0.000000,0.090808,0.055713,0.065340,0.000000,0.0,0.207436,0.000000,1456
2,'C1000699316',94,93,173,0,51.436170,35.752001,173,94,4,...,0.072901,0.000000,0.178763,0.146233,0.000000,0.000000,0.0,0.000000,0.000000,1273
3,'C1001065306',30,21,155,21,91.833333,43.885496,134,30,1,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.288023,388
4,'C1002658784',131,128,179,22,107.702290,42.194366,157,131,3,...,0.000000,0.000000,0.102440,0.083799,0.000000,0.000000,0.0,0.000000,0.000000,1742
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4107,'C99729647',175,169,178,0,89.891429,52.325514,178,175,3,...,0.029037,0.000000,0.071204,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,2382
4108,'C998690782',171,168,179,0,89.005848,52.749073,179,171,2,...,0.026812,0.000000,0.021916,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,2277
4109,'C998987490',177,170,179,0,90.107345,53.182520,179,177,2,...,0.000000,0.027914,0.014777,0.000000,0.021266,0.000000,0.0,0.000000,0.000000,2435
4110,'C999393223',142,130,179,6,104.992958,45.159056,173,142,1,...,0.000000,0.000000,0.123920,0.030411,0.106998,0.000000,0.0,0.113230,0.000000,1865


In [707]:
#5.df_merch3与df_all左联，获得基于customer_id统计特征的全数据dataframe
df4_all = df_all.merge(df_merch4, on='customer_id', how='left')
df4_all

Unnamed: 0,id,time,customer_id,age_group,gender,merchant_id,type,amount,fraud,customer_id2,...,m85975013 m1823072687 m348934600,m85975013 m1823072687 m85975013,m85975013 m348934600,m85975013 m348934600 m1823072687,m85975013 m348934600 m348934600,m85975013 m85975013,m85975013 m85975013 m1823072687,m97925176,m980657600,merchant_seq_len
0,0,18,'C1734879586',1,1,'M348934600',12,24.03,0.0,1603,...,0.061345,0.000000,0.100284,0.092289,0.036079,0.041651,0.047172,0.000000,0.0,2409
1,1,125,'C1896147467',2,1,'M1823072687',12,41.08,0.0,1936,...,0.042532,0.000000,0.104293,0.042657,0.050028,0.000000,0.000000,0.079413,0.0,2140
2,2,7,'C1590346257',1,1,'M348934600',12,37.59,0.0,1283,...,0.039855,0.030768,0.081441,0.059959,0.046879,0.027060,0.030647,0.000000,0.0,2298
3,3,66,'C16891369',3,2,'M348934600',12,51.59,0.0,1491,...,0.000000,0.000000,0.111625,0.102726,0.000000,0.000000,0.000000,0.000000,0.0,2274
4,4,140,'C635222317',3,1,'M1823072687',12,20.17,0.0,3337,...,0.062455,0.000000,0.068066,0.020880,0.048976,0.056540,0.032017,0.000000,0.0,2396
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
594638,298130,43,'C572709519',4,1,'M1823072687',12,45.80,,3194,...,0.000000,0.033792,0.000000,0.000000,0.000000,0.029720,0.033659,0.000000,0.0,1604
594639,298131,30,'C1740186506',3,2,'M348934600',12,12.62,,1617,...,0.000000,0.000000,0.138542,0.169997,0.000000,0.046033,0.000000,0.126590,0.0,2366
594640,298132,25,'C1594422820',4,1,'M348934600',12,22.91,,1293,...,0.062274,0.000000,0.025451,0.031229,0.000000,0.000000,0.000000,0.000000,0.0,2422
594641,298133,7,'C918296398',2,1,'M348934600',12,19.54,,3949,...,0.066766,0.000000,0.054573,0.000000,0.078534,0.000000,0.000000,0.000000,0.0,2394


In [746]:
#6.因已有'customer_id2','merchant_id2'数值特征，删除'customer_id','merchant_id'字符特征,删除'merchant_seq‘
df3_all.drop(['customer_id','merchant_id'],axis=1,inplace=True)

In [748]:
#7.分开df_all，得到训练集和验证集
df_train = df3_all[df3_all['fraud'].notnull()]
df_test = df3_all[df3_all['fraud'].isnull()]
df_train
#df_test

Unnamed: 0,id,time,age_group,gender,type,amount,fraud,customer_id2,merchant_id2,age_group_count,...,amount_mean,amount_std,amount_ptp,merchant_id2_count_cu,merchant_id2_nunique,merchant_id2_max,merchant_id2_min,merchant_id2_mean,merchant_id2_std,merchant_id2_ptp
0,0,18,1,1,12,24.03,0.0,1603,30,58131,...,30.676011,34.283355,307.36,178,11,45,0,23.511236,9.245087,45
1,1,125,2,1,12,41.08,0.0,1936,18,187310,...,33.788437,26.224662,171.21,160,15,48,9,24.931250,8.699109,39
2,2,7,1,1,12,37.59,0.0,1283,30,58131,...,29.398639,20.849127,131.48,169,8,45,0,23.017751,10.025844,45
3,3,66,3,2,12,51.59,0.0,1491,30,147131,...,25.027083,19.072839,89.98,168,8,45,0,23.738095,7.543279,45
4,4,140,3,1,12,20.17,0.0,3337,18,147131,...,36.163314,38.324145,310.59,175,7,45,0,21.525714,8.525025,45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296503,296503,25,2,2,12,39.51,0.0,447,18,187310,...,32.311893,32.836659,230.51,169,11,45,0,20.514793,7.294899,45
296504,296504,164,6,2,12,5.44,0.0,1366,30,26774,...,41.844435,69.653753,663.17,124,16,45,2,25.846774,8.542619,43
296505,296505,59,3,1,12,19.65,0.0,3102,30,147131,...,27.386786,25.401472,164.89,168,8,45,0,23.785714,7.961090,45
296506,296506,96,2,2,12,31.03,0.0,3312,30,187310,...,38.949766,112.389550,1476.96,171,10,48,9,28.894737,5.910163,39


In [710]:
#df_test

### 四、获得customer_id下merchant_id的embedding（Glove）

In [629]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow import keras 
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(lower=False)
tokenizer

<keras_preprocessing.text.Tokenizer at 0x15c49cd30>

In [509]:
df_merch1

Unnamed: 0,customer_id,merchant_seq,m1053599405,m1053599405 m1823072687,m1053599405 m1823072687 m1823072687,m1053599405 m1823072687 m348934600,m1053599405 m348934600,m1053599405 m348934600 m1823072687,m1053599405 m348934600 m348934600,m1198415165,...,m85975013 m1823072687 m1823072687,m85975013 m1823072687 m348934600,m85975013 m1823072687 m85975013,m85975013 m348934600,m85975013 m348934600 m1823072687,m85975013 m348934600 m348934600,m85975013 m85975013,m85975013 m85975013 m1823072687,m97925176,m980657600
0,'C1000148617','M1823072687' 'M348934600' 'M1823072687' 'M348...,0.024141,0.028986,0.000000,0.049456,0.000000,0.000000,0.000000,0.000000,...,0.051450,0.000000,0.047545,0.025170,0.000000,0.036221,0.041815,0.0,0.000000,0.000000
1,'C100045114','M348934600' 'M1823072687' 'M85975013' 'M18230...,0.174191,0.052288,0.000000,0.000000,0.069550,0.000000,0.097371,0.304698,...,0.000000,0.000000,0.000000,0.090808,0.055713,0.065340,0.000000,0.0,0.207436,0.000000
2,'C1000699316','M1823072687' 'M85975013' 'M1823072687' 'M3489...,0.057151,0.000000,0.000000,0.000000,0.091276,0.116103,0.000000,0.099970,...,0.121804,0.072901,0.000000,0.178763,0.146233,0.000000,0.000000,0.0,0.000000,0.000000
3,'C1001065306','M840466850' 'M2122776122' 'M692898500' 'M1737...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.174640,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.288023
4,'C1002658784','M348934600' 'M348934600' 'M1823072687' 'M1823...,0.065501,0.000000,0.000000,0.000000,0.052306,0.000000,0.073229,0.000000,...,0.000000,0.000000,0.000000,0.102440,0.083799,0.000000,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4107,'C99729647','M1823072687' 'M348934600' 'M348934600' 'M3489...,0.045528,0.027333,0.031966,0.000000,0.036356,0.000000,0.000000,0.000000,...,0.048516,0.029037,0.000000,0.071204,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
4108,'C998690782','M348934600' 'M1823072687' 'M348934600' 'M3489...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.026812,0.000000,0.021916,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
4109,'C998987490','M1823072687' 'M1823072687' 'M1053599405' 'M18...,0.085039,0.085089,0.099513,0.000000,0.022636,0.000000,0.000000,0.000000,...,0.135930,0.000000,0.027914,0.014777,0.000000,0.021266,0.000000,0.0,0.000000,0.000000
4110,'C999393223','M348934600' 'M348934600' 'M348934600' 'M34893...,0.047541,0.028541,0.000000,0.048698,0.037964,0.000000,0.053150,0.083160,...,0.000000,0.000000,0.000000,0.123920,0.030411,0.106998,0.000000,0.0,0.113230,0.000000


In [510]:
df_merch1['merchant_seq_len'] = df_merch1['merchant_seq'].map(lambda x: len(x))
df_merch1['merchant_seq_len'].value_counts().sort_index(ascending=False)

3542    1
3282    1
3193    1
2776    1
2679    1
       ..
89      1
80      1
77      1
67      1
66      2
Name: merchant_seq_len, Length: 1448, dtype: int64

In [511]:
df_merch1['merchant_seq_len'].describe()

count    4112.000000
mean     1951.534047
std       586.890996
min        66.000000
25%      1736.750000
50%      2210.000000
75%      2348.250000
max      3542.000000
Name: merchant_seq_len, dtype: float64

In [512]:
max_len = 3000

In [513]:
df_merch1[['merchant_seq']].to_csv('text8.txt', header=None, index=False, sep=" ")

In [592]:
tokenizer.fit_on_texts(df_merch1['merchant_seq'].tolist())

In [593]:
vocab = tokenizer.word_index
print(len(vocab))
vocab

50


{"'M1823072687'": 1,
 "'M348934600'": 2,
 "'M85975013'": 3,
 "'M1053599405'": 4,
 "'M151143676'": 5,
 "'M855959430'": 6,
 "'M1946091778'": 7,
 "'M1913465890'": 8,
 "'M209847108'": 9,
 "'M480139044'": 10,
 "'M349281107'": 11,
 "'M1600850729'": 12,
 "'M1535107174'": 13,
 "'M980657600'": 14,
 "'M78078399'": 15,
 "'M1198415165'": 16,
 "'M840466850'": 17,
 "'M1649169323'": 18,
 "'M547558035'": 19,
 "'M50039827'": 20,
 "'M1888755466'": 21,
 "'M692898500'": 22,
 "'M1400236507'": 23,
 "'M1842530320'": 24,
 "'M732195782'": 25,
 "'M97925176'": 26,
 "'M45060432'": 27,
 "'M1741626453'": 28,
 "'M1313686961'": 29,
 "'M1872033263'": 30,
 "'M1352454843'": 31,
 "'M677738360'": 32,
 "'M2122776122'": 33,
 "'M923029380'": 34,
 "'M3697346'": 35,
 "'M17379832'": 36,
 "'M1748431652'": 37,
 "'M1873032707'": 38,
 "'M2011752106'": 39,
 "'M1416436880'": 40,
 "'M1294758098'": 41,
 "'M1788569036'": 42,
 "'M857378720'": 43,
 "'M348875670'": 44,
 "'M1353266412'": 45,
 "'M933210764'": 46,
 "'M495352832'": 47,
 "'M208

In [594]:
#将句子用字典id替换
x_word_ids = tokenizer.texts_to_sequences(df_merch1['merchant_seq'].tolist())

In [595]:
#填充到固定长度，截长补短（补0）
x_padded_seqs = keras.preprocessing.sequence.pad_sequences(x_word_ids, max_len)

In [596]:
x_padded_seqs.shape

(4112, 3000)

### 基于 customer_id 下的做 merchant_id 的 embedding
以customer_id分类，把得到的merchant_id连接起来的字符串作为glove语料库，训练得到对应的merchant_id的50维度的embedding;

In [588]:
embeddings_index = {}
with open('vectors.txt') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 111 word vectors.


In [556]:
embeddings_index

{"'M1823072687'": array([ 0.675986, -0.075922,  0.151441,  0.158761, -0.034658,  0.472927,
         0.003477, -0.681996, -0.086285, -0.422993,  0.277372, -0.3796  ,
         0.349792,  0.323272, -0.020234,  0.058311, -0.278902, -0.285694,
        -0.139044,  0.507276, -0.341835, -0.865007,  0.581334,  0.035477,
         0.224606,  0.659851, -0.013023, -0.568135, -0.272945, -0.267444,
        -0.707815, -0.894838, -0.373877,  0.018446, -0.217044,  0.96138 ,
        -0.203025,  0.929957, -0.93254 ,  0.363158,  0.093442,  0.475872,
        -1.014866, -0.240486,  0.424623,  1.41615 ,  0.160433,  0.084048,
        -0.404824,  0.016036], dtype=float32),
 "'M348934600'": array([ 0.117292, -0.429149,  0.161984,  0.054493,  0.771681,  0.492368,
         0.0453  ,  0.375068,  0.112358,  0.026556,  0.604166, -0.269543,
         0.814167,  0.497165,  0.106263, -0.171392, -0.778946,  0.344986,
         0.402039,  0.519135, -0.079943, -0.348474,  0.897065, -0.072976,
         0.214117,  0.556293,  0

In [602]:
embedding_matrix = np.zeros((len(vocab) + 1, EMBEDDING_DIM))
for word, i in vocab.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [599]:
embedding_matrix.shape

(51, 50)

In [622]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding

In [625]:
model = Sequential()
embedding_layer = Embedding(len(vocab) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=max_len,
                            trainable=False)
model.add(embedding_layer)
model.compile('rmsprop', 'mse')

In [626]:
#把输入传人Embedding层，得到输入的merchant_seq的embedding向量
output_array = model.predict(x_padded_seqs)

In [628]:
output_array.shape

(4112, 3000, 50)

### 五、模型训练和评估
把原始数据的10%作为验证集，进行模型评估

In [519]:
import model_trainer

In [520]:
import lightgbm as lgb
lgb = lgb.LGBMClassifier(num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='binary',
                         max_depth=-1, learning_rate=0.005, min_child_samples=3, random_state=2021,
                         n_estimators=2000, subsample=1, colsample_bytree=1)

In [521]:
import xgboost as xgb
xgb = xgb.XGBClassifier(max_depth=6, learning_rate=0.05, n_estimators=2000, 
                              objective='binary:logistic', #tree_method='gpu_hist', 
                              subsample=0.8, colsample_bytree=0.8, min_child_samples=3, 
                              eval_metric='auc', reg_lambda=0.5)

#### 1.获得baseline1,配合前面初步特征工程，对特征进行删减

In [None]:
#0.初始特征：(lgb) time, customer_id, age_group, gender, merchant_id, type, amount
#score = 0.6919816998131266
clf0, score0 = model_trainer.trainer(lgb, df_train)
score0

In [522]:
score0

0.6919816998131266

In [None]:
#1.初始特征：(xgb) time, customer_id, age_group, gender, merchant_id, type, amount
#score = 0.7208083312020546
clf1, score1 = model_trainer.trainer(xgb, df_train)
score1

In [523]:
score1

0.7208083312020546

In [None]:
from lightgbm import plot_importance
from matplotlib import pyplot
fig, ax = pyplot.subplots(figsize=(10, 10))
plot_importance(clf0, ax)
pyplot.show()

In [None]:
#2.加上_count, _mean, _std特征
clf2, score2 = model_trainer.trainer(lgb, df_train)
score2

In [525]:
score2

0.6861386028757334

In [None]:
fig, ax = pyplot.subplots(figsize=(10, 10))
plot_importance(clf2, ax)
pyplot.show()

In [None]:
#3.加上_count, _mean, _std特征
clf3, score3 = model_trainer.trainer(xgb, df_train)
score3

In [526]:
score3

0.7087504471755575

In [None]:
#4.去掉customer_id
clf4, score4 = model_trainer.trainer(lgb, df_train.drop(['customer_id'], axis=1))
score4

In [527]:
score4

0.6458139802777159

In [None]:
#5.去掉_mean, _std, 只保留_count
#score = 0.6895874169965035
clf5, score5 = model_trainer.trainer(lgb, df_train)
score5

In [528]:
score5

0.6895874169965035

In [None]:
#6._count, mer_cus_count
clf6, score6 = model_trainer.trainer(lgb, df_train)
score6

In [529]:
score6

0.6861386028757334

In [None]:
fig, ax = pyplot.subplots(figsize=(10, 10))
plot_importance(clf6, ax)
pyplot.show()

In [None]:
#7._count, mer_cus_count, mer_cus_mean, mer_cus_std
clf7, score7 = model_trainer.trainer(lgb, df_train)
score7

In [530]:
score7

0.6979686687165771

In [None]:
#7.2.mer_cus_count, mer_cus_mean, mer_cus_std，去掉_count
clf7_2, score7_2 = model_trainer.trainer(lgb, df_train)
score7_2

In [531]:
score7_2

0.6818620634597591

In [None]:
fig, ax = pyplot.subplots(figsize=(10, 10))
plot_importance(clf7, ax)
pyplot.show()

从模型验证结果和特征重要性来看，保留_count特征和mer_cus_count, mer_cus_mean, mer_cus_std
xgb效果更好

#### 2.增加tdidf特征（一到三元语法，有上下限（0.1，0.8），进行模型训练和评估

In [None]:
#8.tdidf特征,1-3元语法（0.1，0.8），无_count特征，无customer_id, 无merchant_id
clf8, score8 = model_trainer.trainer(lgb, df_train)
score8

In [532]:
score8

0.6985617605292569

In [None]:
#9.tdidf特征,1-3元语法（0.1，0.8），_count特征，mer_cus_count, mer_cus_mean, mer_cus_std 
clf9, score9 = model_trainer.trainer(lgb, df_train)
score9

In [533]:
score9

0.7494630809874008

In [None]:
#9.tdidf特征,1-3元语法（0.1，0.9），_count特征，mer_cus_count, mer_cus_mean, mer_cus_std 
clf9_1, score9_1 = model_trainer.trainer(lgb, df_train)
score9_1

In [534]:
score9_1

0.7410550147300851

#### 3.增加tdidf特征（一到四元语法，有下限（0.1），进行模型训练和评估

In [None]:
#10.tdidf特征,1-4元语法,下限（0.1），_count特征，mer_cus_count, mer_cus_mean, mer_cus_std 
clf10, score10 = model_trainer.trainer(lgb, df_train)
score10

In [535]:
score10

0.7404750469127944

In [None]:
#11.tdidf特征,1-4元语法,下限（0.1），_count特征，mer_cus_count, mer_cus_mean, mer_cus_std 
clf11, score11 = model_trainer.trainer(xgb, df_train)
score11

In [536]:
score11

0.7363528171870456

#### 4.增加基于customer_id的统计特征

In [750]:
#lgb 增加基于customer_id的统计特征，不包含tdidf特征
#score = 0.7775679278624599
clf12, score12 = model_trainer.trainer(lgb, df_train)
score12

(266857, 49) (29651, 49) (266857,) (29651,)


0.7775679278624599

In [711]:
#增加基于customer_id的统计特征，包含tdidf特征
clf12_1, score12_1 = model_trainer.trainer(lgb, df_train)
score12_1

(266857, 220) (29651, 220) (266857,) (29651,)


0.7724127921919

In [712]:
#去掉'merchant_seq_len'，反而score有微小上升
clf13, score13 = model_trainer.trainer(lgb, df_train.drop(['merchant_seq_len'], axis=1))
score13

(266857, 219) (29651, 219) (266857,) (29651,)


0.7728701506207754

In [749]:
#xgb, 增加基于customer_id的统计特征，不包含tdidf特征
clf14, score14 = model_trainer.trainer(xgb, df_train)
score14

(266857, 49) (29651, 49) (266857,) (29651,)
Parameters: { "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




0.769401996877237

#### 在以上lgb或xgb建模实践中，效果最好的是clf12，score = 0.7775679278624599。使用的是基于customer_id的统计特征，未使用tdidf特征，用的是lgb模型，使用的是祖传参数，未进行参数调整优化。

In [755]:
#测试集预测，得到结果
test_pred_y12 = clf12.predict_proba(df_test.drop(['id','fraud'], axis=1))
test_pred_y12

array([[9.99996705e-01, 3.29468649e-06],
       [9.99996061e-01, 3.93858322e-06],
       [9.99996106e-01, 3.89357937e-06],
       ...,
       [9.99995360e-01, 4.64000886e-06],
       [9.99995890e-01, 4.10971123e-06],
       [9.99995859e-01, 4.14082942e-06]])

In [759]:
result12 = df_test[['id']]
result12['score'] = test_pred_y12[:,-1]
result12

Unnamed: 0,id,score
296508,0,0.000003
296509,1,0.000004
296510,2,0.000004
296511,3,0.000004
296512,4,0.000005
...,...,...
594638,298130,0.000005
594639,298131,0.000004
594640,298132,0.000005
594641,298133,0.000004


In [760]:
result12.to_csv('test_lgb_ml.csv', index=False)

#### 5.使用神经网络训练
包括开始的初步统计特征，基于customer_id的统计特征以及tdidf特征

In [715]:
train_x, val_x, train_y, val_y = train_test_split(df_train.drop(['id','fraud'], axis=1), df_train['fraud'], test_size=0.1, random_state=2021)

In [721]:
#特征归一化
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_x_nx = scaler.fit_transform(train_x)
val_x_nx = scaler.transform(val_x)

In [729]:
test_nx =  scaler.transform(df_test.drop(['id','fraud'], axis=1))
#test_nx

In [723]:
#使用神经网络进行训练
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers

adam = Adam(lr=0.001)
# 搭建模型 加上正则化项

model = keras.Sequential([
    keras.layers.Dense(250, activation='relu', input_shape=[len(train_x.columns)], kernel_regularizer=regularizers.l2(0.001)),
    keras.layers.Dropout(0.02),
    keras.layers.Dense(100, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
    keras.layers.Dropout(0.02),
    keras.layers.Dense(50, kernel_regularizer=regularizers.l2(0.001)), 
    keras.layers.Dropout(0.02),
    keras.layers.Dense(25, kernel_regularizer=regularizers.l2(0.001)), 
    keras.layers.Dropout(0.02),
    keras.layers.Dense(1, activation='sigmoid') # 需要改写成sigmoid
])
model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer=adam)

model.fit(train_x_nx, train_y, batch_size=10240, epochs=100)

Train on 266857 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/1

Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x13fff9e80>

In [724]:
pred_y = model.predict(val_x_nx)
score_nx = average_precision_score(val_y, pred_y)
score_nx

0.8938012458423894

#### 此案例神经网络的score=0.8938012458423894，特征包括开始的初步统计特征，基于customer_id的统计特征以及tdidf特征。结果远远好于上面使用lgb或者xgb的效果。

In [731]:
#测试集预测，得到结果
test_pred_y = model.predict(test_nx)
test_pred_y

array([[5.9306622e-06],
       [4.1335821e-05],
       [7.5399876e-06],
       ...,
       [5.9935919e-05],
       [8.8382949e-05],
       [1.2576350e-05]], dtype=float32)

In [736]:
result = df_test[['id']]
result['score'] = test_pred_y
result

Unnamed: 0,id,score
296508,0,0.000006
296509,1,0.000041
296510,2,0.000008
296511,3,0.000010
296512,4,0.000105
...,...,...
594638,298130,0.000015
594639,298131,0.000014
594640,298132,0.000060
594641,298133,0.000088


In [738]:
result.to_csv('test_nx.csv', index=False)