In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from scipy import sparse
pd.set_option('max_columns', 200)
pd.set_option('max_colwidth', 200)
pd.set_option('max_rows', 100)
warnings.filterwarnings('ignore')

%matplotlib inline

new_merchant_transactions是过去3个月每个商户的所以交易用户信息，new_merchant_transactions是2个月内特定用户还没有涉及到的商户信息

# 观察数据

In [2]:
new = pd.read_csv('data/new_merchant_transactions.csv')

In [111]:
card_id = new[['card_id']]
card_id.to_csv('data/card_id.csv', index=None)

In [3]:
new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1963031 entries, 0 to 1963030
Data columns (total 14 columns):
authorized_flag         object
card_id                 object
city_id                 int64
category_1              object
installments            int64
category_3              object
merchant_category_id    int64
merchant_id             object
month_lag               int64
purchase_amount         float64
purchase_date           object
category_2              float64
state_id                int64
subsector_id            int64
dtypes: float64(2), int64(6), object(6)
memory usage: 209.7+ MB


In [4]:
new.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_415bb3a509,107,N,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,1.0,9,19
1,Y,C_ID_415bb3a509,140,N,1,B,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,1.0,9,19
2,Y,C_ID_415bb3a509,330,N,1,B,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,1.0,9,14
3,Y,C_ID_415bb3a509,-1,Y,1,B,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,,-1,8
4,Y,C_ID_ef55cf8d4b,-1,Y,1,B,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,,-1,29


In [5]:
states = []
for col in new.columns:
    states.append((col, new[col].nunique(), new[col].isnull().sum()/new.shape[0], new[col].value_counts(normalize=True, dropna=False).values[0], new[col].dtype))
state_df = pd.DataFrame(states, columns=['Feature', 'Unique values', 'Missing values', 'Percentage of biggest category', 'type'])
state_df.sort_values('Percentage of biggest category', ascending=False)

Unnamed: 0,Feature,Unique values,Missing values,Percentage of biggest category,type
0,authorized_flag,1,0.0,1.0,object
3,category_1,2,0.0,0.967858,object
11,category_2,5,0.056925,0.539086,float64
8,month_lag,2,0.0,0.523485,int64
4,installments,15,0.0,0.469806,int64
5,category_3,3,0.028488,0.469806,object
12,state_id,25,0.0,0.373477,int64
13,subsector_id,41,0.0,0.173229,int64
2,city_id,308,0.0,0.167555,int64
6,merchant_category_id,314,0.0,0.09762,int64


In [6]:
new['installments'].value_counts()

 0      922244
 1      836178
-1       55922
 2       54729
 3       44750
 4       14815
 6       10389
 5        9296
 10       8899
 12       2850
 8        1555
 7         863
 9         478
 11         61
 999         2
Name: installments, dtype: int64

# 数据处理

In [200]:
new = pd.read_csv('data/new_merchant_transactions.csv')

## 时间转换

In [201]:
def purchase_day(x):
    big, small = x.split(' ')
    year, month, day = big.split('-')
    all_day = (int(year)*12+int(month)-24143)*30 + int(day)
    return all_day

def purchase_second(x):
    big, small = x.split(' ')
    hour, minite, second = small.split(':')
    all_second = int(hour)*24*60 + int(minite)*60 + int(second)
    return all_second

In [202]:
new['purchase_day'] = new['purchase_date'].apply(purchase_day)
new['purchase_second'] = new['purchase_date'].apply(purchase_second)
new['purchase_month'] = new['purchase_day']//30 + new['month_lag']
new = new.drop('purchase_date', axis=1)

## 处理异常值

In [203]:
# 处理分期付款的异常值
new['installments'][new['installments'] == 999] = -1

## 同一merchant_id购买记录

用户在同一商家购买的次数

In [212]:
card_feature = new[['card_id']]
card_feature = card_feature.drop_duplicates().reset_index(drop=True)

In [213]:
t = new[['card_id', 'merchant_id']]
t['same_merchant_count'] = 1

In [214]:
t1 = t.groupby(['card_id', 'merchant_id']).count()

以card_id为结合——用户在同一家商家购买的次数的最大、最小、中位数、平均数

In [215]:
t2 = t1.reset_index().groupby('card_id').max()[['same_merchant_count']].reset_index()
t2 = t2.rename(columns={'same_merchant_count': 'new_same_merchant_max'})
t3 = t1.reset_index().groupby('card_id').mean()[['same_merchant_count']].reset_index()
t3 = t3.rename(columns={'same_merchant_count': 'new_same_merchant_mean'})
t4 = t1.reset_index().groupby('card_id').min()[['same_merchant_count']].reset_index()
t4 = t4.rename(columns={'same_merchant_count': 'new_same_merchant_min'})
t5 = t1.reset_index().groupby('card_id').median()[['same_merchant_count']].reset_index()
t5 = t5.rename(columns={'same_merchant_count': 'new_same_merchant_median'})

In [216]:
t1['new_num_merchant'] = 1
t6 = t1.reset_index()[['card_id', 'new_num_merchant']].groupby('card_id').count().reset_index()

In [217]:
t['new_same_card_count'] = 1
t7 = t[['card_id', 'new_same_card_count']].groupby('card_id').count().reset_index()

In [218]:
card_feature = pd.merge(card_feature, t2, on='card_id', how='left')
card_feature = pd.merge(card_feature, t3, on='card_id', how='left')
card_feature = pd.merge(card_feature, t4, on='card_id', how='left')
card_feature = pd.merge(card_feature, t5, on='card_id', how='left')
card_feature = pd.merge(card_feature, t6, on='card_id', how='left')
card_feature = pd.merge(card_feature, t7, on='card_id', how='left')

In [219]:
card_feature['new_merchant_max_rate'] = card_feature['new_same_merchant_max']/card_feature['new_same_card_count']
card_feature['new_merchant_mean_rate'] = card_feature['new_same_merchant_mean']/card_feature['new_same_card_count']
card_feature['new_merchant_min_rate'] = card_feature['new_same_merchant_min']/card_feature['new_same_card_count']
card_feature['new_merchant_median_rate'] = card_feature['new_same_merchant_median']/card_feature['new_same_card_count']

In [220]:
card_feature.head()

Unnamed: 0,card_id,new_same_merchant_max,new_same_merchant_mean,new_same_merchant_min,new_same_merchant_median,new_num_merchant,new_same_card_count,new_merchant_max_rate,new_merchant_mean_rate,new_merchant_min_rate,new_merchant_median_rate
0,C_ID_415bb3a509,1.0,1.0,1.0,1.0,4.0,4,0.25,0.25,0.25,0.25
1,C_ID_ef55cf8d4b,1.0,1.0,1.0,1.0,22.0,22,0.045455,0.045455,0.045455,0.045455
2,C_ID_241a01e9d9,1.0,1.0,1.0,1.0,2.0,2,0.5,0.5,0.5,0.5
3,C_ID_a97720321f,1.0,1.0,1.0,1.0,12.0,12,0.083333,0.083333,0.083333,0.083333
4,C_ID_fb0875cd28,1.0,1.0,1.0,1.0,7.0,7,0.142857,0.142857,0.142857,0.142857


In [221]:
card_feature.to_csv('data/new_card_feature.csv', index=None)

## 同一城市购买记录

In [222]:
t = new[['city_id', 'merchant_id', 'card_id']]
t['same_city_count'] = 1

In [223]:
t1 = t[['card_id', 'city_id', 'same_city_count']].groupby(['card_id', 'city_id']).count()

In [224]:
t2 = t1.reset_index().groupby('card_id').max()[['same_city_count']].reset_index()
t2 = t2.rename(columns={'same_city_count': 'new_same_city_max'})
t3 = t1.reset_index().groupby('card_id').mean()[['same_city_count']].reset_index()
t3 = t3.rename(columns={'same_city_count': 'new_same_city_mean'})
t4 = t1.reset_index().groupby('card_id').min()[['same_city_count']].reset_index()
t4 = t4.rename(columns={'same_city_count': 'new_same_city_min'})
t5 = t1.reset_index().groupby('card_id').median()[['same_city_count']].reset_index()
t5 = t5.rename(columns={'same_city_count': 'new_same_city_median'})

In [225]:
t1['new_num_city'] = 1
t6 = t1.reset_index()[['card_id', 'new_num_city']].groupby('card_id').count().reset_index()

In [226]:
t['new_same_card_count'] = 1
t7 = t[['card_id', 'new_same_card_count']].groupby('card_id').count().reset_index()

In [227]:
city_feature = new[['card_id']]
city_feature = city_feature.drop_duplicates().reset_index(drop=True)
city_feature = pd.merge(city_feature, t2, on='card_id', how='left')
city_feature = pd.merge(city_feature, t3, on='card_id', how='left')
city_feature = pd.merge(city_feature, t4, on='card_id', how='left')
city_feature = pd.merge(city_feature, t5, on='card_id', how='left')
city_feature = pd.merge(city_feature, t6, on='card_id', how='left')
city_feature = pd.merge(city_feature, t7, on='card_id', how='left')

In [228]:
city_feature['new_city_max_rate'] = city_feature['new_same_city_max']/city_feature['new_same_card_count']
city_feature['new_city_mean_rate'] = city_feature['new_same_city_mean']/city_feature['new_same_card_count']
city_feature['new_city_min_rate'] = city_feature['new_same_city_min']/city_feature['new_same_card_count']
city_feature['new_city_median_rate'] = city_feature['new_same_city_median']/city_feature['new_same_card_count']
city_feature = city_feature.drop('new_same_card_count', axis=1)

同一家店，不同的城市购买记录

In [57]:
# t['city_merchant_count'] = 1
# t1 = t[['card_id', 'city_id','merchant_id', 'city_merchant_count']].groupby(['card_id', 'merchant_id', 'city_id']).count()
# t1['city_merchant_count'] = 1

In [58]:
# t2 = t1.reset_index()[['card_id', 'merchant_id', 'city_merchant_count']].groupby(['card_id', 'merchant_id']).count()

# t3 = t2.reset_index().groupby('card_id').max()[['city_merchant_count']].reset_index()
# t3 = t3.rename(columns={'city_merchant_count': 'city_merchant_max'})
# t4 = t2.reset_index().groupby('card_id').mean()[['city_merchant_count']].reset_index()
# t4 = t4.rename(columns={'city_merchant_count': 'city_merchant_mean'})

In [229]:
def bigger_1(x):
    if x > 1:
        return 1
    else:
        return 0

In [230]:
# city_feature = pd.merge(city_feature, t3, on='card_id', how='left')
# city_feature = pd.merge(city_feature, t4, on='card_id', how='left')

In [231]:
city_feature.head()

Unnamed: 0,card_id,new_same_city_max,new_same_city_mean,new_same_city_min,new_same_city_median,new_num_city,new_city_max_rate,new_city_mean_rate,new_city_min_rate,new_city_median_rate
0,C_ID_415bb3a509,1,1.0,1,1.0,4,0.25,0.25,0.25,0.25
1,C_ID_ef55cf8d4b,11,7.333333,4,7.0,3,0.5,0.333333,0.181818,0.318182
2,C_ID_241a01e9d9,1,1.0,1,1.0,2,0.5,0.5,0.5,0.5
3,C_ID_a97720321f,11,6.0,1,6.0,2,0.916667,0.5,0.083333,0.5
4,C_ID_fb0875cd28,3,1.75,1,1.5,4,0.428571,0.25,0.142857,0.214286


In [232]:
city_feature.to_csv('data/new_city_feature.csv', index=None)

## 同一merchant_category_id购买记录

In [233]:
category_feature = new[['card_id']]
category_feature = category_feature.drop_duplicates().reset_index(drop=True)

In [234]:
t = new[['merchant_id', 'merchant_category_id', 'card_id']]
t['same_category_count'] = 1

In [235]:
t1 = t[['card_id', 'merchant_category_id', 'same_category_count']].groupby(['card_id', 'merchant_category_id']).count()

In [236]:
t2 = t1.reset_index().groupby('card_id').max()[['same_category_count']].reset_index()
t2 = t2.rename(columns={'same_category_count': 'new_same_category_max'})
t3 = t1.reset_index().groupby('card_id').mean()[['same_category_count']].reset_index()
t3 = t3.rename(columns={'same_category_count': 'new_same_category_mean'})
t4 = t1.reset_index().groupby('card_id').min()[['same_category_count']].reset_index()
t4 = t4.rename(columns={'same_category_count': 'new_same_category_min'})
t5 = t1.reset_index().groupby('card_id').median()[['same_category_count']].reset_index()
t5 = t5.rename(columns={'same_category_count': 'new_same_category_median'})

In [237]:
t1['new_num_category'] = 1
t6 = t1.reset_index()[['card_id', 'new_num_category']].groupby('card_id').count().reset_index()

In [238]:
t['new_same_card_count'] = 1
t7 = t[['card_id', 'new_same_card_count']].groupby('card_id').count().reset_index()

In [239]:
category_feature = pd.merge(category_feature, t2, on='card_id', how='left')
category_feature = pd.merge(category_feature, t3, on='card_id', how='left')
category_feature = pd.merge(category_feature, t4, on='card_id', how='left')
category_feature = pd.merge(category_feature, t5, on='card_id', how='left')
category_feature = pd.merge(category_feature, t6, on='card_id', how='left')
category_feature = pd.merge(category_feature, t7, on='card_id', how='left')

In [240]:
category_feature['new_category_max_rate'] = category_feature['new_same_category_max']/category_feature['new_same_card_count']
category_feature['new_category_mean_rate'] = category_feature['new_same_category_mean']/category_feature['new_same_card_count']
category_feature['new_category_min_rate'] = category_feature['new_same_category_min']/category_feature['new_same_card_count']
category_feature['new_category_median_rate'] = category_feature['new_same_category_median']/category_feature['new_same_card_count']
category_feature = category_feature.drop('new_same_card_count', axis=1)

In [241]:
t['category_merchant_count'] = 1
t1 = t[['card_id', 'merchant_category_id','merchant_id', 'category_merchant_count']].groupby(['card_id','merchant_category_id', 'merchant_id']).count()
t1['category_merchant_count'] = 1

In [242]:
t2 = t1.reset_index().groupby(['card_id', 'merchant_category_id']).count()[['category_merchant_count']]

t3 = t2.reset_index().groupby('card_id').max()[['category_merchant_count']].reset_index()
t3 = t3.rename(columns={'category_merchant_count': 'new_category_merchant_max'})
t4 = t2.reset_index().groupby('card_id').mean()[['category_merchant_count']].reset_index()
t4 = t4.rename(columns={'category_merchant_count': 'new_category_merchant_mean'})

In [243]:
t5 = pd.DataFrame(t2['category_merchant_count'].apply(bigger_1).groupby('card_id').sum()).reset_index()
t5 = t5.rename(columns={'category_merchant_count': 'new_same_category_num_merchant'})

In [244]:
category_feature = pd.merge(category_feature, t3, on='card_id', how='left')
category_feature = pd.merge(category_feature, t4, on='card_id', how='left')
category_feature = pd.merge(category_feature, t5, on='card_id', how='left')

In [245]:
category_feature.head()

Unnamed: 0,card_id,new_same_category_max,new_same_category_mean,new_same_category_min,new_same_category_median,new_num_category,new_category_max_rate,new_category_mean_rate,new_category_min_rate,new_category_median_rate,new_category_merchant_max,new_category_merchant_mean,new_same_category_num_merchant
0,C_ID_415bb3a509,2,1.333333,1,1.0,3,0.5,0.333333,0.25,0.25,2.0,1.333333,1.0
1,C_ID_ef55cf8d4b,3,1.222222,1,1.0,18,0.136364,0.055556,0.045455,0.045455,3.0,1.222222,3.0
2,C_ID_241a01e9d9,1,1.0,1,1.0,2,0.5,0.5,0.5,0.5,1.0,1.0,0.0
3,C_ID_a97720321f,2,1.090909,1,1.0,11,0.166667,0.090909,0.083333,0.083333,2.0,1.090909,1.0
4,C_ID_fb0875cd28,1,1.0,1,1.0,7,0.142857,0.142857,0.142857,0.142857,1.0,1.0,0.0


In [246]:
category_feature.to_csv('data/new_category_feature.csv', index=None)

## 同一subsector_id

In [247]:
subsector_feature = new[['card_id']]
subsector_feature = subsector_feature.drop_duplicates().reset_index(drop=True)
t = new[['merchant_id','subsector_id', 'card_id']]
t['same_subsector_count'] = 1
t1 = t[['card_id', 'subsector_id', 'same_subsector_count']].groupby(['card_id', 'subsector_id']).count()

t2 = t1.reset_index().groupby('card_id').max()[['same_subsector_count']].reset_index()
t2 = t2.rename(columns={'same_subsector_count': 'new_same_subsector_max'})
t3 = t1.reset_index().groupby('card_id').mean()[['same_subsector_count']].reset_index()
t3 = t3.rename(columns={'same_subsector_count': 'new_same_subsector_mean'})
t4 = t1.reset_index().groupby('card_id').min()[['same_subsector_count']].reset_index()
t4 = t4.rename(columns={'same_subsector_count': 'new_same_subsector_min'})
t5 = t1.reset_index().groupby('card_id').median()[['same_subsector_count']].reset_index()
t5 = t5.rename(columns={'same_subsector_count': 'new_same_subsector_median'})

t1['new_num_subsector'] = 1
t6 = t1.reset_index()[['card_id', 'new_num_subsector']].groupby('card_id').count().reset_index()
t['new_same_card_count'] = 1
t7 = t[['card_id', 'new_same_card_count']].groupby('card_id').count().reset_index()

In [248]:
subsector_feature = pd.merge(subsector_feature, t2, on='card_id', how='left')
subsector_feature = pd.merge(subsector_feature, t3, on='card_id', how='left')
subsector_feature = pd.merge(subsector_feature, t4, on='card_id', how='left')
subsector_feature = pd.merge(subsector_feature, t5, on='card_id', how='left')
subsector_feature = pd.merge(subsector_feature, t6, on='card_id', how='left')
subsector_feature = pd.merge(subsector_feature, t7, on='card_id', how='left')

In [249]:
subsector_feature['new_subsector_max_rate'] = subsector_feature['new_same_subsector_max']/subsector_feature['new_same_card_count']
subsector_feature['new_subsector_mean_rate'] = subsector_feature['new_same_subsector_mean']/subsector_feature['new_same_card_count']
subsector_feature['new_subsector_min_rate'] = subsector_feature['new_same_subsector_min']/subsector_feature['new_same_card_count']
subsector_feature['new_subsector_median_rate'] = subsector_feature['new_same_subsector_median']/subsector_feature['new_same_card_count']
subsector_feature = subsector_feature.drop('new_same_card_count', axis=1)

In [250]:
t['subsector_merchant_count'] = 1
t1 = t[['card_id', 'subsector_id','merchant_id', 'subsector_merchant_count']].groupby(['card_id','subsector_id', 'merchant_id']).count()
t1['subsector_merchant_count'] = 1

In [251]:
t2 = t1.reset_index().groupby(['card_id', 'subsector_id']).count()[['subsector_merchant_count']]

t3 = t2.reset_index().groupby('card_id').max()[['subsector_merchant_count']].reset_index()
t3 = t3.rename(columns={'subsector_merchant_count': 'new_subsector_merchant_max'})
t4 = t2.reset_index().groupby('card_id').mean()[['subsector_merchant_count']].reset_index()
t4 = t4.rename(columns={'subsector_merchant_count': 'new_subsector_merchant_mean'})
t5 = pd.DataFrame(t2['subsector_merchant_count'].apply(bigger_1).groupby('card_id').sum()).reset_index()
t5 = t5.rename(columns={'subsector_merchant_count': 'new_same_subsector_num_merchant'})

In [252]:
subsector_feature = pd.merge(subsector_feature, t3, on='card_id', how='left')
subsector_feature = pd.merge(subsector_feature, t4, on='card_id', how='left')
subsector_feature = pd.merge(subsector_feature, t5, on='card_id', how='left')

In [253]:
subsector_feature.head()

Unnamed: 0,card_id,new_same_subsector_max,new_same_subsector_mean,new_same_subsector_min,new_same_subsector_median,new_num_subsector,new_subsector_max_rate,new_subsector_mean_rate,new_subsector_min_rate,new_subsector_median_rate,new_subsector_merchant_max,new_subsector_merchant_mean,new_same_subsector_num_merchant
0,C_ID_415bb3a509,2,1.333333,1,1.0,3,0.5,0.333333,0.25,0.25,2.0,1.333333,1.0
1,C_ID_ef55cf8d4b,4,1.833333,1,1.5,12,0.181818,0.083333,0.045455,0.068182,4.0,1.833333,6.0
2,C_ID_241a01e9d9,1,1.0,1,1.0,2,0.5,0.5,0.5,0.5,1.0,1.0,0.0
3,C_ID_a97720321f,2,1.333333,1,1.0,9,0.166667,0.111111,0.083333,0.083333,2.0,1.333333,3.0
4,C_ID_fb0875cd28,3,1.4,1,1.0,5,0.428571,0.2,0.142857,0.142857,3.0,1.4,1.0


In [254]:
subsector_feature.to_csv('data/new_subsector_feature.csv', index=None)

## 同一state_id购买记录

In [255]:
state_feature = new[['card_id']]
state_feature = state_feature.drop_duplicates().reset_index(drop=True)
t = new[['merchant_id','state_id', 'card_id']]
t['same_state_count'] = 1
t1 = t[['card_id', 'state_id', 'same_state_count']].groupby(['card_id', 'state_id']).count()

t2 = t1.reset_index().groupby('card_id').max()[['same_state_count']].reset_index()
t2 = t2.rename(columns={'same_state_count': 'new_same_state_max'})
t3 = t1.reset_index().groupby('card_id').mean()[['same_state_count']].reset_index()
t3 = t3.rename(columns={'same_state_count': 'new_same_state_mean'})
t4 = t1.reset_index().groupby('card_id').min()[['same_state_count']].reset_index()
t4 = t4.rename(columns={'same_state_count': 'new_same_state_min'})
t5 = t1.reset_index().groupby('card_id').median()[['same_state_count']].reset_index()
t5 = t5.rename(columns={'same_state_count': 'new_same_state_median'})

t1['new_num_state'] = 1
t6 = t1.reset_index()[['card_id', 'new_num_state']].groupby('card_id').count().reset_index()
t['same_card_count'] = 1
t7 = t[['card_id', 'same_card_count']].groupby('card_id').count().reset_index()

state_feature = pd.merge(state_feature, t2, on='card_id', how='left')
state_feature = pd.merge(state_feature, t3, on='card_id', how='left')
state_feature = pd.merge(state_feature, t4, on='card_id', how='left')
state_feature = pd.merge(state_feature, t5, on='card_id', how='left')
state_feature = pd.merge(state_feature, t6, on='card_id', how='left')
state_feature = pd.merge(state_feature, t7, on='card_id', how='left')

state_feature['new_state_max_rate'] = state_feature['new_same_state_max']/state_feature['same_card_count']
state_feature['new_state_mean_rate'] = state_feature['new_same_state_mean']/state_feature['same_card_count']
state_feature['new_state_min_rate'] = state_feature['new_same_state_min']/state_feature['same_card_count']
state_feature['new_state_median_rate'] = state_feature['new_same_state_median']/state_feature['same_card_count']
state_feature = state_feature.drop('same_card_count', axis=1)

In [256]:
t['state_merchant_count'] = 1
t1 = t[['card_id', 'state_id','merchant_id', 'state_merchant_count']].groupby(['card_id','state_id', 'merchant_id']).count()
t1['state_merchant_count'] = 1

t2 = t1.reset_index().groupby(['card_id', 'state_id']).count()[['state_merchant_count']]

t3 = t2.reset_index().groupby('card_id').max()[['state_merchant_count']].reset_index()
t3 = t3.rename(columns={'state_merchant_count': 'new_state_merchant_max'})
t4 = t2.reset_index().groupby('card_id').mean()[['state_merchant_count']].reset_index()
t4 = t4.rename(columns={'state_merchant_count': 'new_state_merchant_mean'})
t5 = pd.DataFrame(t2['state_merchant_count'].apply(bigger_1).groupby('card_id').sum()).reset_index()
t5 = t5.rename(columns={'state_merchant_count': 'new_same_state_num_merchant'})

state_feature = pd.merge(state_feature, t3, on='card_id', how='left')
state_feature = pd.merge(state_feature, t4, on='card_id', how='left')
state_feature = pd.merge(state_feature, t5, on='card_id', how='left')

In [257]:
state_feature.head()

Unnamed: 0,card_id,new_same_state_max,new_same_state_mean,new_same_state_min,new_same_state_median,new_num_state,new_state_max_rate,new_state_mean_rate,new_state_min_rate,new_state_median_rate,new_state_merchant_max,new_state_merchant_mean,new_same_state_num_merchant
0,C_ID_415bb3a509,3,2.0,1,2.0,2,0.75,0.5,0.25,0.5,3.0,2.0,1.0
1,C_ID_ef55cf8d4b,18,11.0,4,11.0,2,0.818182,0.5,0.181818,0.5,18.0,11.0,2.0
2,C_ID_241a01e9d9,2,2.0,2,2.0,1,1.0,1.0,1.0,1.0,2.0,2.0,1.0
3,C_ID_a97720321f,11,6.0,1,6.0,2,0.916667,0.5,0.083333,0.5,11.0,6.0,1.0
4,C_ID_fb0875cd28,7,7.0,7,7.0,1,1.0,1.0,1.0,1.0,7.0,7.0,1.0


In [258]:
state_feature.to_csv('data/new_state_feature.csv', index=None)

## Purchase_day特征

此处的purchase_day是一个相对的时间，在与train文件结合的时候要换算成train的相对时间

In [259]:
t = new[['card_id', 'purchase_day', 'merchant_id']]
t1 = t[['card_id', 'purchase_day']].groupby(['card_id']).max()
t1 = t1.rename(columns={'purchase_day': 'new_purchase_day_max'})
t2 = t[['card_id', 'purchase_day']].groupby(['card_id']).min()
t2 = t2.rename(columns={'purchase_day': 'new_purchase_day_min'})

In [260]:
t3 = pd.DataFrame(t1['new_purchase_day_max'] - t2['new_purchase_day_min'], columns=['new_purchase_day_diff'])

In [261]:
def bigger_0(x):
    if x > 0:
        return 1
    else:
        return 0

In [262]:
day_feature = new[['card_id']]
day_feature = day_feature.drop_duplicates().reset_index(drop=True)
day_feature = pd.merge(day_feature, t1, on='card_id', how='left')
day_feature = pd.merge(day_feature, t2, on='card_id', how='left')
day_feature = pd.merge(day_feature, t3, on='card_id', how='left')

In [263]:
day_feature.head()

Unnamed: 0,card_id,new_purchase_day_max,new_purchase_day_min,new_purchase_day_diff
0,C_ID_415bb3a509,2336,2287,49
1,C_ID_ef55cf8d4b,2332,2281,51
2,C_ID_241a01e9d9,2332,2332,0
3,C_ID_a97720321f,2323,2281,42
4,C_ID_fb0875cd28,2329,2285,44


In [264]:
day_feature.to_csv('data/new_day_feature.csv', index=None)

## purchase_month 特征

In [265]:
t = new[['card_id', 'purchase_day', 'merchant_id']]
t['purchase_month'] = t['purchase_day']//30
t1 = t[['card_id', 'purchase_month']].groupby(['card_id']).max()
t1 = t1.rename(columns={'purchase_month': 'new_purchase_month_max'})
t2 = t[['card_id', 'purchase_month']].groupby(['card_id']).min()
t2 = t2.rename(columns={'purchase_month': 'new_purchase_month_min'})
t3 = pd.DataFrame(t1['new_purchase_month_max'] - t2['new_purchase_month_min'], columns=['new_purchase_month_diff'])

In [266]:
month_feature = new[['card_id']]
month_feature = month_feature.drop_duplicates().reset_index(drop=True)
month_feature = pd.merge(month_feature, t1, on='card_id', how='left')
month_feature = pd.merge(month_feature, t2, on='card_id', how='left')
month_feature = pd.merge(month_feature, t3, on='card_id', how='left')

In [267]:
month_feature.head()

Unnamed: 0,card_id,new_purchase_month_max,new_purchase_month_min,new_purchase_month_diff
0,C_ID_415bb3a509,77,76,1
1,C_ID_ef55cf8d4b,77,76,1
2,C_ID_241a01e9d9,77,77,0
3,C_ID_a97720321f,77,76,1
4,C_ID_fb0875cd28,77,76,1


In [268]:
month_feature.to_csv('data/new_month_feature.csv', index=None)

purchase_second特征（购买方式）

In [142]:
# t = new[['card_id', 'purchase_second']]
# t['second_count'] = 1
# t1 = t.groupby(['card_id', 'purchase_second']).count().reset_index()

# t2 = t1.reset_index().groupby('card_id').max()[['second_count']].reset_index()
# t2 = t2.rename(columns={'second_count': 'second_max'})
# t3 = t1.reset_index().groupby('card_id').mean()[['second_count']].reset_index()
# t3 = t3.rename(columns={'second_count': 'second_mean'})
# t4 = t1.reset_index().groupby('card_id').max().reset_index()
# t4 = t4[['card_id', 'purchase_second']][t4['second_count'] != 1]
# t4 = t4.rename(columns={'purchase_second': 'max_second_time'})

In [143]:
# second_feature = new[['card_id']]
# second_feature = second_feature.drop_duplicates().reset_index(drop=True)
# second_feature = pd.merge(second_feature, t2, on='card_id', how='left')
# second_feature = pd.merge(second_feature, t3, on='card_id', how='left')
# second_feature = pd.merge(second_feature, t4, on='card_id', how='left')

In [148]:
# second_feature.head()

In [79]:
# second_feature.to_csv('data/new_second_feature.csv', index=None)

## installments特征

In [269]:
t = new[['card_id', 'installments']]
t['installments_count'] = 1
t1 = t.groupby(['card_id', 'installments']).count().reset_index()

t2 = t1.reset_index().groupby('card_id').max()[['installments_count']].reset_index()
t2 = t2.rename(columns={'installments_count': 'new_install_num_max'})
t3 = t1.reset_index().groupby('card_id').mean()[['installments_count']].reset_index()
t3 = t3.rename(columns={'installments_count': 'new_install_num_mean'})
t4 = t1.reset_index().groupby('card_id').median()[['installments_count']].reset_index()
t4 = t4.rename(columns={'installments_count': 'new_install_num_median'})
t5 = t1.reset_index().groupby('card_id').max().reset_index()
t5 = t5[['card_id', 'installments']][t5['installments_count'] != -1]
t5 = t5.rename(columns={'installments': 'new_max_installments'})
t6 = t1.reset_index().groupby('card_id').median().reset_index()
t6 = t6[['card_id', 'installments']][t6['installments_count'] != -1]
t6 = t6.rename(columns={'installments': 'new_median_installments'})

t7 = t.groupby('card_id').mean()[['installments']].reset_index()
t7 = t7.rename(columns={'installments': 'new_mean_installments_value'})
t8 = t.groupby('card_id').max()[['installments']].reset_index()
t8 = t8.rename(columns={'installments': 'new_max_installments_value'})
t9 = t.groupby('card_id').min()[['installments']].reset_index()
t9 = t9.rename(columns={'installments': 'new_min_installments_value'})
t10 = t.groupby('card_id').median()[['installments']].reset_index()
t10 = t10.rename(columns={'installments': 'new_median_installments_value'})

In [270]:
installments_feature = new[['card_id']]
installments_feature = installments_feature.drop_duplicates().reset_index(drop=True)
installments_feature = pd.merge(installments_feature, t2, on='card_id', how='left')
installments_feature = pd.merge(installments_feature, t3, on='card_id', how='left')
installments_feature = pd.merge(installments_feature, t4, on='card_id', how='left')
installments_feature = pd.merge(installments_feature, t5, on='card_id', how='left')
installments_feature = pd.merge(installments_feature, t6, on='card_id', how='left')
installments_feature = pd.merge(installments_feature, t7, on='card_id', how='left')
installments_feature = pd.merge(installments_feature, t8, on='card_id', how='left')
installments_feature = pd.merge(installments_feature, t9, on='card_id', how='left')
installments_feature = pd.merge(installments_feature, t10, on='card_id', how='left')

In [271]:
installments_feature.head()

Unnamed: 0,card_id,new_install_num_max,new_install_num_mean,new_install_num_median,new_max_installments,new_median_installments,new_mean_installments_value,new_max_installments_value,new_min_installments_value,new_median_installments_value
0,C_ID_415bb3a509,4,4.0,4.0,1,1.0,1.0,1,1,1.0
1,C_ID_ef55cf8d4b,17,4.4,1.0,12,2.0,1.636364,12,-1,1.0
2,C_ID_241a01e9d9,1,1.0,1.0,6,4.5,4.5,6,3,4.5
3,C_ID_a97720321f,12,12.0,12.0,0,0.0,0.0,0,0,0.0
4,C_ID_fb0875cd28,7,7.0,7.0,0,0.0,0.0,0,0,0.0


In [272]:
installments_feature.to_csv('data/new_installments_feature.csv', index=None)

## purchase_amount特征

In [273]:
t = new[['card_id', 'purchase_amount']]
t['purchase_count'] = 1
t1 = t.groupby(['card_id', 'purchase_amount']).count().reset_index()

t2 = t1.reset_index().groupby('card_id').max()[['purchase_count']].reset_index()
t2 = t2.rename(columns={'purchase_count': 'new_purchase_num_max'})
t3 = t1.reset_index().groupby('card_id').mean()[['purchase_count']].reset_index()
t3 = t3.rename(columns={'purchase_count': 'new_purchase_num_mean'})
t4 = t1.reset_index().groupby('card_id').median()[['purchase_count']].reset_index()
t4 = t4.rename(columns={'purchase_count': 'new_purchase_num_median'})
t4 = t1.reset_index().groupby('card_id').min()[['purchase_count']].reset_index()
t4 = t4.rename(columns={'purchase_count': 'new_purchase_num_min'})
t5 = t1.reset_index().groupby('card_id').max().reset_index()
t5 = t5[['card_id', 'purchase_amount']][t5['purchase_count'] != -1]
t5 = t5.rename(columns={'purchase_amount': 'new_max_purchase'})
t6 = t1.reset_index().groupby('card_id').median().reset_index()
t6 = t6[['card_id', 'purchase_amount']][t6['purchase_count'] != -1]
t6 = t6.rename(columns={'purchase_amount': 'new_median_purchase'})

In [274]:
t7 = t.groupby('card_id').mean()[['purchase_amount']].reset_index()
t7 = t7.rename(columns={'purchase_amount': 'new_mean_purchase_amount_value'})
t8 = t.groupby('card_id').max()[['purchase_amount']].reset_index()
t8 = t8.rename(columns={'purchase_amount': 'new_max_purchase_amount_value'})
t9 = t.groupby('card_id').min()[['purchase_amount']].reset_index()
t9 = t9.rename(columns={'purchase_amount': 'new_min_purchase_amount_value'})
t10 = t.groupby('card_id').median()[['purchase_amount']].reset_index()
t10 = t10.rename(columns={'purchase_amount': 'new_median_purchase_amount_value'})

In [275]:
purchase_feature = new[['card_id']]
purchase_feature = purchase_feature.drop_duplicates().reset_index(drop=True)
purchase_feature = pd.merge(purchase_feature, t2, on='card_id', how='left')
purchase_feature = pd.merge(purchase_feature, t3, on='card_id', how='left')
purchase_feature = pd.merge(purchase_feature, t4, on='card_id', how='left')
purchase_feature = pd.merge(purchase_feature, t5, on='card_id', how='left')
purchase_feature = pd.merge(purchase_feature, t6, on='card_id', how='left')
purchase_feature = pd.merge(purchase_feature, t7, on='card_id', how='left')
purchase_feature = pd.merge(purchase_feature, t8, on='card_id', how='left')
purchase_feature = pd.merge(purchase_feature, t9, on='card_id', how='left')
purchase_feature = pd.merge(purchase_feature, t10, on='card_id', how='left')

In [276]:
purchase_feature.head()

Unnamed: 0,card_id,new_purchase_num_max,new_purchase_num_mean,new_purchase_num_min,new_max_purchase,new_median_purchase,new_mean_purchase_amount_value,new_max_purchase_amount_value,new_min_purchase_amount_value,new_median_purchase_amount_value
0,C_ID_415bb3a509,1,1.0,1,-0.551037,-0.563577,-0.587529,-0.551037,-0.671925,-0.563577
1,C_ID_ef55cf8d4b,1,1.0,1,5.263697,-0.601301,0.256985,5.263697,-0.704834,-0.601301
2,C_ID_241a01e9d9,1,1.0,1,-0.29476,-0.378232,-0.378232,-0.29476,-0.461705,-0.378232
3,C_ID_a97720321f,1,1.0,1,0.755743,-0.464124,-0.376239,0.755743,-0.718132,-0.464124
4,C_ID_fb0875cd28,1,1.0,1,-0.565087,-0.671775,-0.65978,-0.565087,-0.712347,-0.671775


In [277]:
purchase_feature.to_csv('data/new_purchase_feature.csv', index=None)

## purchase_installments联合特征

In [278]:
t = new[['card_id', 'installments', 'purchase_amount']]
t1 = t[['card_id', 'purchase_amount']].groupby(['card_id']).max()[['purchase_amount']]
t1 = t1.rename(columns={'purchase_amount': 'purchase_max'})
t1['install_max'] = t[['card_id', 'installments']].groupby(['card_id']).max()[['installments']]

In [279]:
def change_not_0(y):
    if y == 0 or y == -1:
        return 1
    else:
        return y

In [280]:
t1['install_max'] = t1['install_max'].apply(change_not_0)
t1['new_every_max_install_amount'] = t1['purchase_max']/t1['install_max']
t1 = t1[['new_every_max_install_amount']]

In [281]:
t2 = t[['card_id', 'purchase_amount']].groupby(['card_id']).mean()[['purchase_amount']]
t2 = t2.rename(columns={'purchase_amount': 'purchase_mean'})
t2['install_mean'] = t[['card_id', 'installments']].groupby(['card_id']).mean()[['installments']]
t2['install_mean'] = t2['install_mean'].apply(change_not_0)
t2['new_every_mean_install_amount'] = t2['purchase_mean']/t2['install_mean']
t2 = t2[['new_every_mean_install_amount']]

In [282]:
t3 = t[['card_id', 'purchase_amount']].groupby(['card_id']).median()[['purchase_amount']]
t3 = t3.rename(columns={'purchase_amount': 'purchase_median'})
t3['install_median'] = t[['card_id', 'installments']].groupby(['card_id']).median()[['installments']]
t3['install_median'] = t3['install_median'].apply(change_not_0)
t3['new_every_median_install_amount'] = t3['purchase_median']/t3['install_median']
t3 = t3[['new_every_median_install_amount']]

In [283]:
install_purchase_feature = new[['card_id']]
install_purchase_feature = install_purchase_feature.drop_duplicates().reset_index(drop=True)
install_purchase_feature = pd.merge(install_purchase_feature, t1, on='card_id', how='left')
install_purchase_feature = pd.merge(install_purchase_feature, t2, on='card_id', how='left')
install_purchase_feature = pd.merge(install_purchase_feature, t3, on='card_id', how='left')

In [284]:
install_purchase_feature.head()

Unnamed: 0,card_id,new_every_max_install_amount,new_every_mean_install_amount,new_every_median_install_amount
0,C_ID_415bb3a509,-0.551037,-0.587529,-0.563577
1,C_ID_ef55cf8d4b,0.438641,0.157046,-0.601301
2,C_ID_241a01e9d9,-0.049127,-0.084052,-0.084052
3,C_ID_a97720321f,0.755743,-0.376239,-0.464124
4,C_ID_fb0875cd28,-0.565087,-0.65978,-0.671775


In [285]:
install_purchase_feature.to_csv('data/new_install_purchase_feature.csv', index=None)

## category特征

In [286]:
category_features = ['category_1', 'category_2', 'category_3']
for col in category_features:
    new[col] = new[col].map(dict(zip(new[col].unique(), range(0, new[col].nunique()))))

In [287]:
category_1_feature = new[['card_id']]
category_1_feature = category_1_feature.drop_duplicates().reset_index(drop=True)
t = new[['merchant_id','category_1', 'card_id']]
t['same_category_1_count'] = 1
t1 = t[['card_id', 'category_1', 'same_category_1_count']].groupby(['card_id', 'category_1']).count()

t2 = t1.reset_index().groupby('card_id').max()[['same_category_1_count']].reset_index()
t2 = t2.rename(columns={'same_category_1_count': 'new_same_category_1_max'})
t3 = t1.reset_index().groupby('card_id').mean()[['same_category_1_count']].reset_index()
t3 = t3.rename(columns={'same_category_1_count': 'new_same_category_1_mean'})
t4 = t1.reset_index().groupby('card_id').min()[['same_category_1_count']].reset_index()
t4 = t4.rename(columns={'same_category_1_count': 'new_same_category_1_min'})
t5 = t1.reset_index().groupby('card_id').median()[['same_category_1_count']].reset_index()
t5 = t5.rename(columns={'same_category_1_count': 'new_same_category_1_median'})

t6 = t[['card_id', 'category_1']].groupby('card_id').mean().reset_index()
t6 = t6.rename(columns={'category_1': 'new_mean_category_1_value'})
t7 = t[['card_id', 'category_1']].groupby('card_id').max().reset_index()
t7 = t7.rename(columns={'category_1': 'new_max_category_1_value'})
t8 = t[['card_id', 'category_1']].groupby('card_id').min().reset_index()
t8 = t8.rename(columns={'category_1': 'new_min_category_1_value'})
t9 = t[['card_id', 'category_1']].groupby('card_id').median().reset_index()
t9 = t9.rename(columns={'category_1': 'new_median_category_1_value'})

In [288]:
category_1_feature = new[['card_id']]
category_1_feature = category_1_feature.drop_duplicates().reset_index(drop=True)
category_1_feature = pd.merge(category_1_feature, t2, on='card_id', how='left')
category_1_feature = pd.merge(category_1_feature, t3, on='card_id', how='left')
category_1_feature = pd.merge(category_1_feature, t4, on='card_id', how='left')
category_1_feature = pd.merge(category_1_feature, t5, on='card_id', how='left')
category_1_feature = pd.merge(category_1_feature, t6, on='card_id', how='left')
category_1_feature = pd.merge(category_1_feature, t7, on='card_id', how='left')
category_1_feature = pd.merge(category_1_feature, t8, on='card_id', how='left')
category_1_feature = pd.merge(category_1_feature, t9, on='card_id', how='left')

In [289]:
category_1_feature.head()

Unnamed: 0,card_id,new_same_category_1_max,new_same_category_1_mean,new_same_category_1_min,new_same_category_1_median,new_mean_category_1_value,new_max_category_1_value,new_min_category_1_value,new_median_category_1_value
0,C_ID_415bb3a509,3,2.0,1,2.0,0.25,1,0,0.0
1,C_ID_ef55cf8d4b,18,11.0,4,11.0,0.181818,1,0,0.0
2,C_ID_241a01e9d9,2,2.0,2,2.0,0.0,0,0,0.0
3,C_ID_a97720321f,12,12.0,12,12.0,0.0,0,0,0.0
4,C_ID_fb0875cd28,7,7.0,7,7.0,0.0,0,0,0.0


In [290]:
category_1_feature.to_csv('data/new_category_1_feature.csv', index=None)

In [291]:
category_2_feature = new[['card_id']]
category_2_feature = category_2_feature.drop_duplicates().reset_index(drop=True)
t = new[['merchant_id','category_2', 'card_id']]
t['same_category_2_count'] = 1
t1 = t[['card_id', 'category_2', 'same_category_2_count']].groupby(['card_id', 'category_2']).count()

t2 = t1.reset_index().groupby('card_id').max()[['same_category_2_count']].reset_index()
t2 = t2.rename(columns={'same_category_2_count': 'new_same_category_2_max'})
t3 = t1.reset_index().groupby('card_id').mean()[['same_category_2_count']].reset_index()
t3 = t3.rename(columns={'same_category_2_count': 'new_same_category_2_mean'})
t4 = t1.reset_index().groupby('card_id').min()[['same_category_2_count']].reset_index()
t4 = t4.rename(columns={'same_category_2_count': 'new_same_category_2_min'})
t5 = t1.reset_index().groupby('card_id').median()[['same_category_2_count']].reset_index()
t5 = t5.rename(columns={'same_category_2_count': 'new_same_category_2_median'})

t6 = t[['card_id', 'category_2']].groupby('card_id').mean().reset_index()
t6 = t6.rename(columns={'category_2': 'new_mean_category_2_value'})
t7 = t[['card_id', 'category_2']].groupby('card_id').max().reset_index()
t7 = t7.rename(columns={'category_2': 'new_max_category_2_value'})
t8 = t[['card_id', 'category_2']].groupby('card_id').min().reset_index()
t8 = t8.rename(columns={'category_2': 'new_min_category_2_value'})
t9 = t[['card_id', 'category_2']].groupby('card_id').median().reset_index()
t9 = t9.rename(columns={'category_2': 'new_median_category_2_value'})

In [292]:
category_2_feature = new[['card_id']]
category_2_feature = category_2_feature.drop_duplicates().reset_index(drop=True)
category_2_feature = pd.merge(category_2_feature, t2, on='card_id', how='left')
category_2_feature = pd.merge(category_2_feature, t3, on='card_id', how='left')
category_2_feature = pd.merge(category_2_feature, t4, on='card_id', how='left')
category_2_feature = pd.merge(category_2_feature, t5, on='card_id', how='left')
category_2_feature = pd.merge(category_2_feature, t6, on='card_id', how='left')
category_2_feature = pd.merge(category_2_feature, t7, on='card_id', how='left')
category_2_feature = pd.merge(category_2_feature, t8, on='card_id', how='left')
category_2_feature = pd.merge(category_2_feature, t9, on='card_id', how='left')

In [293]:
category_2_feature.to_csv('data/new_category_2_feature.csv', index=None)

In [294]:
t = new[['merchant_id','category_3', 'card_id']]
t['same_category_3_count'] = 1
t1 = t[['card_id', 'category_3', 'same_category_3_count']].groupby(['card_id', 'category_3']).count()

t2 = t1.reset_index().groupby('card_id').max()[['same_category_3_count']].reset_index()
t2 = t2.rename(columns={'same_category_3_count': 'new_same_category_3_max'})
t3 = t1.reset_index().groupby('card_id').mean()[['same_category_3_count']].reset_index()
t3 = t3.rename(columns={'same_category_3_count': 'new_same_category_3_mean'})
t4 = t1.reset_index().groupby('card_id').min()[['same_category_3_count']].reset_index()
t4 = t4.rename(columns={'same_category_3_count': 'new_same_category_3_min'})
t5 = t1.reset_index().groupby('card_id').median()[['same_category_3_count']].reset_index()
t5 = t5.rename(columns={'same_category_3_count': 'new_same_category_3_median'})

t6 = t[['card_id', 'category_3']].groupby('card_id').mean().reset_index()
t6 = t6.rename(columns={'category_3': 'new_mean_category_3_value'})
t7 = t[['card_id', 'category_3']].groupby('card_id').max().reset_index()
t7 = t7.rename(columns={'category_3': 'new_max_category_3_value'})
t8 = t[['card_id', 'category_3']].groupby('card_id').min().reset_index()
t8 = t8.rename(columns={'category_3': 'new_min_category_3_value'})
t9 = t[['card_id', 'category_3']].groupby('card_id').median().reset_index()
t9 = t9.rename(columns={'category_3': 'new_median_category_3_value'})

In [295]:
category_3_feature = new[['card_id']]
category_3_feature = category_3_feature.drop_duplicates().reset_index(drop=True)
category_3_feature = pd.merge(category_3_feature, t2, on='card_id', how='left')
category_3_feature = pd.merge(category_3_feature, t3, on='card_id', how='left')
category_3_feature = pd.merge(category_3_feature, t4, on='card_id', how='left')
category_3_feature = pd.merge(category_3_feature, t5, on='card_id', how='left')
category_3_feature = pd.merge(category_3_feature, t6, on='card_id', how='left')
category_3_feature = pd.merge(category_3_feature, t7, on='card_id', how='left')
category_3_feature = pd.merge(category_3_feature, t8, on='card_id', how='left')
category_3_feature = pd.merge(category_3_feature, t9, on='card_id', how='left')

In [296]:
category_3_feature.to_csv('data/new_category_3_feature.csv', index=None)

## authorized特征——值得深入

In [297]:
new['authorized_flag'] = new['authorized_flag'].map(dict(zip(new['authorized_flag'].unique(), range(0, new['authorized_flag'].nunique()))))

In [298]:
authorized_feature = new[['card_id']]
authorized_feature = authorized_feature.drop_duplicates().reset_index(drop=True)
t = new[['merchant_id','authorized_flag', 'card_id']]
t['same_authorized_count'] = 1
t1 = t[['card_id', 'authorized_flag', 'same_authorized_count']].groupby(['card_id', 'authorized_flag']).sum()

t2 = t1.reset_index().groupby('card_id').max()[['same_authorized_count']].reset_index()
t2 = t2.rename(columns={'same_authorized_count': 'new_same_authorized_max'})
t3 = t1.reset_index().groupby('card_id').mean()[['same_authorized_count']].reset_index()
t3 = t3.rename(columns={'same_authorized_count': 'new_same_authorized_mean'})
t4 = t1.reset_index().groupby('card_id').min()[['same_authorized_count']].reset_index()
t4 = t4.rename(columns={'same_authorized_count': 'new_same_authorized_min'})
t5 = t1.reset_index().groupby('card_id').median()[['same_authorized_count']].reset_index()
t5 = t5.rename(columns={'same_authorized_count': 'new_same_authorized_median'})

t6 = t[['card_id', 'authorized_flag']].groupby('card_id').mean().reset_index()
t6 = t6.rename(columns={'authorized_flag': 'new_mean_authorized_value'})
t7 = t[['card_id', 'authorized_flag']].groupby('card_id').max().reset_index()
t7 = t7.rename(columns={'authorized_flag': 'new_max_authorized_value'})
t8 = t[['card_id', 'authorized_flag']].groupby('card_id').median().reset_index()
t8 = t8.rename(columns={'authorized_flag': 'new_median_authorized_value'})

In [299]:
authorized_feature = pd.merge(authorized_feature, t2, on='card_id', how='left')
authorized_feature = pd.merge(authorized_feature, t3, on='card_id', how='left')
authorized_feature = pd.merge(authorized_feature, t4, on='card_id', how='left')
authorized_feature = pd.merge(authorized_feature, t5, on='card_id', how='left')
authorized_feature = pd.merge(authorized_feature, t6, on='card_id', how='left')
authorized_feature = pd.merge(authorized_feature, t7, on='card_id', how='left')
authorized_feature = pd.merge(authorized_feature, t8, on='card_id', how='left')

In [300]:
authorized_feature.to_csv('data/new_authorized_feature.csv', index=None)

## Month_lag特征——值得深入

In [301]:
month_lag_feature = new[['card_id']]
month_lag_feature = month_lag_feature.drop_duplicates().reset_index(drop=True)
t = new[['merchant_id','month_lag', 'card_id']]
t['same_month_lag_count'] = 1
t1 = t[['card_id', 'month_lag', 'same_month_lag_count']].groupby(['card_id', 'month_lag']).count()

t2 = t1.reset_index().groupby('card_id').max()[['same_month_lag_count']].reset_index()
t2 = t2.rename(columns={'same_month_lag_count': 'new_same_month_lag_max'})
t3 = t1.reset_index().groupby('card_id').mean()[['same_month_lag_count']].reset_index()
t3 = t3.rename(columns={'same_month_lag_count': 'new_same_month_lag_mean'})
t4 = t1.reset_index().groupby('card_id').min()[['same_month_lag_count']].reset_index()
t4 = t4.rename(columns={'same_month_lag_count': 'new_same_month_lag_min'})
t5 = t1.reset_index().groupby('card_id').median()[['same_month_lag_count']].reset_index()
t5 = t5.rename(columns={'same_month_lag_count': 'new_same_month_lag_median'})

t6 = t[['card_id', 'month_lag']].groupby('card_id').mean().reset_index()
t6 = t6.rename(columns={'month_lag': 'new_mean_month_lag_value'})
t7 = t[['card_id', 'month_lag']].groupby('card_id').max().reset_index()
t7 = t7.rename(columns={'month_lag': 'new_max_month_lag_value'})
t8 = t[['card_id', 'month_lag']].groupby('card_id').min().reset_index()
t8 = t8.rename(columns={'month_lag': 'new_min_month_lag_value'})
t9 = t[['card_id', 'month_lag']].groupby('card_id').median().reset_index()
t9 = t9.rename(columns={'month_lag': 'new_median_month_lag_value'})

In [302]:
month_lag_feature = pd.merge(month_lag_feature, t2, on='card_id', how='left')
month_lag_feature = pd.merge(month_lag_feature, t3, on='card_id', how='left')
month_lag_feature = pd.merge(month_lag_feature, t4, on='card_id', how='left')
month_lag_feature = pd.merge(month_lag_feature, t5, on='card_id', how='left')
month_lag_feature = pd.merge(month_lag_feature, t6, on='card_id', how='left')
month_lag_feature = pd.merge(month_lag_feature, t7, on='card_id', how='left')
month_lag_feature = pd.merge(month_lag_feature, t8, on='card_id', how='left')
month_lag_feature = pd.merge(month_lag_feature, t9, on='card_id', how='left')

In [303]:
month_lag_feature.head()

Unnamed: 0,card_id,new_same_month_lag_max,new_same_month_lag_mean,new_same_month_lag_min,new_same_month_lag_median,new_mean_month_lag_value,new_max_month_lag_value,new_min_month_lag_value,new_median_month_lag_value
0,C_ID_415bb3a509,3,2.0,1,2.0,1.25,2,1,1.0
1,C_ID_ef55cf8d4b,15,11.0,7,11.0,1.318182,2,1,1.0
2,C_ID_241a01e9d9,2,2.0,2,2.0,2.0,2,2,2.0
3,C_ID_a97720321f,10,6.0,2,6.0,1.166667,2,1,1.0
4,C_ID_fb0875cd28,4,3.5,3,3.5,1.428571,2,1,1.0


In [304]:
month_lag_feature.to_csv('data/new_month_lag_feature.csv', index=None)