# 特征工程-version2
在version1的基础上增加新的特征，针对全部训练集和测试集


In [2]:
import gc
import math
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import roc_auc_score

# Load the four input tables (interactions for train/test plus the member and
# song side tables) from the ../final directory.
train, test, members, songs = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../final/train_svd.csv',
        '../final/test_svd.csv',
        '../final/members_svd.csv',
        '../final/songs_svd.csv',
    )
)

In [57]:
# Remove pandas' display limits so frames are rendered without truncation.
for display_option in ('display.max_columns', 'display.width', 'display.max_rows'):
    pd.set_option(display_option, None)

In [3]:
# Concatenate the (rows, columns) shape tuples of train, test, members and
# songs into one flat tuple for a quick size check.
train.shape + test.shape + members.shape + songs.shape

(5901935, 11, 1475483, 11, 30755, 117, 359966, 99)

In [4]:
# Count missing values per column of the songs table.
songs.isnull().sum()

song_id                 0
song_length            52
artist_name            52
composer               52
lyricist               52
                       ..
artist_component_11    52
artist_component_12    52
artist_component_13    52
artist_component_14    52
artist_component_15    52
Length: 99, dtype: int64

## 1. 把用户表的时间都处理成秒级 Unix 时间戳

In [6]:
# Reload the raw members table; its date columns are parsed as YYYYMMDD
# values in the next cell.
members_old = pd.read_csv('../source/members.csv')

In [7]:
def _yyyymmdd_to_epoch(raw_date):
    """Convert a YYYYMMDD value to seconds since the epoch.

    time.mktime interprets the parsed date in the machine's local timezone,
    so the result depends on where the notebook is run.
    """
    return time.mktime(time.strptime(str(raw_date), '%Y%m%d'))


# Overwrite both date columns of `members` with epoch seconds taken from the
# raw table. The assignment aligns on the row index, so this assumes
# `members` and `members_old` share the same index -- TODO confirm.
for date_col in ('registration_init_time', 'expiration_date'):
    members[date_col] = members_old[date_col].apply(_yyyymmdd_to_epoch)

In [8]:
# Re-check the table shapes after adding the two converted date columns to
# members.
train.shape + test.shape + members.shape + songs.shape

(5901935, 11, 1475483, 11, 30755, 119, 359966, 99)

In [9]:
# Display the two converted date columns (epoch seconds).
members[['registration_init_time','expiration_date']]

Unnamed: 0,registration_init_time,expiration_date
0,1.313770e+09,1.505837e+09
1,1.435421e+09,1.498061e+09
2,1.460304e+09,1.499789e+09
3,1.441469e+09,1.441555e+09
4,1.485360e+09,1.497283e+09
...,...,...
30750,1.473091e+09,1.507133e+09
30751,1.487779e+09,1.488038e+09
30752,1.310400e+09,1.505318e+09
30753,1.427731e+09,1.507219e+09


In [10]:
# Scratch check of the epoch constants used in this notebook: 1471190400.0 and
# 1484236800.0 are the interval bounds used by timestamp_map, and 1488211200
# is the largest registration_init_time.
# members.loc[members['registration_init_time']==1471190400]
# Render 1484236800.0 as a human-readable local-time string.
time.asctime(time.localtime(1484236800.0))

'Fri Jan 13 00:00:00 2017'

In [76]:
# Latest registration time (epoch seconds) among all members.
members['registration_init_time'].max()

1488211200.0

In [12]:
# Show the two rows with the largest registration_init_time.
members.sort_values('registration_init_time', ascending=False)[0:2]

Unnamed: 0,msno,city,bd,gender,registered_via,bd_missing,msno_rec_cnt,membership_days,registration_year,registration_month,...,member_artist_component_8,member_artist_component_9,member_artist_component_10,member_artist_component_11,member_artist_component_12,member_artist_component_13,member_artist_component_14,member_artist_component_15,registration_init_time,expiration_date
87,87,4,27.0,0,3,1.0,5.986452,2141,2011,12,...,0.006971,-0.003312,0.005803,-0.018783,0.001348,0.005004,-0.004167,-0.007972,1488211000.0,1488470000.0
23699,23699,11,23.0,1,0,0.0,4.89784,728,2015,11,...,0.000785,0.014496,-0.001727,-0.006538,0.001144,0.003287,-0.004472,0.000482,1488211000.0,1490112000.0


## 2. 添加新特征，数据集中每个样本所处的相对时间点

In [None]:
# # varience
# def timestamp_map(x):
#     if x < 5901935:
#         x = (x - 0.0) / (5901935.0 - 0.0) * (1484236800.0 - 1471190400.0) + 1471190400.0
#     else:
#         x = (x - 5901935.0) / (7377417.0 - 5901935.0) * (1488211200.0 - 1484236800.0) + 1484236800.0

#     return x

In [14]:
# Stack the (msno, song_id) pairs of train on top of those of test.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
# call and, like append, keeps each frame's original row index.
concat = pd.concat([train[['msno', 'song_id']], test[['msno', 'song_id']]])
# The position of each row in the stacked frame is the raw "timestamp"; it is
# rescaled to epoch seconds by timestamp_map.
concat['timestamp'] = range(len(concat))
 
def timestamp_map(x, max_index=7377417.0, start=1471190400.0, end=1484236800.0):
    """Linearly map a row position onto an interval of epoch seconds.

    Position 0 maps to ``start`` and position ``max_index`` maps to ``end``.
    The defaults reproduce the constants this notebook was written with, so
    ``timestamp_map(x)`` behaves exactly as before.

    Args:
        x: Row position. A scalar, NumPy array or pandas Series all work
            because only arithmetic operators are used.
        max_index: Position that should map to ``end``.
        start: Epoch-seconds value assigned to position 0.
        end: Epoch-seconds value assigned to position ``max_index``.

    Returns:
        The rescaled value(s), of the same container kind as ``x``.
    """
    return x / max_index * (end - start) + start

# timestamp_map is plain arithmetic, so it can be applied to the whole column
# at once instead of calling it once per row.
concat['timestamp'] = timestamp_map(concat['timestamp'])

# Select the timestamp column *before* aggregating: aggregating the whole
# frame also reduces the id columns, which is wasted work and raises on
# non-numeric ids in recent pandas versions.
msno_timestamp = concat.groupby(by='msno')['timestamp']
song_timestamp = concat.groupby(by='song_id')['timestamp']

# Mean listening timestamp per user.
msno_mean = msno_timestamp.mean()
members['msno_timestamp_mean'] = members['msno'].map(msno_mean)

# Standard deviation of the listening timestamp per user. pandas uses the
# sample estimator by default, so users seen only once get NaN.
msno_std = msno_timestamp.std()
members['msno_timestamp_std'] = members['msno'].map(msno_std)

# Same two statistics per song. Ids absent from `concat` map to NaN.
song_mean = song_timestamp.mean()
songs['song_timestamp_mean'] = songs['song_id'].map(song_mean)

song_std = song_timestamp.std()
songs['song_timestamp_std'] = songs['song_id'].map(song_std)

print('Timestamp done.')

Timestamp done.


In [15]:
# Count missing values per column of members after adding the timestamp
# statistics.
members.isnull().sum()

msno                            0
city                            0
bd                              0
gender                          0
registered_via                  0
                             ... 
member_artist_component_15      0
registration_init_time          0
expiration_date                 0
msno_timestamp_mean             0
msno_timestamp_std            932
Length: 121, dtype: int64

In [16]:
# Count missing values per column of songs after adding the timestamp
# statistics.
songs.isnull().sum()

song_id                     0
song_length                52
artist_name                52
composer                   52
lyricist                   52
                        ...  
artist_component_13        52
artist_component_14        52
artist_component_15        52
song_timestamp_mean         0
song_timestamp_std     166766
Length: 101, dtype: int64

## 3. 统计用户的活跃度和歌曲的热度

In [17]:
# Window-based counts: number of rows inspected on each side of a row.
window_sizes = [10, 25, 500, 5000, 10000, 50000]

# Plain arrays of the user and song ids, in the row order of `concat`.
msno_list, song_list = (concat[id_col].values for id_col in ('msno', 'song_id'))

def get_window_cnt(values, idx, window_size):
    """Count occurrences of ``values[idx]`` just before and from ``idx`` on.

    Args:
        values: NumPy array of ids (element-wise ``==`` and ``.sum()`` are
            used, so a plain list does not work).
        idx: Position of the reference element.
        window_size: Number of positions inspected on each side.

    Returns:
        A ``(before, after)`` pair. ``before`` counts matches in
        ``values[idx - window_size:idx]`` (clipped at 0). ``after`` counts
        matches in ``values[idx:idx + window_size]`` (clipped at the end);
        this slice starts at ``idx`` itself, so ``after`` is always >= 1.
    """
    target = values[idx]
    start = max(idx - window_size, 0)
    stop = min(idx + window_size, len(values))
    before = (values[start:idx] == target).sum()
    after = (values[idx:stop] == target).sum()
    return before, after

# For every window size, compute per-row before/after counts for the user id
# and the song id, and store them as four new columns of `concat`.
total_rows = len(concat)
for window_size in window_sizes:
    user_prev = np.zeros(total_rows)
    track_prev = np.zeros(total_rows)
    user_next = np.zeros(total_rows)
    track_next = np.zeros(total_rows)

    for pos in range(total_rows):
        user_prev[pos], user_next[pos] = get_window_cnt(msno_list, pos, window_size)
        track_prev[pos], track_next[pos] = get_window_cnt(song_list, pos, window_size)

    # Column order matches the original: both "before" columns first.
    for name_template, counts in (
        ('msno_%d_before_cnt', user_prev),
        ('song_%d_before_cnt', track_prev),
        ('msno_%d_after_cnt', user_next),
        ('song_%d_after_cnt', track_next),
    ):
        concat[name_template%window_size] = counts

    print('Window size for %d done.'%window_size)


Window size for 10 done.
Window size for 25 done.
Window size for 500 done.
Window size for 5000 done.
Window size for 10000 done.
Window size for 50000 done.


In [18]:
from collections import defaultdict
## till_now count
# For each row, the number of earlier rows with the same user id / song id.
# groupby().cumcount() numbers the rows of each group 0, 1, 2, ... in order of
# appearance, which is exactly that count; it replaces a Python loop over
# every row. `.values` drops the index, which contains duplicate labels
# because `concat` stacks two frames.
# NOTE(review): assumes msno and song_id contain no missing values -- confirm.
msno_till_now_cnt = concat.groupby('msno', sort=False).cumcount().values
song_till_now_cnt = concat.groupby('song_id', sort=False).cumcount().values

concat['msno_till_now_cnt'] = msno_till_now_cnt
concat['song_till_now_cnt'] = song_till_now_cnt
print('Till-now count done.')

# Collect the names of all count features built so far.
features = ['msno_till_now_cnt', 'song_till_now_cnt']
for window_size in window_sizes:
    features += ['msno_%d_before_cnt'%window_size, 'song_%d_before_cnt'%window_size, \
            'msno_%d_after_cnt'%window_size, 'song_%d_after_cnt'%window_size]

# Compress the counts with log(1 + x). Re-running this cell applies the
# transform again, so run it only once per session.
for feat in features:
    concat[feat] = np.log1p(concat[feat])

Till-now count done.


In [19]:
# The timestamp itself is exported together with the count features.
features = ['timestamp', *features]

# Split the stacked feature matrix back into its train and test parts: the
# first len(train) rows belong to train, the remainder to test.
data = concat[features].values
n_train = len(train)
for col_idx, feat_name in enumerate(features):
    train[feat_name] = data[:n_train, col_idx]
    test[feat_name] = data[n_train:, col_idx]

## save to files
for frame, out_path in (
    (train, '../final/train_svd_timestamp.csv'),
    (test, '../final/test_svd_timestamp.csv'),
    (members, '../final/members_svd_timestamp.csv'),
    (songs, '../final/songs_svd_timestamp.csv'),
):
    frame.to_csv(out_path, index=False)

In [20]:
# Non-null count of every train column per source_system_tab value.
train.groupby('source_system_tab').count()

Unnamed: 0_level_0,msno,song_id,source_screen_name,source_type,target,msno_source_system_tab_prob,msno_source_screen_name_prob,msno_source_type_prob,song_embeddings_dot,artist_embeddings_dot,...,msno_5000_after_cnt,song_5000_after_cnt,msno_10000_before_cnt,song_10000_before_cnt,msno_10000_after_cnt,song_10000_after_cnt,msno_50000_before_cnt,song_50000_before_cnt,msno_50000_after_cnt,song_50000_after_cnt
source_system_tab,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1649991,1649991,1649991,1649991,1649991,1649991,1649991,1649991,1649991,1649979,...,1649991,1649991,1649991,1649991,1649991,1649991,1649991,1649991,1649991,1649991
1,131268,131268,131268,131268,131268,131268,131268,131268,131268,131265,...,131268,131268,131268,131268,131268,131268,131268,131268,131268,131268
2,162670,162670,162670,162670,162670,162670,162670,162670,162670,162668,...,162670,162670,162670,162670,162670,162670,162670,162670,162670,162670
3,3128791,3128791,3128791,3128791,3128791,3128791,3128791,3128791,3128791,3128733,...,3128791,3128791,3128791,3128791,3128791,3128791,3128791,3128791,3128791,3128791
4,4804,4804,4804,4804,4804,4804,4804,4804,4804,4804,...,4804,4804,4804,4804,4804,4804,4804,4804,4804,4804
5,357163,357163,357163,357163,357163,357163,357163,357163,357163,357155,...,357163,357163,357163,357163,357163,357163,357163,357163,357163,357163
6,465357,465357,465357,465357,465357,465357,465357,465357,465357,465342,...,465357,465357,465357,465357,465357,465357,465357,465357,465357,465357
7,1891,1891,1891,1891,1891,1891,1891,1891,1891,1891,...,1891,1891,1891,1891,1891,1891,1891,1891,1891,1891


## 5. 填充歌曲表和用户表的缺失值，并保存为 *_nn.csv

In [None]:
# NOTE(review): `song` and `member` are not defined in any cell visible in
# this notebook (the loaded frames are named `songs` and `members`), and some
# column names here ('yy', 'cc_*') differ from the later executed cell --
# confirm which frames and schema this cell is meant to run against.

# Categorical id columns: missing -> 0, then cast to int.
# `frame[col].fillna(..., inplace=True)` operates on a temporary Series and
# leaves the frame untouched under pandas Copy-on-Write, so the result is
# assigned back explicitly.
columns = ['composer', 'lyricist', 'language', 'first_genre_id', 'second_genre_id', 'third_genre_id']
for col in columns:
    song[col] = song[col].fillna(0).astype(int)
# Missing artists get a fresh id one past the current maximum.
song['artist_name'] = song['artist_name'].fillna(np.max(song['artist_name'])+1).astype(int)
song['isrc_missing'] = song['isrc_missing'].astype(int)
# song.to_csv('../songs_gbdt.csv', index=False)

# Members whose timestamp std is missing get the smallest observed std.
member['msno_timestamp_std'] = member['msno_timestamp_std'].fillna(
        np.nanmin(member['msno_timestamp_std']))
member.to_csv('../training/members_nn.csv', index=False)

# Flag songs with no song_length. This must run before song_length is filled
# below.
song['song_id_missing'] = np.isnan(song['song_length'].values) * 1

# Numeric columns: missing -> column mean (NaN ignored).
columns = ['song_length', 'genre_id_cnt', 'artist_song_cnt', 'composer_song_cnt', \
       'lyricist_song_cnt', 'genre_song_cnt', 'song_rec_cnt', \
       'artist_rec_cnt', 'composer_rec_cnt', 'lyricist_rec_cnt', \
       'genre_rec_cnt', 'yy', 'cc_song_cnt', \
       'xxx_song_cnt', 'yy_song_cnt', 'cc_rec_cnt', 'xxx_rec_cnt', \
       'yy_rec_cnt', 'song_timestamp_std', 'artist_cnt', 'lyricist_cnt', \
       'composer_cnt', 'is_featured'] + ['artist_component_%d'%i for i in range(16)]
for col in columns:
    song[col] = song[col].fillna(np.nanmean(song[col]))

song.to_csv('../final/songs_nn.csv', index=False)

## 6. 统计用户侧的聚合特征，以及用户-歌曲属性、歌曲-来源的出现概率特征

In [None]:
# NOTE(review): `tr`, `te` and `song` are not defined in any cell visible in
# this notebook -- confirm which frames this cell is meant to run against.

# Stack the interaction columns of train and test. DataFrame.append was
# removed in pandas 2.0; pd.concat is the equivalent call.
interaction_cols = ['msno', 'song_id', 'source_system_tab', 'source_screen_name', \
        'source_type']
concat = pd.concat([tr[interaction_cols], te[interaction_cols]])
concat = concat.merge(song[['song_id', 'song_length', 'artist_name', 'first_genre_id', \
        'artist_rec_cnt', 'song_rec_cnt', 'artist_song_cnt', 'xxx', 'yy', \
        'language']], on='song_id', how='left')

# Pack the three source columns into one integer, then re-encode it as
# consecutive labels.
concat['source'] = concat['source_system_tab'] * 10000 + concat['source_screen_name'] * 100 + \
        concat['source_type']
from sklearn.preprocessing import LabelEncoder
concat['source'] = LabelEncoder().fit_transform(concat['source'].values)

## member features
# One row per integer msno from 0 to the largest msno present.
mem_add = pd.DataFrame({'msno': range(concat['msno'].max()+1)})

msno_stats = concat[['msno', 'song_length', 'artist_song_cnt', \
        'artist_rec_cnt', 'song_rec_cnt', 'yy']].groupby('msno')

# Per-user mean of song_length and the other song statistics.
# reset_index() turns the group key into a regular column; keeping 'msno' as
# both the index name and a column makes the merge key ambiguous.
data_avg = msno_stats.mean()
data_avg.columns = ['msno_'+i+'_mean' for i in data_avg.columns]
data_avg = data_avg.reset_index()
mem_add = mem_add.merge(data_avg, on='msno', how='left')

# Per-user standard deviation of the same columns.
data_std = msno_stats.std()
data_std.columns = ['msno_'+i+'_std' for i in data_std.columns]
data_std = data_std.reset_index()
mem_add = mem_add.merge(data_std, on='msno', how='left')

# Number of distinct artist_name values per user, log(1 + x)-compressed.
# Mapping through the msno column makes explicit the lookup that was
# previously done implicitly through index alignment.
artist_msno = concat.groupby('msno')['artist_name'].apply(lambda x: len(set(x.values)))
mem_add['artist_msno_cnt'] = mem_add['msno'].map(artist_msno)
mem_add['artist_msno_cnt'] = np.log1p(mem_add['artist_msno_cnt'])

# Per-user share of each language value: the mean of the one-hot columns.
language_dummy = pd.get_dummies(concat['language'])
language_dummy['msno'] = concat['msno'].values
language_prob = language_dummy.groupby('msno').mean()
language_prob.columns = ['msno_language_%d'%i for i in language_prob.columns]
language_prob = language_prob.reset_index()
mem_add = mem_add.merge(language_prob, on='msno', how='left')

mem_add.to_csv('../members_add.csv', index=False)

In [None]:
# train/test features
# NOTE(review): `tr` is not defined in any cell visible in this notebook --
# confirm which frame supplies the train/test split point.
msno_feats = ['artist_name', 'first_genre_id', 'xxx', 'language', 'yy', 'source']

# Number of rows sharing each (msno, feature value) pair. Grouping on both
# columns replaces the arithmetic key `msno * 100000 + value`, which merges
# distinct pairs whenever a feature value reaches 100000. Rows whose feature
# value is missing get NaN instead of raising.
for feat in msno_feats:
    concat['msno_'+feat+'_cnt'] = concat.groupby(['msno', feat])['msno'].transform('count')

# Rows per user, then the share of those rows carrying each feature value.
concat['msno_cnt'] = concat.groupby('msno')['song_id'].transform('count')
for feat in msno_feats:
    concat['msno_'+feat+'_prob'] = concat['msno_'+feat+'_cnt'] / concat['msno_cnt']

# Number of rows sharing each (song_id, source value) pair.
cols = ['source_system_tab', 'source_screen_name', 'source_type']
for source_col in cols:
    concat['song_'+source_col+'_cnt'] = \
            concat.groupby(['song_id', source_col])['msno'].transform('count')

# Rows per song.
concat['song_cnt'] = concat.groupby('song_id')['msno'].transform('count')

# Share of each song's rows that come from each source value.
for source_col in cols:
    concat['song_'+source_col+'_prob'] = concat['song_'+source_col+'_cnt'] / concat['song_cnt']

result = concat[['msno_artist_name_prob', 'msno_first_genre_id_prob', 'msno_xxx_prob', \
        'msno_language_prob', 'msno_yy_prob', 'song_source_system_tab_prob', \
        'song_source_screen_name_prob', 'song_source_type_prob', 'source', 'msno_source_prob']]

# The first len(tr) rows are the train part; the remainder is the test part.
result[:len(tr)].to_csv('../train_part_add.csv', index=False)
result[len(tr):].to_csv('../test_add.csv', index=False)

## 2. 整合各表，保存做完特征工程的数据集

In [1]:
import gc
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import roc_auc_score

# Load the four *_svd_before_after.csv tables from the ../final directory.
train, test, members, songs = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../final/train_svd_before_after.csv',
        '../final/test_svd_before_after.csv',
        '../final/members_svd_before_after.csv',
        '../final/songs_svd_before_after.csv',
    )
)

In [2]:
# Categorical id columns: missing -> 0, then cast to int.
# `frame[col].fillna(..., inplace=True)` operates on a temporary Series and
# leaves the frame untouched under pandas Copy-on-Write, so every fill below
# is assigned back explicitly.
columns = ['composer', 'lyricist', 'language', 'first_genre_id', 'second_genre_id', 'third_genre_id']
for col in columns:
    songs[col] = songs[col].fillna(0).astype(int)
# Missing artists get a fresh id one past the current maximum.
songs['artist_name'] = songs['artist_name'].fillna((songs['artist_name'].max())+1).astype(int)
songs['isrc_missing'] = songs['isrc_missing'].fillna(0).astype(int)
# songs.to_csv('../training/songs_gbdt.csv', index=False)

# Flag songs with no song_length. This must run before song_length is filled
# below.
songs['song_id_missing'] = np.isnan(songs['song_length'].values) * 1

# Numeric columns: missing -> column mean (NaN ignored).
columns = ['song_length', 'genre_id_cnt', 'artist_cnt', 'lyricist_cnt', 'composer_cnt', 'is_featured',\
           'artist_song_cnt', 'composer_song_cnt', 'lyricist_song_cnt', 'genre_song_cnt', 'song_rec_cnt', \
           'artist_rec_cnt', 'composer_rec_cnt', 'lyricist_rec_cnt', 'genre_rec_cnt', 'cn', 'xxx', 'year', \
           'cn_song_cnt', 'xxx_song_cnt', 'year_song_cnt', 'cn_rec_cnt', 'xxx_rec_cnt', 'year_rec_cnt', \
           'repeat_play_chance','plays'] + ['artist_component_%d'%i for i in range(16)]
for col in columns:
    songs[col] = songs[col].fillna(np.nanmean(songs[col]))

songs.to_csv('../final/songs_nn.csv', index=False)

In [3]:
# train.fillna(train['artist_embeddings_dot'].mean(), inplace=True)
# test.fillna(test['artist_embeddings_dot'].mean(), inplace=True)

In [4]:
# Save the data sets. Train and test are written with floats rounded to six
# decimals; members keeps pandas' default float formatting.
for frame, out_path in (
    (train, '../final/train_nn.csv'),
    (test, '../final/test_nn.csv'),
):
    frame.to_csv(out_path, index=False, float_format='%.6f')
members.to_csv('../final/members_nn.csv', index=False)