# 特征工程 - Part2
在Part1的基础上，增加新的特征，最后得到296维特征


In [1]:
import gc
import math
import time
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.metrics import roc_auc_score

# Load the four tables this notebook extends (the intro states they build on Part 1).
train, test, members, songs = (
    pd.read_csv(csv_path)
    for csv_path in (
        './music_val/train_val_svd.csv',
        './music_val/test_val_svd.csv',
        './music_val/members_val_svd.csv',
        './music_val/songs_val_svd.csv',
    )
)

In [None]:
# Remove pandas' display limits (columns, width, rows) by setting each to None.
for display_option in ('display.max_columns', 'display.width', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
# Flattened (rows, columns) pairs for train, test, members and songs, shown as the cell output.
(*train.shape, *test.shape, *members.shape, *songs.shape)

(5901935, 11, 1475483, 11, 30755, 120, 359966, 99)

## 2. 统计用户的活跃度和歌曲的热度

In [None]:
# Stack the (user, song) pairs of train and test, train rows first, and use
# the row position in the stacked frame as a surrogate timestamp.
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame
# (original row labels are kept, exactly as append did).
# NOTE(review): presumably the rows are already in chronological order - confirm.
concat = pd.concat([train[['msno', 'song_id']], test[['msno', 'song_id']]])
concat['timestamp'] = range(len(concat))

## windows_based count
# Window sizes, measured in rows, used for the before/after counts.
window_sizes = [10, 25, 500, 5000, 10000, 50000]

# Plain arrays so the counting loops can slice without pandas overhead.
msno_list = concat['msno'].values
song_list = concat['song_id'].values

def get_window_cnt(values, idx, window_size):
    """Count occurrences of ``values[idx]`` just before and from ``idx`` onwards.

    Args:
        values: array supporting slicing and element-wise ``==``.
        idx: position of the reference element.
        window_size: number of positions to look at on each side.

    Returns:
        A ``(before, after)`` pair. ``before`` counts matches in
        ``values[idx - window_size:idx]`` (clipped at 0, reference element
        excluded). ``after`` counts matches in ``values[idx:idx + window_size]``
        (clipped at the end); this slice starts at ``idx`` itself, so the
        reference element is included in ``after``.
    """
    target = values[idx]
    start = max(idx - window_size, 0)
    stop = min(idx + window_size, len(values))
    preceding = values[start:idx]
    following = values[idx:stop]
    return (preceding == target).sum(), (following == target).sum()

# For every window size, count how often the same user / the same song occurs
# in the rows before and after each row.
for window_size in window_sizes:
    n_rows = len(concat)
    # Column 0 holds the "before" count, column 1 the "after" count.
    msno_counts = np.zeros((n_rows, 2))
    song_counts = np.zeros((n_rows, 2))
    for row in range(n_rows):
        msno_counts[row] = get_window_cnt(msno_list, row, window_size)
        song_counts[row] = get_window_cnt(song_list, row, window_size)
    concat['msno_%d_before_cnt'%window_size] = msno_counts[:, 0]
    concat['song_%d_before_cnt'%window_size] = song_counts[:, 0]
    concat['msno_%d_after_cnt'%window_size] = msno_counts[:, 1]
    concat['song_%d_after_cnt'%window_size] = song_counts[:, 1]

    print('Window size for %d done.'%window_size)

# Running counts: how many earlier rows share this row's user / song.
msno_dict = defaultdict(int)
song_dict = defaultdict(int)

msno_till_now_cnt = np.zeros(len(concat))
song_till_now_cnt = np.zeros(len(concat))
for pos, (user, song) in enumerate(zip(msno_list, song_list)):
    # Record the count seen so far, then include the current row.
    msno_till_now_cnt[pos] = msno_dict[user]
    msno_dict[user] += 1

    song_till_now_cnt[pos] = song_dict[song]
    song_dict[song] += 1

concat['msno_till_now_cnt'] = msno_till_now_cnt
concat['song_till_now_cnt'] = song_till_now_cnt

print('Till-now count done.')

## 3. 添加新特征，数据集中每个样本所处的相对时间点

In [None]:
# 1484236800 s is 2017-01-12 16:00:00 UTC; rendered in the machine's local time zone.
time.ctime(1484236800.0)

In [None]:
# 1471190400 s is 2016-08-14 16:00:00 UTC; rendered in the machine's local time zone.
time.ctime(1471190400.0)

In [None]:
# Compute the relative point in time at which a record sits.
def timestamp_map(x, max_index=7377417.0, start_ts=1471190400.0, end_ts=1484236800.0):
    """Linearly rescale a row position to a Unix timestamp.

    Position 0 maps to ``start_ts`` and position ``max_index`` maps to
    ``end_ts``. The defaults reproduce the original hard-coded mapping;
    7377417 is one less than the combined train + test row count printed
    earlier in this notebook (5901935 + 1475483).

    Args:
        x: row position (scalar, or anything supporting arithmetic such as a
            pandas Series).
        max_index: largest row position, i.e. number of rows minus one.
        start_ts: timestamp assigned to position 0.
        end_ts: timestamp assigned to position ``max_index``.

    Returns:
        The rescaled timestamp(s) as float.

    Raises:
        ValueError: if ``max_index`` is zero, which would divide by zero.
    """
    if max_index == 0:
        raise ValueError('max_index must be non-zero')
    return x / max_index * (end_ts - start_ts) + start_ts
    
# Replace each row position with its rescaled timestamp.
concat['timestamp'] = concat['timestamp'].map(timestamp_map)

# Per-user and per-song statistics of the rescaled listening timestamps.
# The 'timestamp' column is selected before aggregating: the original
# aggregated every column of `concat` four times and kept only this one.
msno_ts = concat.groupby(by='msno')['timestamp']
song_ts = concat.groupby(by='song_id')['timestamp']

# New feature: mean point in time at which each user listened.
# The plain dict lookup is kept on purpose: an id missing from `concat`
# raises KeyError instead of silently producing NaN.
msno_mean = msno_ts.mean().to_dict()
members['msno_timestamp_mean'] = members['msno'].apply(lambda x: msno_mean[x])

# New feature: standard deviation of each user's listening times; a larger
# value means the user's listening is spread over a longer span.
msno_std = msno_ts.std().to_dict()
members['msno_timestamp_std'] = members['msno'].apply(lambda x: msno_std[x])

# New feature: mean point in time at which each song was listened to.
song_mean = song_ts.mean().to_dict()
songs['song_timestamp_mean'] = songs['song_id'].apply(lambda x: song_mean[x])

# New feature: standard deviation of each song's listening times; a larger
# value means the song was listened to over a longer span.
song_std = song_ts.std().to_dict()
songs['song_timestamp_std'] = songs['song_id'].apply(lambda x: song_std[x])

print('Timestamp done.')

# Names of all count features: the two running counts plus four per window size.
features = ['msno_till_now_cnt', 'song_till_now_cnt']
for window_size in window_sizes:
    features.extend(
        template % window_size
        for template in ('msno_%d_before_cnt', 'song_%d_before_cnt',
                         'msno_%d_after_cnt', 'song_%d_after_cnt')
    )
# Compress every count with log(1 + x).
for feat in features:
    concat[feat] = np.log1p(concat[feat].values)

# Copy the engineered columns back onto train and test; train rows come
# first in `concat`, so the split point is len(train).
features = ['timestamp', *features]
data = concat[features].values
n_train = len(train)

for col_idx, name in enumerate(features):
    train[name] = data[:n_train, col_idx]
    test[name] = data[n_train:, col_idx]

# Save the intermediate results.
for frame, out_path in (
    (train, './music_val/train_svd_timestamp.csv'),
    (test, './music_val/test_svd_timestamp.csv'),
    (members, './music_val/members_svd_timestamp.csv'),
    (songs, './music_val/songs_svd_timestamp.csv'),
):
    frame.to_csv(out_path, index=False)


In [None]:
# Flattened (rows, columns) pairs for train, test, members and songs after adding the new columns.
(*train.shape, *test.shape, *members.shape, *songs.shape)

## 4. 填补歌曲缺失值

In [None]:
import gc
import math
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import roc_auc_score

# Reload the intermediate tables written by the timestamp step.
train, test, members, songs = (
    pd.read_csv(csv_path)
    for csv_path in (
        './music_val/train_svd_timestamp.csv',
        './music_val/test_svd_timestamp.csv',
        './music_val/members_svd_timestamp.csv',
        './music_val/songs_svd_timestamp.csv',
    )
)

In [None]:
# Id-like song columns: missing values become 0, then the column is cast to int.
columns = ['composer', 'lyricist', 'language', 'first_genre_id', 'second_genre_id', 'third_genre_id','isrc_missing']
for col in columns:
    # Assign the result back instead of `songs[col].fillna(..., inplace=True)`:
    # the in-place call acts on a selected column and no longer updates
    # `songs` once pandas Copy-on-Write is active (it already warns in 2.x).
    songs[col] = songs[col].fillna(0).astype(int)
# Missing artists get a new code one past the current maximum.
songs['artist_name'] = songs['artist_name'].fillna(np.max(songs['artist_name'])+1).astype(int)

# 1 where song_length is NaN, else 0. Must run before song_length is imputed.
songs['song_id_missing'] = np.isnan(songs['song_length'].values) * 1

# Numeric song columns: missing values are replaced by the column mean.
columns = ['song_length', 'genre_id_cnt', 'artist_song_cnt', 'composer_song_cnt', \
       'lyricist_song_cnt', 'genre_song_cnt', 'song_rec_cnt', \
       'artist_rec_cnt', 'composer_rec_cnt', 'lyricist_rec_cnt', \
       'genre_rec_cnt','cn','xxx','year', 'cn_song_cnt', \
       'xxx_song_cnt', 'year_song_cnt', 'cn_rec_cnt', 'xxx_rec_cnt', \
       'year_rec_cnt', 'song_timestamp_std', 'artist_cnt', 'lyricist_cnt', \
       'composer_cnt', 'is_featured'] + ['artist_component_%d'%i for i in range(16)]
for col in columns:
    # Assign back rather than calling fillna(inplace=True) on the selected
    # column, which does not update `songs` under pandas Copy-on-Write.
    songs[col] = songs[col].fillna(np.nanmean(songs[col]))


# Write the final song table.
songs.to_csv('./music_val/songs_val_nn.csv', index=False)

## 5. 增加用户特征

In [None]:
# members_origin = pd.read_csv('../source/members.csv')
# members['registration_init_time'] = members_origin['registration_init_time'].values
# members['expiration_date'] = members_origin['expiration_date'].values

# members['registration_init_time'] = members['registration_init_time'].apply(lambda x: \
#         time.mktime(time.strptime(str(x),'%Y%m%d')))
# members['expiration_date'] = members['expiration_date'].apply(lambda x: \
#         time.mktime(time.strptime(str(x),'%Y%m%d')))

In [None]:
# Fill undefined (NaN) per-user std values with the smallest observed std.
# Assigned back instead of fillna(inplace=True) on the selected column, which
# does not update `members` under pandas Copy-on-Write.
members['msno_timestamp_std'] = members['msno_timestamp_std'].fillna(
    np.nanmin(members['msno_timestamp_std']))

# Stack the interaction columns of train and test (train first) and attach
# the song attributes needed for the user-level statistics.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
interaction_cols = ['msno', 'song_id', 'source_system_tab', 'source_screen_name', \
        'source_type']
concat = pd.concat([train[interaction_cols], test[interaction_cols]])
concat = concat.merge(songs[['song_id', 'song_length', 'artist_name', 'first_genre_id', \
        'artist_rec_cnt', 'song_rec_cnt', 'artist_song_cnt', 'xxx', 'year', \
        'language']], on='song_id', how='left')

# Fold the three source columns into one number, then re-label the distinct
# combinations as consecutive integers.
concat['source'] = concat['source_system_tab'] * 10000 + concat['source_screen_name'] * 100 + \
        concat['source_type']
from sklearn.preprocessing import LabelEncoder
concat['source'] = LabelEncoder().fit_transform(concat['source'].values)

# member features
# Per-user means over all rows of that user:
# song_length: mean length of the songs the user listened to
# artist_song_cnt: mean of the artists' song counts
# artist_rec_cnt: mean of the artists' record counts
# song_rec_cnt: mean of the songs' record counts
# year: mean of the songs' year

mem_add = pd.DataFrame({'msno': range(concat['msno'].max()+1)})
data_avg = concat[['msno', 'song_length', 'artist_song_cnt', \
        'artist_rec_cnt', 'song_rec_cnt', 'year']].groupby('msno').mean()
data_avg.columns = ['msno_'+i+'_mean' for i in data_avg.columns]
# reset_index() returns a new frame; the original call discarded it. Bind the
# result so 'msno' is a real column for the merge below.
data_avg = data_avg.reset_index()
members = members.merge(data_avg, on='msno', how='left')

# Per-user standard deviations of song_length, artist_song_cnt,
# artist_rec_cnt, song_rec_cnt and year. A small value means the user's rows
# are concentrated on similar values; a large one means they are spread out.

data_std = concat[['msno', 'song_length', 'artist_song_cnt', \
        'artist_rec_cnt', 'song_rec_cnt', 'year']].groupby('msno').std()
data_std.columns = ['msno_'+i+'_std' for i in data_std.columns]
# reset_index() returns a new frame; the original call discarded it. Bind the
# result so 'msno' is a real column for the merge below.
data_std = data_std.reset_index()
members = members.merge(data_std, on='msno', how='left')

# Number of distinct artist_name values per user.
# (The original second description line was not commented out and was a
# SyntaxError.) Group by msno and count the artists each user listened to.
artist_msno = concat[['msno', 'artist_name']].groupby('msno').apply(lambda x: len(set(x['artist_name'].values)))
mem_add['artist_msno_cnt'] = artist_msno
mem_add['artist_msno_cnt'] = np.log1p(mem_add['artist_msno_cnt'])
# NOTE(review): mem_add is never merged into `members` in the visible code, so
# artist_msno_cnt does not reach the saved member table - confirm intent.

# Share of each language value among the rows of every user.
language_dummy = pd.get_dummies(concat['language'])
language_dummy['msno'] = concat['msno'].values
language_prob = language_dummy.groupby('msno').mean()
language_prob.columns = ['msno_language_%d'%i for i in language_prob.columns]
# reset_index() returns a new frame; the original call discarded it. Bind the
# result so 'msno' is a real column for the merge below.
language_prob = language_prob.reset_index()
members = members.merge(language_prob, on='msno', how='left')

# Write the final member table.
members.to_csv('./music_val/members_val_nn.csv', index=False)

## 6. 增加交叉性特征

In [None]:
# Song attributes to cross with the user id.
col = ['artist_name', 'first_genre_id', 'xxx', 'language', 'year', 'source']

# Count how often each (msno, attribute value) pair occurs in the data.
# The original built the key as `msno * 100000 + value`, which is only unique
# while value < 100000 - a bound nothing here checks - and then looked every
# row up in a Python dict. Grouping on both columns needs no such bound.
# It also no longer leaves a helper 'id' column on `concat`.
# NOTE(review): rows whose attribute is NaN fall outside every group; the
# original raised KeyError for them.
for feat in col:
    concat['msno_'+feat+'_cnt'] = concat.groupby(['msno', feat])['song_id'].transform('size')

# Turn the pair counts into shares: pair count / number of rows of that user.
msno_cnt = concat.groupby('msno')['song_id'].count().to_dict()
concat['msno_cnt'] = concat['msno'].map(msno_cnt)
for feat in col:
    pair_count = concat['msno_'+feat+'_cnt']
    concat['msno_'+feat+'_prob'] = pair_count.div(concat['msno_cnt'])

# Count how often each (song_id, source_system_tab), (song_id,
# source_screen_name) and (song_id, source_type) pair occurs in the data.
# Grouping on both columns replaces the original `song_id * 10000 + value`
# key, which is only unique while value < 10000, and the per-row dict lookup.
# NOTE(review): rows whose source value is NaN fall outside every group; the
# original raised KeyError for them.
cols = ['source_system_tab', 'source_screen_name', 'source_type']
for col in cols:
    concat['song_'+col+'_cnt'] = concat.groupby(['song_id', col])['msno'].transform('size')

# Number of rows per song (non-null msno entries), broadcast to every row.
song_cnt = concat.groupby('song_id')['msno'].count().to_dict()
concat['song_cnt'] = concat['song_id'].map(song_cnt)

# Turn the pair counts into shares: pair count / number of rows of that song.
for col in cols:
    pair_count = concat['song_'+col+'_cnt']
    concat['song_'+col+'_prob'] = pair_count.div(concat['song_cnt'])

# Collect the cross features. The explicit copy makes `result` an independent
# frame, so the column assignment below does not trigger pandas'
# SettingWithCopyWarning about writing to a slice of `concat`.
result = concat[['msno_artist_name_prob', 'msno_first_genre_id_prob', 'msno_xxx_prob', \
        'msno_language_prob', 'msno_year_prob', 'song_source_system_tab_prob', \
        'song_source_screen_name_prob', 'song_source_type_prob', 'source', 'msno_source_prob']].copy()

result['source'] = result['source'].astype('category')

# Train rows come first in `concat`; split at len(train).
for col in result.columns:
    train[col] = result[col].values[:len(train)]
    test[col] = result[col].values[len(train):]

In [None]:
# Fill missing artist_embeddings_dot values with the mean of the same table.
# Assigned back instead of fillna(inplace=True) on the selected column, which
# does not update the frame under pandas Copy-on-Write.
train['artist_embeddings_dot'] = train['artist_embeddings_dot'].fillna(
    train['artist_embeddings_dot'].mean())
test['artist_embeddings_dot'] = test['artist_embeddings_dot'].fillna(
    test['artist_embeddings_dot'].mean())

# Write the final training set and validation/test set.
train.to_csv('./music_val/train_val_nn.csv', index=False)
test.to_csv('./music_val/test_val_nn.csv', index=False)