# 特征工程

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import time
import math
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline

# 读取原始数据
train = pd.read_csv('../source/train.csv')
test = pd.read_csv('../source/test.csv')
members = pd.read_csv('../source/members.csv')
songs = pd.read_csv('../source/songs.csv')
song_extra = pd.read_csv('../source/song_extra_info.csv')

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_rows', None)

In [4]:
train.shape + test.shape + songs.shape + song_extra.shape + members.shape

(7377418, 6, 2556790, 6, 2296320, 7, 2295971, 3, 34403, 7)

## 1. 划分训练集和验证集

In [7]:
# 用训练集的后百分之二十数据作为验证集
test = train[math.ceil(train.shape[0] * 0.8):]
train = train[0:math.ceil(train.shape[0] * 0.8)]
test.to_csv('../training/validation.csv', index = False)
train.to_csv('../training/train.csv', index = False)

In [6]:
train.shape + test.shape

(5901935, 6, 1475483, 6)

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import time
import math
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline

train = pd.read_csv('../training/train.csv')
test = pd.read_csv('../training/validation.csv')
members = pd.read_csv('../source/members.csv')
songs = pd.read_csv('../source/songs.csv')
song_extra = pd.read_csv('../source/song_extra_info.csv')

## 2. 数据清理

In [3]:
## 删除songs表中没有出现在训练和测试数据中的歌曲, 剩下419781首songs信息（原数据有2296320首歌曲）
song_id_set = set(train['song_id'].append(test['song_id']))
songs['appeared'] = songs['song_id'].apply(lambda x: True if x in song_id_set else False)
songs = songs[songs.appeared]
songs.drop('appeared', axis=1, inplace=True)

## 删除song_extra表中没有出现在训练和测试数据中的歌曲信息, 剩下419661条songs额外信息（原数据有2295971条信息）
song_extra['appeared'] = song_extra['song_id'].apply(lambda x: True if x in song_id_set else False)
song_extra = song_extra[song_extra.appeared]
song_extra.drop('appeared', axis=1, inplace=True)

## members表中的用户都在训练和测试集中出现，不需要删除
msno_set = set(train['msno'].append(test['msno']))
members['appeared'] = members['msno'].apply(lambda x: True if x in msno_set else False)
members = members[members.appeared]
members.drop('appeared', axis=1, inplace=True)

# 把年龄异常的数据统一归为一类，并作为一个新特征加入
members['bd'] = members['bd'].apply(lambda x: np.nan if x<=0 or x >=80 else x)
members['bd_missing'] = (members['bd'].isnull()) * 1.0

## 3. 缺失值处理

In [4]:
# 填补songs表中缺失值
songs['genre_ids'].fillna('0', inplace=True)
songs['artist_name'].fillna('no_artist_name', inplace=True)
songs['composer'].fillna('no_composer', inplace=True)
songs['lyricist'].fillna('no_lyricist', inplace=True)

# 获取language缺失的唯一一条记录的演唱者，找到演唱者其他歌曲的语言，来填充这个缺失值
songs_temp = songs.loc[songs['language'].isnull()]
language_value = (songs.loc[songs['artist_name'] == (songs_temp['artist_name'].values)[0]]['language'][0:1].values)[0]
songs['language'].fillna(language_value, inplace=True)

# 填补members表中缺失值
members['bd'].fillna(members['bd'].median(), inplace=True)
members['gender'].fillna('Other', inplace=True)

# 填补训练集中缺失值
train['source_system_tab'].fillna(train['source_system_tab'].mode()[0], inplace=True)
train['source_screen_name'].fillna(train['source_screen_name'].mode()[0], inplace=True)
train['source_type'].fillna(train['source_type'].mode()[0], inplace=True)

# 填补验证集中缺失值
test['source_system_tab'].fillna(test['source_system_tab'].mode()[0], inplace=True)
test['source_screen_name'].fillna(test['source_screen_name'].mode()[0], inplace=True)
test['source_type'].fillna(test['source_type'].mode()[0], inplace=True)

## 4. LabelEncodor编码

In [5]:
## 对所有的msno做labelEncoder编码
msno_encoder = LabelEncoder()
msno_encoder.fit(members['msno'].values)
members['msno'] = msno_encoder.transform(members['msno'])
train['msno'] = msno_encoder.transform(train['msno'])
test['msno'] = msno_encoder.transform(test['msno'])

In [6]:
# 对所有的song_id做labelEncoder编码
song_id_encoder = LabelEncoder()
song_id_encoder.fit(train['song_id'].append(test['song_id']))
songs['song_id'] = song_id_encoder.transform(songs['song_id'])
song_extra['song_id'] = song_id_encoder.transform(song_extra['song_id'])
train['song_id'] = song_id_encoder.transform(train['song_id'])
test['song_id'] = song_id_encoder.transform(test['song_id'])

In [7]:
# 对train和test中其他类别型变量做labelEncoder编码
columns = ['source_system_tab', 'source_screen_name', 'source_type']
for column in columns:
    column_encoder = LabelEncoder()
    column_encoder.fit(train[column].append(test[column]))
    train[column] = column_encoder.transform(train[column])
    test[column] = column_encoder.transform(test[column])

In [8]:
# 对members中的city、gender、registered_via做labelEncoder编码
columns = ['city', 'gender', 'registered_via']
for column in columns:
    column_encoder = LabelEncoder()
    column_encoder.fit(members[column])
    members[column] = column_encoder.transform(members[column])

## 5. 处理songs表中字段

In [9]:
# 处理genre_ids, 分割成  first_genre_id
#                      second_genre_id
#                      third_genre_id
# 统计一首歌出现的流派数目 genre_id_cnt

genre_id = np.zeros((len(songs), 4))
for i in range(len(songs)):
    ids = str(songs['genre_ids'].values[i]).split('|')
    if len(ids) > 2:
        genre_id[i, 0] = int(ids[0])
        genre_id[i, 1] = int(ids[1])
        genre_id[i, 2] = int(ids[2])
    elif len(ids) > 1:
        genre_id[i, 0] = int(ids[0])
        genre_id[i, 1] = int(ids[1])
    elif len(ids) == 1:
        genre_id[i, 0] = int(ids[0])
    genre_id[i, 3] = len(ids)
songs['first_genre_id'] = genre_id[:, 0]
songs['second_genre_id'] = genre_id[:, 1]
songs['third_genre_id'] = genre_id[:, 2]
songs['genre_id_cnt'] = genre_id[:, 3]

# 对提取的三个流派做labelEncoder
genre_encoder = LabelEncoder()
genre_encoder.fit((songs.first_genre_id.append(songs.second_genre_id)).append(songs.third_genre_id))
songs['first_genre_id'] = genre_encoder.transform(songs['first_genre_id'])
songs['second_genre_id'] = genre_encoder.transform(songs['second_genre_id'])
songs['third_genre_id'] = genre_encoder.transform(songs['third_genre_id'])
songs.drop('genre_ids', axis=1, inplace=True)

In [10]:
# 统计每首歌的歌手数目
def artist_count(x):
    return x.count('and') + x.count(',') + x.count(' feat') + x.count('&') + 1
songs['artist_cnt'] = songs['artist_name'].apply(artist_count).astype(np.int64)

def get_count(x):
    try:
        return sum(map(x.count, ['|', '/', '\\', ';'])) + 1
    except:
        return 0
# 统计每首歌的作词人数目
songs['lyricist_cnt'] = songs['lyricist'].apply(get_count).astype(np.int64)
# 统计每首歌的作曲人数目
songs['composer_cnt'] = songs['composer'].apply(get_count).astype(np.int64)
# 统计每首歌是独唱还是合唱
songs['is_featured'] = songs['artist_name'].apply(lambda x: 1 if ' feat' in str(x) else 0).astype(np.int64)

In [11]:
# 歌曲的演唱者只保留第一个
def get_first_artist(x):
    if x.count('and') > 0:
        x = x.split('and')[0]
    if x.count(',') > 0:
        x = x.split(',')[0]
    if x.count(' feat') > 0:
        x = x.split(' feat')[0]
    if x.count('&') > 0:
        x = x.split('&')[0]
    return x.strip()
songs['artist_name'] = songs['artist_name'].apply(get_first_artist)

In [12]:
# 歌曲的作词人和作曲人都只保留第一个
def get_first_term(x):
    try:
        if x.count('|') > 0:
            x = x.split('|')[0]
        if x.count('/') > 0:
            x = x.split('/')[0]
        if x.count('\\') > 0:
            x = x.split('\\')[0]
        if x.count(';') > 0:
            x = x.split(';')[0]
        return x.strip()
    except:
        return x

songs['lyricist'] = songs['lyricist'].apply(get_first_term)
songs['composer'] = songs['composer'].apply(get_first_term)

In [13]:
# 对处理后的artist_name、composer、lyricist、language做LabelEncoder        
columns = ['artist_name', 'lyricist', 'composer','language']
for column in columns:
    column_encoder = LabelEncoder()
    column_encoder.fit(songs[column])
    songs[column] = column_encoder.transform(songs[column])

In [14]:
# 按演唱者分类，统计每个演唱者唱过几首歌
artist_song_cnt = songs.groupby(by='artist_name').count()['song_id'].to_dict()
songs['artist_song_cnt'] = songs['artist_name'].apply(lambda x: artist_song_cnt[x] if not np.isnan(x) else np.nan)

# 按作曲人分类，统计每个作曲人编曲数目
composer_song_cnt = songs.groupby(by='composer').count()['song_id'].to_dict()
songs['composer_song_cnt'] = songs['composer'].apply(lambda x: composer_song_cnt[x] if not np.isnan(x) else np.nan)

# 按作词人分类，统计每个作词人作词数目
lyricist_song_cnt = songs.groupby(by='lyricist').count()['song_id'].to_dict()
songs['lyricist_song_cnt'] = songs['lyricist'].apply(lambda x: lyricist_song_cnt[x] if not np.isnan(x) else np.nan)

# 按歌曲风格分类，统计每个流派包含多少首歌
genre_song_cnt = songs.groupby(by='first_genre_id').count()['song_id'].to_dict()
songs['genre_song_cnt'] = songs['first_genre_id'].apply(lambda x: genre_song_cnt[x] if not np.isnan(x) else np.nan)

经过处理后的songs增加以下四个新特征：
artist_song_cnt：  统计每个演唱者唱过几首歌

composer_song_cnt：统计每个作曲人编曲数目

lyricist_song_cnt：统计每个作词人作词数目

genre_song_cnt：   统计每个风格包含多少首歌


## 6. 处理song_extra中字段

In [15]:
# 歌曲出版的国家码 -- cn
# 歌曲出版者码    -- xxx
# 歌曲录制年份    -- year 
data = train[['msno', 'song_id']].append(test[['msno', 'song_id']])
songs = songs.merge(song_extra, on='song_id', how='left')

isrc = songs['isrc']
songs['cn'] = isrc.str.slice(0, 2)
songs['xxx'] = isrc.str.slice(2, 5)
songs['year'] = isrc.str.slice(5, 7).astype(float)
# 歌曲录制的年份转换为4位
songs['year'] = songs['year'].apply(lambda x: 2000+x if x < 18 else 1900+x)

# 增加一个新特征，代表是否isrc缺失
songs['isrc_missing'] = (songs['cn'].isnull()) * 1.0
songs['cn'] = LabelEncoder().fit_transform(songs['cn'].fillna('None'))
songs['xxx'] = LabelEncoder().fit_transform(songs['xxx'].fillna('None'))
songs['year'].fillna(songs['year'].median(),inplace=True)

# 按国家码分类，统计每个国家码的歌曲数目
song_cn_cnt = songs.groupby(by='cn').count()['song_id'].to_dict()
songs['cn_song_cnt'] = songs['cn'].apply(lambda x: song_cn_cnt[x] if not np.isnan(x) else None)

# 按出版码分类，统计每个出版者出版的歌曲数目
song_xxx_cnt = songs.groupby(by='xxx').count()['song_id'].to_dict()
songs['xxx_song_cnt'] = songs['xxx'].apply(lambda x: song_xxx_cnt[x] if not np.isnan(x) else None)

# 按歌曲年份分类，统计每个年份录制的歌曲数目
song_year_cnt = songs.groupby(by='year').count()['song_id'].to_dict()
songs['year_song_cnt'] = songs['year'].apply(lambda x: song_year_cnt[x] if not np.isnan(x) else None)

data = data.merge(songs, on='song_id', how='left')
# 按国家码分类，统计每个国家码下有多少用户收听
song_cn_cnt = data.groupby(by='cn').count()['msno'].to_dict()
songs['cn_rec_cnt'] = songs['cn'].apply(lambda x: song_cn_cnt[x] if not np.isnan(x) else None)

# 按出版码分类，统计每个出版码下有多少用户收听
song_xxx_cnt = data.groupby(by='xxx').count()['msno'].to_dict()
songs['xxx_rec_cnt'] = songs['xxx'].apply(lambda x: song_xxx_cnt[x] if not np.isnan(x) else None)

# 按歌曲年份分类，统计每个年份下有多少用户收听
song_year_cnt = data.groupby(by='year').count()['msno'].to_dict()
songs['year_rec_cnt'] = songs['year'].apply(lambda x: song_year_cnt[x] if not np.isnan(x) else None)

features = ['cn_song_cnt', 'xxx_song_cnt', 'year_song_cnt', 'cn_rec_cnt', 'xxx_rec_cnt', 'year_rec_cnt']
for feat in features:
    songs[feat] = np.log1p(songs[feat])

songs.drop(['name', 'isrc'], axis=1, inplace=True)

In [16]:
# 按歌曲id分类，统计每首歌被播放的用户数目
song_rec_cnt = data.groupby(by='song_id').count()['msno'].to_dict()
songs['song_rec_cnt'] = songs['song_id'].apply(lambda x: song_rec_cnt[x] if not np.isnan(x) else np.nan)

# 按演唱者分类，统计每个演唱者被收听过的用户数目
artist_rec_cnt = data.groupby(by='artist_name').count()['msno'].to_dict()
songs['artist_rec_cnt'] = songs['artist_name'].apply(lambda x: artist_rec_cnt[x] if not np.isnan(x) else np.nan)

# 按作曲人分类，统计每个作曲人被收听过的用户数目
composer_rec_cnt = data.groupby(by='composer').count()['msno'].to_dict()
songs['composer_rec_cnt'] = songs['composer'].apply(lambda x: composer_rec_cnt[x] if not np.isnan(x) else np.nan)

# 按作词人分类，统计每个作词人被收听的用户数目
lyricist_rec_cnt = data.groupby(by='lyricist').count()['msno'].to_dict()
songs['lyricist_rec_cnt'] = songs['lyricist'].apply(lambda x: lyricist_rec_cnt[x] if not np.isnan(x) else np.nan)

# 按first_genre_id流派分类，统计每个风格被收听的用户数目
genre_rec_cnt = data.groupby(by='first_genre_id').count()['msno'].to_dict()
songs['genre_rec_cnt'] = songs['first_genre_id'].apply(lambda x: genre_rec_cnt[x] if not np.isnan(x) else np.nan)


In [17]:
# 对以下特征做log变换
features = ['song_length', 'song_rec_cnt', 'artist_song_cnt', 'composer_song_cnt', \
        'lyricist_song_cnt', 'genre_song_cnt', 'artist_rec_cnt', \
        'composer_rec_cnt', 'lyricist_rec_cnt', 'genre_rec_cnt']
for feat in features:
    songs[feat] = np.log1p(songs[feat])

In [18]:
# count：训练集里每首歌曲被听取的次数，即该歌曲在数据集中出现的总次数
# mean：被重复听取的概率（sum(target=1)/count，即一歌曲被用户第二次播放/该歌曲在数据集中出现的总次数
train_merge_songs = train[['msno', 'song_id','target']].append(test[['msno', 'song_id','target']]).merge(songs,on='song_id')
song_mean_count = train_merge_songs[['song_id', 'target']].groupby(['song_id']).agg(['mean', 'count'])
song_mean_count.reset_index(inplace=True)
song_mean_count.columns = list(map(''.join, song_mean_count.columns.values))
song_mean_count.columns = ['song_id', 'repeat_play_chance', 'plays']  #rename columns
songs = songs.merge(song_mean_count, on='song_id',how='right') # merge song data with computed values

## 7. 处理members中字段

In [19]:
data = train[['msno', 'song_id']].append(test[['msno', 'song_id']])

# 统计一个用户听过多少首歌(训练集+测试集)
mem_rec_cnt = data.groupby(by='msno').count()['song_id'].to_dict()
members['msno_rec_cnt'] = members['msno'].apply(lambda x: mem_rec_cnt[x])

## log1p变换
features = ['msno_rec_cnt']
for feat in features:
    members[feat] = np.log1p(members[feat])

In [20]:
# 到期时间减去注册时间，得到用户的会员时间，作为新特征加入members
members['expiration_date'] = members['expiration_date'].astype(np.str)
members['registration_init_time'] = members['registration_init_time'].astype(np.str)
members['membership_days'] = pd.to_datetime(members['expiration_date']).subtract(pd.to_datetime(members['registration_init_time'])).dt.days.astype(int)

# 将registration_init_time拆分成年、月、日，并作为新特征加入到menmbers中
members['registration_year'] = pd.to_datetime(members['registration_init_time']).dt.year
members['registration_month'] = pd.to_datetime(members['registration_init_time']).dt.month
members['registration_day'] = pd.to_datetime(members['registration_init_time']).dt.day

# 将expiration_date拆分成年、月、日，并作为新特征加入到members中
members['expiration_year'] = pd.to_datetime(members['expiration_date']).dt.year
members['expiration_month'] = pd.to_datetime(members['expiration_date']).dt.month
members['expiration_day'] = pd.to_datetime(members['expiration_date']).dt.day

members['registration_init_time'] = members['registration_init_time'].apply(lambda x: \
        time.mktime(time.strptime(str(x),'%Y%m%d')))
members['expiration_date'] = members['expiration_date'].apply(lambda x: \
        time.mktime(time.strptime(str(x),'%Y%m%d')))
members = members.drop(['registration_init_time','expiration_date'], axis=1)


In [21]:
# 对train和test中的相关source的三个字段做独热编码, 并计算相关概率
dummy_feat = ['source_system_tab', 'source_screen_name', 'source_type']
concat_train_test = train.drop('target', axis=1).append(test.drop('target', axis=1))

for feat in dummy_feat:
    dummies = pd.get_dummies(concat_train_test[feat])
    dummies.columns = ['msno_%s_'%feat + '%s'%col for col in dummies.columns]
    dummies['msno'] = concat_train_test['msno'].values
    dummies = dummies.groupby('msno').mean()
    dummies.reset_index()
    members = members.merge(dummies, on='msno', how='left')

In [22]:
# 保存中间过程文件， 方便调试
train.to_csv('../training/train_validation_r1.csv', index=False)
test.to_csv('../training/test_validaton_r1.csv', index=False)
members.to_csv('../training/members_validation_r1.csv', index=False)
songs.to_csv('../training/songs_validation_r1.csv', index=False)

## 8. 增加后验概率

In [23]:
# 把用户的所有特征合并到训练集和验证集中
train_temp = train.merge(members, on='msno', how='left')
test_temp = test.merge(members, on='msno', how='left')

# 查找训练集和验证集中包含‘source_system_tab’字符的特征
train_source_system_tab = train_temp[[col for col in train_temp.columns if 'source_system_tab' in col]]
test_source_system_tab =  test_temp[[col for col in test_temp.columns if 'source_system_tab' in col]]

# 查找训练集和验证集中包含‘source_screen_name’字符的特征
train_source_screen_name = train_temp[[col for col in train_temp.columns if 'source_screen_name' in col]]
test_source_screen_name =  test_temp[[col for col in test_temp.columns if 'source_screen_name' in col]]

# 查找训练集和验证集中包含‘source_type’字符的特征
train_source_type = train_temp[[col for col in train_temp.columns if 'source_type' in col]]
test_source_type =  test_temp[[col for col in test_temp.columns if 'source_type' in col]]

In [24]:
train['msno_source_system_tab_prob'] = train_source_system_tab.apply(lambda x: \
                                       x['msno_source_system_tab_%d'%x['source_system_tab']], axis=1)
test['msno_source_system_tab_prob'] = test_source_system_tab.apply(lambda x: \
                                      x['msno_source_system_tab_%d'%x['source_system_tab']], axis=1)

train['msno_source_screen_name_prob'] = train_source_screen_name.apply(lambda x: \
        x['msno_source_screen_name_%d'%x['source_screen_name']], axis=1)
test['msno_source_screen_name_prob'] = test_source_screen_name.apply(lambda x: \
        x['msno_source_screen_name_%d'%x['source_screen_name']], axis=1)
                                                                       
train['msno_source_type_prob'] = train_source_type.apply(lambda x: \
        x['msno_source_type_%d'%x['source_type']], axis=1)
test['msno_source_type_prob'] = test_source_type.apply(lambda x: \
        x['msno_source_type_%d'%x['source_type']], axis=1)

In [26]:
train.isnull().sum()

msno                            0
song_id                         0
source_system_tab               0
source_screen_name              0
source_type                     0
target                          0
msno_source_system_tab_prob     0
msno_source_screen_name_prob    0
msno_source_type_prob           0
dtype: int64

In [27]:
# 保存包含后验概率的文件
train.to_csv('../training/train_validation_prod.csv', index=False)
test.to_csv('../training/test_validation_prob.csv', index=False)
members.to_csv('../training/members_validation_prob.csv', index=False)
songs.to_csv('../training/songs_validation_prob.csv', index=False)

## 9. 用户-歌曲关系矩阵分解，用户-歌手关系矩阵分解

In [28]:
import numpy as np
import pandas as pd
import seaborn as sns
import time
import math
from scipy import sparse
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
%matplotlib inline

train = pd.read_csv('../training/train_validation_prod.csv')
test = pd.read_csv('../training/test_validation_prob.csv')
members = pd.read_csv('../training/members_validation_prob.csv')
songs = pd.read_csv('../training/songs_validation_prob.csv')

In [29]:
concat = train[['msno', 'song_id']].append(test[['msno', 'song_id']])
member_cnt = concat['msno'].max() + 1
song_cnt = concat['song_id'].max() + 1
artist_cnt = int(songs['artist_name'].max() + 1)
print(len(concat))

7377418


In [30]:
# 设计用户-歌曲关系矩阵，并做SVD分解成三个矩阵
n_component = 48

data = np.ones(len(concat))
msno = concat['msno'].values
song_id = concat['song_id'].values

rating = sparse.coo_matrix((data, (msno, song_id)))
rating = (rating > 0) * 1.0

[u, s, vt] = svds(rating, k=n_component)
print(s[::-1])
s_song = np.diag(s[::-1])

# 保留跟用户相关的48维特征，存入members表中
members_topics = pd.DataFrame(u[:, ::-1])
members_topics.columns = ['member_component_%d'%i for i in range(n_component)]
members_topics['msno'] = range(member_cnt)
members = members.merge(members_topics, on='msno', how='right')

# 保留跟歌曲相关的48维特征，存入songs表中
song_topics = pd.DataFrame(vt.transpose()[:, ::-1])
song_topics.columns = ['song_component_%d'%i for i in range(n_component)]
song_topics['song_id'] = range(song_cnt)
songs = songs.merge(song_topics, on='song_id', how='right')

# 设计用户-歌曲关系矩阵，并做SVD分解成三个矩阵 
n_component = 16

concat = concat.merge(songs[['song_id', 'artist_name']], on='song_id', how='left')
concat = concat[concat['artist_name'] >= 0]
msno = concat['msno'].values
artist = concat['artist_name'].values.astype(int)

print(len(concat))
data = np.ones(len(concat))
rating_tmp = sparse.coo_matrix((data, (msno, artist)))

rating = np.log1p(rating_tmp) * 0.3 + (rating_tmp > 0) * 1.0

[u, s, vt] = svds(rating, k=n_component)
print(s[::-1])
s_artist = np.diag(s[::-1])

# 保留跟用户相关的16维特征，存入members表中
members_topics = pd.DataFrame(u[:, ::-1])
members_topics.columns = ['member_artist_component_%d'%i for i in range(n_component)]
members_topics['msno'] = range(member_cnt)
members = members.merge(members_topics, on='msno', how='left')

# 保留跟artist name相关的16维特征，存入songs表中
artist_topics = pd.DataFrame(vt.transpose()[:, ::-1])
artist_topics.columns = ['artist_component_%d'%i for i in range(n_component)]
artist_topics['artist_name'] = range(artist_cnt)
songs = songs.merge(artist_topics, on='artist_name', how='left')

[929.19644441 348.80981297 309.40275499 252.8020833  235.66103789
 184.89734987 178.8854274  163.36788383 158.45152867 151.20234554
 147.54990949 146.69235095 141.14892669 132.07849752 129.90593115
 125.32091035 122.28441182 119.65531332 117.63870037 115.73739365
 113.58651385 112.08390834 111.2969255  109.59026882 108.62116088
 106.05728495 105.57023407 102.20677814 101.28490626 100.46536012
  97.84094794  96.92911592  96.04246875  95.87283229  93.84138743
  92.73973012  91.85405027  90.3989575   89.84799528  89.29548723
  88.31527645  88.11258867  86.453424    86.01874769  84.99338776
  84.59675956  83.97524489  83.40244589]
7377304
[1231.90585798  405.63937345  303.7062179   278.05664389  243.94539539
  214.25753678  175.36567621  172.69229575  157.80083028  153.16554079
  149.3448921   146.934055    138.7035043   135.44997276  132.95733285
  126.14460143]


In [31]:
# 增加新特征
members = members.sort_values(by='msno')
songs = songs.sort_values(by='song_id')

mem_cols = ['member_component_%d'%i for i in range(48)]
song_cols = ['song_component_%d'%i for i in range(48)]

member_embeddings = members[mem_cols].values
song_embeddings = songs[song_cols].values

mem_cols = ['member_artist_component_%d'%i for i in range(16)]
song_cols = ['artist_component_%d'%i for i in range(16)]

member_artist_embeddings = members[mem_cols].values
song_artist_embeddings = songs[song_cols].values

train_dot = np.zeros((len(train), 2))
test_dot = np.zeros((len(test), 2))

for i in range(len(train)):
    msno_idx = train['msno'].values[i]
    song_idx = train['song_id'].values[i]
    
    train_dot[i, 0] = np.dot(member_embeddings[msno_idx], np.dot(s_song, song_embeddings[song_idx]))
    train_dot[i, 1] = np.dot(member_artist_embeddings[msno_idx], np.dot(s_artist, song_artist_embeddings[song_idx]))

for i in range(len(test)):
    msno_idx = test['msno'].values[i]
    song_idx = test['song_id'].values[i]
    
    test_dot[i, 0] = np.dot(member_embeddings[msno_idx], np.dot(s_song, song_embeddings[song_idx]))
    test_dot[i, 1] = np.dot(member_artist_embeddings[msno_idx], np.dot(s_artist, song_artist_embeddings[song_idx]))

train['song_embeddings_dot'] = train_dot[:, 0]
train['artist_embeddings_dot'] = train_dot[:, 1]

test['song_embeddings_dot'] = test_dot[:, 0]
test['artist_embeddings_dot'] = test_dot[:, 1]

In [32]:
# 保存SVD结果的文件
train.to_csv('../training/train_validation_svd.csv', index=False)
test.to_csv('../training/test_validation_svd.csv', index=False)
members.to_csv('../training/members_validation_svd.csv', index=False)
songs.to_csv('../training/songs_validation_svd.csv', index=False)

## 10. 整合各表，保存做完特征工程的数据集

In [2]:
import os
import numpy as np
import pandas as pd

train = pd.read_csv('../training/train_validation_svd.csv')
test = pd.read_csv('../training/test_validation_svd.csv')
members = pd.read_csv('../training/members_validation_svd.csv')
songs = pd.read_csv('../training/songs_validation_svd.csv')

In [21]:
# 保存数据集
train.to_csv('../training/train_final.csv', index=False, float_format='%.6f')
test.to_csv('../training/test_final.csv', index=False, float_format='%.6f')
members.to_csv('../training/members_gbdt.csv', index=False)

In [9]:
columns = ['composer', 'lyricist', 'language', 'first_genre_id', 'second_genre_id', 'third_genre_id']
for col in columns:
    songs[col].fillna(0, inplace=True)
    songs[col] = songs[col].astype(int)
songs['artist_name'].fillna((songs['artist_name'].max())+1, inplace=True)
songs['artist_name'] = songs['artist_name'].astype(int)
songs['isrc_missing'].fillna(0, inplace=True)
songs['isrc_missing'] = songs['isrc_missing'].astype(int)
songs.to_csv('../training/songs_gbdt.csv', index=False)

In [10]:
songs['song_id_missing'] = np.isnan(songs['song_length'].values) * 1

columns = ['song_length', 'genre_id_cnt', 'artist_cnt', 'lyricist_cnt', 'composer_cnt', 'is_featured',\
           'artist_song_cnt', 'composer_song_cnt', 'lyricist_song_cnt', 'genre_song_cnt', 'song_rec_cnt', \
           'artist_rec_cnt', 'composer_rec_cnt', 'lyricist_rec_cnt', 'genre_rec_cnt', 'cn', 'xxx', 'year', \
           'cn_song_cnt', 'xxx_song_cnt', 'year_song_cnt', 'cn_rec_cnt', 'xxx_rec_cnt', 'year_rec_cnt', \
           'repeat_play_chance','plays'] + ['artist_component_%d'%i for i in range(16)]
for col in columns:
    songs[col].fillna(np.nanmean(songs[col]), inplace=True)

songs.to_csv('../training/songs_validation_nn.csv', index=False)