## 读入所有原始数据并合并

In [None]:

import pandas as pd
import numpy as np

# Load the raw tables.
train = pd.read_csv('input/train.csv')
members = pd.read_csv('input/members.csv')
songs = pd.read_csv('input/songs.csv')
song_extra_info = pd.read_csv('input/song_extra_info.csv')

# Left-join each side table onto the training rows, so every row of `train`
# is kept even when a side table has no matching key.
data = train
for side_table, join_key in (
    (members, 'msno'),
    (songs, 'song_id'),
    (song_extra_info, 'song_id'),
):
    data = data.merge(side_table, on=join_key, how='left')

## 连续型特征

In [None]:
from sklearn.preprocessing import StandardScaler, QuantileTransformer

# Standardization: rescale song_length with StandardScaler.
scalar = StandardScaler()
song_length_norm = pd.DataFrame(
    scalar.fit_transform(data[['song_length']]),
    columns=['song_length_norm'],
)

# Quantile transform: count the rows per song_id, broadcast that count back
# onto every row, then map it through QuantileTransformer.
song_count = data.groupby('song_id').agg(song_count=('msno', 'count'))
song_count_feature = data[['song_id']].merge(song_count, on='song_id', how='left')
# QuantileTransformer draws a random subsample when the input exceeds its
# `subsample` limit; pinning random_state keeps the feature identical across runs.
quant = QuantileTransformer(n_quantiles=10000, random_state=0)
song_count_quant = pd.DataFrame(
    quant.fit_transform(song_count_feature['song_count'].values.reshape(-1, 1)),
    columns=['song_count_quant'],
)

# Discretization: clip bd to [0, 60], treat 0 as missing, then bucket the
# remaining values into bins of width 5 (vectorized floor division).
# NOTE(review): bd is presumably the user's age -- confirm against the data dictionary.
data['bd_dist'] = data['bd'].clip(0, 60).replace(0, np.nan) // 5
# One indicator column per bucket plus one for missing values (dummy_na=True).
bd_onehot = pd.get_dummies(data['bd_dist'], dummy_na=True, prefix='bd', prefix_sep='_')

## 类别类特征

In [None]:
# One-hot encoding of the categorical columns.
onehot_columns = [
    'source_system_tab',
    'source_screen_name',
    'source_type',
    'language',
    'city',
    'gender',
    'registered_via',
]
# 'target' and 'msno' are carried along unencoded; dummy_na=True adds an
# explicit indicator column for missing values of each encoded column.
data_onehot = pd.get_dummies(
    data[onehot_columns + ['target', 'msno']],
    columns=onehot_columns,
    dummy_na=True,
)

In [None]:

# Integer label encoding
from sklearn.preprocessing import LabelEncoder

# One (n_rows, 1) column of integer codes per categorical column. Values are
# cast to str first, so missing values are encoded as their string form.
data_label = [
    LabelEncoder().fit_transform(data[column].astype(str)).reshape(-1, 1)
    for column in onehot_columns
]

# Stack the code columns side by side, in the same order as onehot_columns.
data_label_df = pd.DataFrame(np.hstack(data_label), columns=onehot_columns)

In [None]:
# Feature hashing

from sklearn.feature_extraction import FeatureHasher
from collections import Counter

hash_columns = ['genre_ids', 'artist_name', 'composer', 'lyricist']

feature_size = 16
hash_df_map = {}
for column in hash_columns:
    # Each cell is split on '|' into tokens and reduced to a token -> count
    # mapping; str() means a missing cell becomes a single string token.
    token_counts = [Counter(str(cell).split('|')) for cell in data[column]]
    hasher = FeatureHasher(n_features=feature_size, input_type='dict')
    hashed = hasher.fit_transform(token_counts).toarray()
    # Columns are named <column>_0 ... <column>_<feature_size - 1>.
    hash_df_map[column] = pd.DataFrame(
        hashed,
        columns=['{}_{}'.format(column, i) for i in range(feature_size)],
    )


In [None]:
# 用户，歌曲id编号
from sklearn.preprocessing import LabelEncoder
def encode_label(df, field):
    """Replace ``df[field]`` in place with 1-based integer codes.

    The original values are kept in a new ``<field>_raw`` column. Codes come
    from a LabelEncoder fitted on the column and are shifted by one so that
    0 stays free for use as the embedding padding id.

    Args:
        df: DataFrame to modify in place.
        field: Name of the column to encode.

    Returns:
        None. ``df`` is mutated.
    """
    raw_values = df[field]
    df[field + "_raw"] = raw_values
    # Numbering starts at 1; 0 is reserved for embedding padding.
    df[field] = LabelEncoder().fit_transform(raw_values) + 1
    
# Encode the user id and the song id; each call also adds a <field>_raw column.
for id_field in ('msno', 'song_id'):
    encode_label(data, id_field)


## 划分数据集并保存

In [None]:
# Split by user: (1) hashing the user id makes the split reproducible,
# (2) every row of a given user lands in the same split, which avoids
# leakage between the training and validation sets.
import hashlib
import os

data['global_index'] = data.index

# A user is assigned to validation when md5(user id) mod 10 == 9, otherwise
# to training. The hash is computed once per distinct user and mapped back
# onto the rows instead of being recomputed for every row.
user_is_train = {
    user: int(hashlib.md5(user.encode('utf-8')).hexdigest(), 16) % 10 != 9
    for user in data['msno_raw'].unique()
}
data['is_train'] = data['msno_raw'].map(user_is_train).astype(bool)

# All feature frames share the row index of `data`, so one column-wise concat
# replaces the chain of index merges (join='inner' keeps the merge semantics)
# without building a full intermediate copy per step.
feature_frames = [
    data[['global_index', 'target', 'is_train', 'msno', 'song_id', 'msno_raw', 'song_id_raw']],
    song_length_norm,
    song_count_quant,
    bd_onehot,
    data_onehot.drop(['target', 'msno'], axis=1),
    data_label_df,
    hash_df_map['genre_ids'],
    hash_df_map['artist_name'],
    hash_df_map['composer'],
    hash_df_map['lyricist'],
]
data_merged = pd.concat(feature_frames, axis=1, join='inner')

data_merged = data_merged.fillna(0)
# Keep only the part of each column name before the first '.', which drops
# the '.0' suffix of dummies built from float values, and replace spaces
# with underscores.
data_merged.columns = [c.split('.')[0].replace(' ', '_') for c in data_merged.columns]

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('./preprocess', exist_ok=True)

train_mask = data_merged['is_train'].astype(bool)
output = data_merged.drop(['is_train'], axis=1)
# index=False means the row index is never written, so no reset_index is needed.
output[~train_mask].to_csv('./preprocess/val.csv', index=False)
output[train_mask].to_csv('./preprocess/train.csv', index=False)