In [1]:
import catboost
from catboost import CatBoostClassifier, Pool, cv
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, RobustScaler
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import SGDClassifier
from tqdm import tnrange,tqdm_notebook
import gc
import warnings
from scipy import sparse
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas deprecation and chained-assignment warnings
# that later cells would otherwise raise - consider narrowing the filter.
warnings.filterwarnings("ignore")

# 原始数据

In [2]:
%%time
# Load the labelled (train) and unlabelled (test) user ids and stack them in one frame.
# Test rows have no age_group, so that column is NaN for them after the concat.
age_train = pd.read_csv('../age_train.csv', names=['uId', 'age_group'], header=None)
age_test = pd.read_csv('../age_test.csv', names=['uId'], header=None)
stacked_ids = pd.concat([age_train, age_test], axis=0, sort=True)
# drop=True discards the old per-file index instead of materialising it as a column.
data = stacked_ids.reset_index(drop=True)
del age_train, age_test, stacked_ids
gc.collect()

CPU times: user 477 ms, sys: 186 ms, total: 663 ms
Wall time: 662 ms


In [3]:
%%time
# Read the device profile and the behaviour counters, then attach both to every user row.
basic_cols = ['uId','gender','city','prodName','ramCapacity','ramLeftRation','romCapacity',
              'romLeftRation','color','fontSize','ct','carrier','os']
behavior_cols = ['uId','bootTimes','AFuncTimes','BFuncTimes','CFuncTimes','DFuncTimes',
                 'EFuncTimes','FFuncTimes','FFuncSum']
user_basic_info = pd.read_csv('../user_basic_info.csv', names=basic_cols, header=None)
user_behavior_info = pd.read_csv('../user_behavior_info.csv', names=behavior_cols, header=None)
# merge() with no arguments is an inner join on the shared column(s).
data = data.merge(user_basic_info).merge(user_behavior_info)
del user_basic_info, user_behavior_info, basic_cols, behavior_cols
gc.collect()

CPU times: user 6.67 s, sys: 1.55 s, total: 8.23 s
Wall time: 8.22 s


# 特征工程

In [4]:
%%time
# Absolute free RAM / ROM: capacity multiplied by the reported "left ratio".
# NOTE(review): these products use the ratios *before* they are capped at 1 below,
# exactly as the original cell did - confirm that is intended.
data['ramLeftCapacity'] = data['ramCapacity'] * data['ramLeftRation']
data['romLeftCapacity'] = data['romCapacity'] * data['romLeftRation']

# Make the A..F function counters non-negative whole numbers.
for func_prefix in ['A','B','C','D','E','F']:
    func_col = '{}FuncTimes'.format(func_prefix)
    data[func_col] = data[func_col].abs().round()

# Cap the ratios at 1. A single .loc assignment is used instead of the chained
# data[col][mask] = 1 form, which pandas does not guarantee to write back to `data`.
for ratio_col in ['romLeftRation', 'ramLeftRation']:
    data.loc[data[ratio_col] > 1, ratio_col] = 1

CPU times: user 469 ms, sys: 246 ms, total: 715 ms
Wall time: 689 ms


In [5]:
%%time
# Reduce the raw colour names to a short colour family and a warm/cold tone.
# The literal list below is matched to data.color.unique() purely by position.
# NOTE(review): it is only valid for the exact dataset (and row order) it was written
# for; any change in the input silently mislabels colours or raises a length mismatch.
tmp_color = pd.DataFrame(list(data.color.unique()))
tmp_color.columns=['color']
tmp_color['color_short'] = ['银','黑','蓝','银','金','极光','蓝','金','金','紫','黑','银','蓝','黑白','金','金','金','金','紫','灰','黑','蓝','红','蓝','灰','红','银','黑','白','黑','金','备件颜色','金','灰','白','金','黑','银','青','金','金','灰','灰','黑','蓝','蓝','白','蓝','红','紫','红','蓝','棕','粉','金','灰', '红','黑','灰','灰','紫','金','白','粉','青','金','蓝','灰','绿','银','金','银','银','灰','银','白','白','蓝','红','白','白','蓝','蓝','白','黑','白','极光','红','金','白','紫','蓝','蓝','金','金','蓝','白','红','银白','金','蓝','黑','蓝','粉','紫','灰','蓝','灰','红','黑','金', '红','黑','白','蓝','金','红','灰','灰','蓝','银白','灰','紫','灰','黑','蓝','银','粉','蓝','粉', '黄','橘','红','紫','黄','紫']
# The 18 distinct short colours, in order of first appearance, get one tone label each
# (again matched by position).
tmp_color_warmcold = pd.DataFrame(list(tmp_color.color_short.unique()))
tmp_color_warmcold['warm_cold'] = ['冷','冷','冷','暖','暖','暖','冷','冷','暖','暖','未知','冷','暖','暖','冷','暖','暖','暖']
tmp_color_warmcold.columns=['color_short','warm_cold']
# Attach the tone to each raw colour, then both new columns to every user row.
tmp_color = tmp_color.merge(tmp_color_warmcold, on='color_short', how='left')
data = data.merge(tmp_color, on='color', how='left')
del tmp_color, tmp_color_warmcold
gc.collect()

CPU times: user 1.35 s, sys: 989 ms, total: 2.34 s
Wall time: 2.34 s


In [6]:
%%time
# Bin ROM capacity into nominal storage tiers and, per phone model (prodName), compute
# the min / max RAM and ROM capacity plus each device's ratio to those extremes.
bins = [4, 12, 24, 48, 96, 192, 384]
group_names = ['rom_8G', 'rom_16G', 'rom_32G', 'rom_64G', 'rom_128G', 'rom_256G']

# The original cell dropped 'rom_category' right after creating it and left a second
# copy of 'romCapacity' in the concatenated frame. Later cells read
# data['rom_category'], so the tier is kept here and no duplicate column is produced.
# Values outside the bin edges become NaN; the column is stored as plain objects
# (labels / NaN) rather than as a pandas Categorical.
data['rom_category'] = pd.cut(data['romCapacity'], bins, labels=group_names).astype(object)

for kind in ['rom', 'ram']:
    capacity_col = '{}Capacity'.format(kind)
    per_model = data.groupby('prodName')[capacity_col]
    # transform() broadcasts the per-model statistic back to every row; rows whose
    # prodName is missing get NaN, the same as a left merge on prodName would give.
    data['{}_max'.format(capacity_col)] = per_model.transform('max')
    data['{}_min'.format(capacity_col)] = per_model.transform('min')
    data['{}_max_ratio'.format(kind)] = data[capacity_col] / data['{}_max'.format(capacity_col)]
    data['{}_min_ratio'.format(kind)] = data[capacity_col] / data['{}_min'.format(capacity_col)]
del per_model
gc.collect()

CPU times: user 1.94 s, sys: 852 ms, total: 2.79 s
Wall time: 2.71 s


In [7]:
%%time
# Average boot count and font size per phone model, attached to every user of that model.
for source_col, mean_col in [('bootTimes', 'bootTimes_mean'), ('fontSize', 'fontSize_mean')]:
    model_mean = data.groupby(['prodName'])[source_col].mean().reset_index()
    model_mean.columns = ['prodName', mean_col]
    data = data.merge(model_mean, on='prodName', how='left')
del model_mean, source_col, mean_col

CPU times: user 5.22 s, sys: 5.53 s, total: 10.8 s
Wall time: 10.7 s


In [8]:
%%time
# Stream the app-usage log in chunks and keep three partial aggregates per chunk.
# A key can occur in more than one chunk, so the cells that consume these lists
# aggregate them once more.
user_app_usage = pd.read_csv('../user_app_usage.csv',chunksize=500000,names=['uId','appId','duration','times','use_date'])
dist = []    # rows of [uId, use_date, duration sum, times sum]
dist1 = []   # rows of [uId, appId, duration sum, times sum, number of log rows]
dist2 = []   # rows of [uId, use_date, number of log rows]
for chunk in tqdm_notebook(user_app_usage):
    # Several columns are selected with a list: the tuple-style ['duration','times']
    # selection on a GroupBy is deprecated and rejected by current pandas.
    tmp = chunk.groupby(['uId','use_date'])[['duration','times']].sum().reset_index().values
    tmp1 = chunk.groupby(['uId','appId']).agg({'duration': 'sum','times': 'sum', 'use_date': 'count'}).reset_index().values
    tmp2 = chunk.groupby(['uId','use_date'])['appId'].count().reset_index().values

    dist.extend(tmp)
    dist1.extend(tmp1)
    dist2.extend(tmp2)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 10min 8s, sys: 24.2 s, total: 10min 33s
Wall time: 11min 9s


In [9]:
%%time
# Weekly profile: total usage duration and launch count per user for each day of the week.
user_app_usage_dt = pd.DataFrame(dist)
user_app_usage_dt.columns=['uId','use_date','date_duration','date_times']
usage_cols = ['date_duration','date_times']
# First merge the per-chunk partial sums of the same (user, date). Columns are selected
# with a list because tuple-style selection on a GroupBy is rejected by current pandas.
user_app_usage_res = user_app_usage_dt.groupby(['uId','use_date'])[usage_cols].sum().reset_index()
# Unparseable dates become NaT; their weekday is NaN and groupby drops those rows.
user_app_usage_res['use_date'] = pd.to_datetime(user_app_usage_res['use_date'], errors='coerce')
user_app_usage_res['use_date'] = user_app_usage_res.use_date.dt.weekday
user_app_usage_res = user_app_usage_res.rename(columns = {'use_date':'use_date_weekday'})
user_app_usage_last = user_app_usage_res.groupby(['uId','use_date_weekday'])[usage_cols].sum().reset_index()
tmp = user_app_usage_last.pivot(index='uId',values=usage_cols,columns='use_date_weekday')
tmp = tmp.reset_index()
# dt.weekday numbers Monday as 0, so the sorted pivot columns run Monday..Sunday.
# NOTE(review): the flat names assume all seven weekdays occur in the data.
tmp.columns = ['uId','date_duration_Monday','date_duration_Tuesday','date_duration_Wednesday','date_duration_Thursday','date_duration_Friday','date_duration_Saturday','date_duration_Sunday',
              'date_times_Monday','date_times_Tuesday','date_times_Wednesday','date_times_Thursday','date_times_Friday','date_times_Saturday','date_times_Sunday']
data = data.merge(tmp, on='uId', how='left')
del user_app_usage_dt, user_app_usage_res, user_app_usage_last, tmp, usage_cols
gc.collect()

CPU times: user 1min 56s, sys: 30.4 s, total: 2min 27s
Wall time: 2min 26s


In [10]:
%%time
# Per-calendar-day usage duration and launch count for every user (one column per date).
user_app_usage_dt = pd.DataFrame(dist)
user_app_usage_dt.columns=['uId','use_date','date_duration','date_times']
# Merge the per-chunk partial sums. Columns are selected with a list because
# tuple-style selection on a GroupBy is rejected by current pandas.
user_app_usage_dt = user_app_usage_dt.groupby(['uId','use_date'])[['date_duration','date_times']].sum().reset_index()
duration_per_day_dt = user_app_usage_dt.pivot(index='uId',columns='use_date',values='date_duration')
times_per_day_dt = user_app_usage_dt.pivot(index='uId',columns='use_date',values='date_times')
user_app_usage_dt = pd.concat([duration_per_day_dt,times_per_day_dt],axis=1).reset_index()
# 61 names = uId + 30 duration columns + 30 times columns.
# NOTE(review): this assumes the log covers exactly 30 distinct dates; otherwise the
# assignment fails with a length mismatch.
user_app_usage_dt.columns=['total_feat_{}'.format(i) for i in range(61)]
user_app_usage_dt = user_app_usage_dt.rename(columns={'total_feat_0':'uId'})
data = data.merge(user_app_usage_dt,on='uId',how='left')
del dist, user_app_usage_dt, duration_per_day_dt, times_per_day_dt
gc.collect()

CPU times: user 2min 25s, sys: 41.7 s, total: 3min 7s
Wall time: 3min 7s


In [11]:
%%time
# Total duration, launch count and usage-row count per user and app category.
user_app_usage_dt = pd.DataFrame(dist1)
user_app_usage_dt.columns=['uId','appId','date_duration','date_times','use_date']
# dist1 holds per-chunk partial aggregates, and its 'use_date' column is already a row
# count. Partials of the same (uId, appId) must therefore be *summed*; counting them
# again (as the original did) only yields the number of chunks the pair appeared in.
user_app_usage = user_app_usage_dt.groupby(['uId','appId']).agg({'date_duration': 'sum','date_times': 'sum', 'use_date': 'sum'}).reset_index()
user_app_usage.columns=['uId','appId','total_duration','total_times','used_days']
app_info = pd.read_csv('../app_info.csv', header = None, names = ['appId','category'])
app_data = user_app_usage.merge(app_info,on='appId',how='left')

# One wide block per metric with columns named '<category>_<metric>'. The names are
# derived from the pivot's own (sorted) category labels instead of three hand-copied
# 40-item lists, so a label can never be attached to the wrong column.
category_blocks = []
for metric in ['total_duration', 'total_times', 'used_days']:
    tmp = app_data.groupby(['uId','category'])[metric].sum().reset_index()
    wide = tmp.pivot(index='uId',values=metric,columns='category')
    wide.columns = ['{}_{}'.format(category, metric) for category in wide.columns]
    category_blocks.append(wide)
sum_usage = pd.concat(category_blocks,axis=1).reset_index()
data = data.merge(sum_usage,on='uId',how='left')
del user_app_usage_dt, user_app_usage, app_info, app_data, tmp, wide, category_blocks, sum_usage
gc.collect()

CPU times: user 4min 27s, sys: 1min 27s, total: 5min 54s
Wall time: 5min 54s


In [12]:
%%time
# Number of usage rows (apps) per user for every calendar day of the log.
app_count_long = pd.DataFrame(dist2)
app_count_long.columns = ['uId', 'use_date', 'app_count']
# Merge the per-chunk partial counts of the same (user, date).
app_count_long = (
    app_count_long
    .groupby(['uId', 'use_date'])['app_count']
    .sum()
    .reset_index()
)
app_count_wide = app_count_long.pivot(values='app_count', index='uId', columns='use_date').reset_index()
# 31 names = uId + one column per date; position 0 is renamed back to uId below.
app_count_wide.columns = ['total_app_count_{}'.format(i) for i in range(31)]
app_count_wide = app_count_wide.rename(columns={'total_app_count_0': 'uId'})
data = data.merge(app_count_wide, how='left', on='uId')
del dist2, app_count_long, app_count_wide
gc.collect()

CPU times: user 2min 14s, sys: 48 s, total: 3min 2s
Wall time: 3min 1s


## MLP降维

In [13]:
%%time
# Pick, for every age group, the 5000 apps with the largest total usage duration among
# training users, and keep the (user, app) pairs that involve any of those apps.
user_app_usage_dt = pd.DataFrame(dist1)
user_app_usage_dt.columns=['uId','appId','date_duration','date_times','use_date']
# 'use_date' in dist1 is already a per-chunk row count, so partial results of the same
# (uId, appId) are summed; counting them again would only count chunks.
user_app_stat = user_app_usage_dt.groupby(['uId','appId']).agg({'date_duration': 'sum','date_times': 'sum', 'use_date': 'sum'}).reset_index()
user_app_stat.columns=['uId','appId','total_duration','total_times','used_days']
train_age = pd.read_csv('../age_train.csv', names=['uId', 'age'])
train_age_stat = train_age.merge(user_app_stat, how='left', on='uId')
del train_age_stat['uId']
train_age_stat = train_age_stat.groupby(['age', 'appId']).sum().reset_index()
# Descending sort so that head(5000) per age group keeps the longest-used apps.
train_age_stat.sort_values(by=['age', 'total_duration'], inplace=True, ascending=False)
train_age_stat_top5000 = train_age_stat.groupby('age').head(5000).reset_index()
top5000app = train_age_stat_top5000['appId'].unique()
user_app_5000 = user_app_stat[user_app_stat['appId'].isin(top5000app)][['uId', 'appId']]
def concat(ser):
    """Join the string items of ``ser`` into one '#'-separated string."""
    separator = '#'
    return separator.join(ser)
# One row per user: the user's apps from user_app_5000 joined into a '#'-separated string.
app_usage5000 = (
    user_app_5000
    .groupby('uId')['appId']
    .apply(concat)
    .reset_index()
)
#app_usage5000.to_hdf('../feature/app_usage5000.h5', key='data') # used by the MLP dimensionality-reduction file

CPU times: user 5min 34s, sys: 43.1 s, total: 6min 18s
Wall time: 6min 17s


In [14]:
%%time
# Attach the '#'-joined activated-app list ('appId') and the usage-app list
# ('usage_appId') to every user row.
app_usage5000.columns = ['uId', 'usage_appId']
user_app_actived = pd.read_csv('../user_app_actived.csv', names=['uId','appId'], header=None)
data = (
    data
    .merge(user_app_actived, how='left', on='uId')
    .merge(app_usage5000, how='left', on='uId')
)
del user_app_actived, app_usage5000
gc.collect()

CPU times: user 39.5 s, sys: 46.6 s, total: 1min 26s
Wall time: 1min 27s


In [15]:
%%time
# Project the binary activated-app matrix through the three saved dense layers and keep
# the output of every layer as features.
# The token pattern is a raw string: '\d' inside a normal literal is an invalid escape
# sequence that newer Python versions warn about.
# NOTE(review): the vectorizer vocabulary (column order) is refitted here and must match
# the one used when the weights were trained - confirm.
actived_app = CountVectorizer(token_pattern=r'a\d+',binary=True).fit_transform(data['appId'])
layer_output = actived_app
layer_frames = []
for dense_name, column_template in [('dense1', 'actived_app_{}'),
                                    ('dense2', 'actived_app_II_{}'),
                                    ('dense3', 'actived_app_III_{}')]:
    weight = np.load('./weight_bias/Xapp_weight1_{}.npy'.format(dense_name))
    bias = np.load('./weight_bias/Xapp_bias1_{}.npy'.format(dense_name))
    # Each layer is applied as a plain affine map (no activation), as in the original cell.
    layer_output = layer_output.dot(weight)+bias
    layer_frames.append(pd.DataFrame(layer_output, columns=[column_template.format(i) for i in range(layer_output.shape[1])]))
data = pd.concat([data] + layer_frames,axis=1)
del actived_app, layer_output, layer_frames
del weight, bias
gc.collect()

CPU times: user 1min 27s, sys: 20.3 s, total: 1min 47s
Wall time: 1min 38s


In [16]:
%%time
# Project the binary usage-app matrix through the three saved dense layers and keep the
# output of every layer as features.
# Users without usage apps get the placeholder '-1', which the token pattern does not
# match, so their row in the matrix is all zeros.
# The token pattern is a raw string: '\d' inside a normal literal is an invalid escape
# sequence that newer Python versions warn about.
# NOTE(review): the vectorizer vocabulary (column order) is refitted here and must match
# the one used when the weights were trained - confirm.
X_usage = CountVectorizer(token_pattern=r'a\d+', binary=True).fit_transform(data['usage_appId'].fillna('-1'))
layer_output = X_usage
layer_frames = []
for dense_name, column_template in [('dense1', 'X_usage_I_{}'),
                                    ('dense2', 'X_usage_II_{}'),
                                    ('dense3', 'X_usage_III_{}')]:
    X_usage_weight = np.load('./weight_bias/X_usage_weight1_{}.npy'.format(dense_name))
    X_usage_bias = np.load('./weight_bias/X_usage_bias1_{}.npy'.format(dense_name))
    # Each layer is applied as a plain affine map (no activation), as in the original cell.
    layer_output = layer_output.dot(X_usage_weight) + X_usage_bias
    layer_frames.append(pd.DataFrame(layer_output, columns=[column_template.format(i) for i in range(layer_output.shape[1])]))
data = pd.concat([data] + layer_frames, axis=1)
del X_usage, layer_output, layer_frames
del X_usage_weight, X_usage_bias
gc.collect()

CPU times: user 59.9 s, sys: 18.5 s, total: 1min 18s
Wall time: 1min 12s


In [17]:
%%time
# SVD-compress the user x app matrices of total duration, total launches and used days
# into 30 components each.
user_app_stat = pd.DataFrame(dist1)
user_app_stat.columns=['uId','appId','date_duration','date_times','use_date']
# 'use_date' in dist1 is already a per-chunk row count, so partial results of the same
# (uId, appId) are summed; counting them again would only count chunks.
user_app_stat = user_app_stat.groupby(['uId','appId']).agg({'date_duration': 'sum','date_times': 'sum', 'use_date': 'sum'}).reset_index()
user_app_stat.columns=['uId','appId','total_duration','total_times','used_days']

# Restrict the usage statistics to apps that occur in the activated-app lists.
# Raw string: '\d' in a normal literal is an invalid escape sequence.
user_app_actived = pd.read_csv('../user_app_actived.csv', names=['uId', 'appId'])
cnt_vec = CountVectorizer(token_pattern=r'a\d+', binary=True).fit(user_app_actived['appId'])
actived_app = list(cnt_vec.vocabulary_.keys())
user_actived_app_stat = user_app_stat[user_app_stat['appId'].isin(actived_app)]

age_train = pd.read_csv('../age_train.csv', names=['uId', 'age_group'])
age_test = pd.read_csv('../age_test.csv', names=['uId'])

# Row position of every user (train first, then test) in the sparse matrices.
all_uId = pd.concat([age_train, age_test], sort=True)
all_uId = all_uId[['uId']].copy()
all_uId['idx'] = np.arange(len(all_uId))

# Users without any matching usage keep one row filled with -1 (appId and all stats).
all_user_actived_app_stat = all_uId.merge(user_actived_app_stat, 'left', 'uId').fillna(-1)

all_user_actived_app_stat['appId_lbl'] = LabelEncoder().fit_transform(all_user_actived_app_stat['appId'].astype(str))

shape = (len(all_uId), all_user_actived_app_stat['appId_lbl'].nunique())
row_idx = all_user_actived_app_stat['idx']
col_idx = all_user_actived_app_stat['appId_lbl']

# Same recipe for the three statistics: sparse matrix -> 30-component SVD -> merge on uId.
for stat_col, prefix in [('total_duration', 'x_duration'),
                         ('total_times', 'x_times'),
                         ('used_days', 'x_days')]:
    stat_matrix = sparse.csr_matrix((all_user_actived_app_stat[stat_col].astype(int),
                                     (row_idx, col_idx)), shape=shape)
    stat_svd = TruncatedSVD(n_components=30, n_iter=20, random_state=47).fit_transform(stat_matrix)
    stat_svd_df = pd.DataFrame(stat_svd, columns=['{}_{}'.format(prefix, i) for i in range(30)])
    stat_svd_df['uId'] = all_uId['uId'].values
    data = data.merge(stat_svd_df, how='left', on='uId')

del dist1, stat_matrix, stat_svd, stat_svd_df, row_idx, col_idx
del user_actived_app_stat, user_app_actived, user_app_stat, age_train, age_test, all_uId, all_user_actived_app_stat
gc.collect()

CPU times: user 39min 9s, sys: 6min 7s, total: 45min 17s
Wall time: 20min 7s


# test zone

In [18]:
# Number of activated apps per user: items in the '#'-separated list = separators + 1.
data['user_actived_app_count'] = data['appId'].map(lambda apps: apps.count('#') + 1)

In [2]:
# Replace `data` with the frame stored under key 'data' in ./data_all.hdf.
# NOTE(review): no cell in this notebook writes that file, and the execution count is
# out of order - presumably a cache saved from an earlier session; confirm its contents.
data = pd.read_hdf('./data_all.hdf',key='data')

In [19]:
# Row-wise max / min / sum / std of the seven per-weekday duration columns.
week_day_conut_feature = ['date_duration_Monday','date_duration_Tuesday','date_duration_Wednesday','date_duration_Thursday','date_duration_Friday','date_duration_Saturday','date_duration_Sunday']
for stat_col, stat_name in [('date_duration_week_max', 'max'),
                            ('date_duration_week_min', 'min'),
                            ('date_duration_week_sum', 'sum'),
                            ('date_duration_week_std', 'std')]:
    data[stat_col] = getattr(data[week_day_conut_feature], stat_name)(axis=1)

In [20]:
# Row-wise max / min / sum / std of the seven per-weekday launch-count columns.
week_day_conut_feature = ['date_times_Monday','date_times_Tuesday','date_times_Wednesday','date_times_Thursday','date_times_Friday','date_times_Saturday','date_times_Sunday']
for stat_col, stat_name in [('date_times_week_max', 'max'),
                            ('date_times_week_min', 'min'),
                            ('date_times_week_sum', 'sum'),
                            ('date_times_week_std', 'std')]:
    data[stat_col] = getattr(data[week_day_conut_feature], stat_name)(axis=1)

In [18]:
# Turn every '#'-separated activated-app string into a Python list of app ids.
tmp = data['appId'].map(lambda apps: apps.split('#'))

In [24]:
# Move tmp's index into a regular column.
# NOTE(review): the next cell reads tmp.uId, which only exists if the index was named
# 'uId' before this call - confirm against the frame actually loaded.
tmp = tmp.reset_index()

In [None]:
# Flatten the per-user app lists into (appId, uId) rows.
# The original built one temporary DataFrame per user (about 2.5M of them); a single
# comprehension produces the same rows without that overhead.
# NOTE(review): tmp must expose a 'uId' column here - confirm how tmp was indexed.
dist = [
    (app_id, user_id)
    for user_id, app_list in tqdm_notebook(zip(tmp['uId'].values, tmp['appId'].values),
                                           total=tmp.shape[0])
    for app_id in app_list
]

HBox(children=(IntProgress(value=0, max=2512500), HTML(value='')))

In [115]:
# Long table with one row per entry of `dist`, columns named appId and uId.
actived_app_df = pd.DataFrame(dist, columns=['appId','uId'])

In [116]:
# Left-join app_info onto each row by appId.
# NOTE(review): app_info was deleted by an earlier cell and is only re-read further
# down, so this relies on out-of-order execution.
actived_app_df = actived_app_df.merge(app_info, on='appId', how='left')

In [117]:
# Replace every missing value in the frame with the literal label below (the string
# means "unknown"), so rows without a category from the merge stay in the later groupby.
actived_app_df = actived_app_df.fillna('未知')

In [11]:
# Number of rows per (user, category); the count is stored in the 'appId' column.
actived_app_df_group_uid = actived_app_df.groupby(['uId','category'])['appId'].count().reset_index()

In [23]:
# Overwrite the 'appId' column with the constant 1.
# NOTE(review): presumably to turn the per-category counts into presence flags before
# the pivot; running this cell destroys the counts.
actived_app_df_group_uid.appId = 1

In [12]:
# Wide layout: one row per user, one column per category, values from 'appId'.
# (user, category) pairs that do not occur become NaN.
tmp = actived_app_df_group_uid.pivot(index='uId',values=['appId'],columns='category')

In [13]:
# Move the uId index back into a regular column.
tmp = tmp.reset_index()

In [26]:
# Flat column names: uId followed by 32 category names (no suffix).
# NOTE(review): the names are assigned by position and must match the pivot's column
# order. The following cell renames the same frame again, so only one of the two
# naming cells can take effect per run.
tmp.columns=['uId','主题个性', '休闲游戏', '休闲益智', '体育竞速', '便捷生活', '儿童', '出行导航', '动作冒险', '动作射击', '商务', '图书阅读', '学习办公', '实用工具', '影音娱乐', '拍摄美化', '教育', '新闻阅读', '旅游住宿', '未知', '棋牌天地', '棋牌桌游', '汽车', '益智棋牌', '社交通讯', '经营策略', '网络游戏', '美食', '表盘个性', '角色扮演', '购物比价', '运动健康', '金融理财']

In [14]:
# Flat column names: uId followed by the same 32 category names with an 's' suffix.
# NOTE(review): presumably used for the run that keeps the counts rather than the
# presence flags; names are assigned by position and must match the pivot's column order.
tmp.columns=['uId','主题个性s', '休闲游戏s', '休闲益智s', '体育竞速s', '便捷生活s', '儿童s', '出行导航s', '动作冒险s', '动作射击s', '商务s', '图书阅读s', '学习办公s', '实用工具s', '影音娱乐s', '拍摄美化s', '教育s', '新闻阅读s', '旅游住宿s', '未知s', '棋牌天地s', '棋牌桌游s', '汽车s', '益智棋牌s', '社交通讯s', '经营策略s', '网络游戏s', '美食s', '表盘个性s', '角色扮演s', '购物比价s', '运动健康s', '金融理财s']

In [16]:
# Replace every missing value in tmp with 0.
tmp = tmp.fillna(0)

In [18]:
# Attach tmp's columns to every user row; users absent from tmp get NaN in them.
data =  data.merge(tmp, on='uId', how='left')

In [68]:
# Read ../app_info.csv into a two-column table named appId and category.
app_info = pd.read_csv('../app_info.csv', names=['appId', 'category'])

In [40]:
# App ids of the rows whose category equals the literal below.
# app_info only has the columns appId and category, so the original `.uId` lookup
# raised AttributeError; appId is the only identifier available on this table.
test_cate = app_info[app_info['category']=='拍摄美化'].appId.values

In [10]:
# Load the frame stored under key 'data' in ./feature/actived_app_cate.hdf.
# NOTE(review): no cell in this notebook writes that file - presumably saved in an
# earlier session.
actived_app_df = pd.read_hdf('./feature/actived_app_cate.hdf',key='data')

# 模型训练

In [29]:
# Integer-encode the categorical columns in place; missing values become their own
# class via the '-1' placeholder.
cate_features = ['city','prodName','color','ct','rom_category','color_short','warm_cold','carrier','gender','fontSize','os']
for feat in tqdm_notebook(cate_features):
    encoder = LabelEncoder()
    as_text = data[feat].fillna('-1').apply(str)
    data[feat] = encoder.fit_transform(as_text)

HBox(children=(IntProgress(value=0, max=11), HTML(value='')))




In [19]:
%%time

# Assemble the list of model features and split `data` into train / test parts.
origin_num_feature = ['ramLeftCapacity','romLeftCapacity','city','prodName','color','ct','color_short','warm_cold',
                      'carrier','gender','ramCapacity','ramLeftRation','romCapacity','romLeftRation','fontSize',
                       'os','bootTimes','AFuncTimes','BFuncTimes','CFuncTimes','DFuncTimes','EFuncTimes','FFuncTimes',
                       'FFuncSum']

rom_ram_feature = ['rom_category', 'romCapacity_max', 'romCapacity_min', 'rom_max_ratio', 'rom_min_ratio',
                'ramCapacity_max','ramCapacity_min','ram_max_ratio','ram_min_ratio']

# Outputs of the three saved dense layers (32 / 16 / 8 units) for activated apps ...
X_app_I = ['actived_app_{}'.format(i) for i in range(32)]
X_app_II = ['actived_app_II_{}'.format(i) for i in range(16)]
X_app_III = ['actived_app_III_{}'.format(i) for i in range(8)]
X_app = X_app_I + X_app_II + X_app_III

# ... and for usage apps.
X_usage_I = ['X_usage_I_{}'.format(i) for i in range(32)]
X_usage_II = ['X_usage_II_{}'.format(i) for i in range(16)]
X_usage_III = ['X_usage_III_{}'.format(i) for i in range(8)]
X_usage = X_usage_I + X_usage_II + X_usage_III

X_duration_svd = ['x_duration_{}'.format(i) for i in range(30)]
X_times_svd = ['x_times_{}'.format(i) for i in range(30)]
X_days_svd = ['x_days_{}'.format(i) for i in range(30)]

# The three per-category usage lists share the same 40 categories, so they are generated
# from one list instead of three hand-copied literals that have to be kept in sync.
usage_categories = ['主题个性', '主题铃声', '休闲娱乐', '休闲游戏', '休闲益智', '体育射击', '体育竞速', '便捷生活', '儿童', '出行导航',
                    '动作冒险', '动作射击', '医疗健康', '合作壁纸*', '商务', '图书阅读', '学习办公', '实用工具', '影音娱乐',
                    '拍摄美化', '教育', '新闻阅读', '旅游住宿', '棋牌天地', '棋牌桌游', '模拟游戏', '汽车', '电子书籍',
                    '益智棋牌', '社交通讯', '策略游戏', '经营策略', '网络游戏', '美食', '表盘个性', '角色扮演', '角色游戏',
                    '购物比价', '运动健康', '金融理财']
used_days_feature = ['{}_used_days'.format(category) for category in usage_categories]
total_times_feature = ['{}_total_times'.format(category) for category in usage_categories]
total_duration_feature = ['{}_total_duration'.format(category) for category in usage_categories]

week_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
week_day_conut_feature = (['date_duration_{}'.format(day) for day in week_days]
                          + ['date_times_{}'.format(day) for day in week_days])

# Defined for completeness but not part of `feature` below (as in the original cell).
weekday_times_feat = ['weekday_{}_total_times'.format(i) for i in range(7)]
total_app_count_feat = ['total_app_count_{}'.format(i) for i in range(1,31)]
total_perday_feat = [ 'total_feat_{}'.format(i) for i in range(1,61)]
#******* Feature test***********#
test_feat = ['user_actived_app_count','date_duration_week_max','date_duration_week_min','date_duration_week_sum',
            'date_duration_week_std','date_times_week_max','date_times_week_min','date_times_week_sum','date_times_week_std']
actived_cate_haved = ['主题个性', '休闲游戏', '休闲益智', '体育竞速', '便捷生活', '儿童', '出行导航', '动作冒险', '动作射击', '商务', '图书阅读', '学习办公', '实用工具', '影音娱乐', '拍摄美化', '教育', '新闻阅读', '旅游住宿', '未知', '棋牌天地', '棋牌桌游', '汽车', '益智棋牌', '社交通讯', '经营策略', '网络游戏', '美食', '表盘个性', '角色扮演', '购物比价', '运动健康', '金融理财']
# Count variant of the same 32 categories: identical names with an 's' suffix.
actived_cate_counts = [category + 's' for category in actived_cate_haved]
#******* Feature sum***********#
feature  =    origin_num_feature +rom_ram_feature  + used_days_feature + week_day_conut_feature + total_times_feature\
            + total_duration_feature +  total_app_count_feat + total_perday_feat  +X_app+X_usage\
            + X_duration_svd + X_times_svd + X_days_svd\
            + test_feat + actived_cate_haved + actived_cate_counts
#*********************************#

# Rows without an age_group label form the test set.
test_index = data['age_group'].isna()
train_index = ~test_index
# One .loc call selects rows and columns together, so the remaining columns of `data`
# are not copied first as with data[mask][cols].
train_x = data.loc[train_index, feature]
train_y = data.loc[train_index, 'age_group']
test_x  = data.loc[test_index, feature]


print('All features: train shape {}, test shape {}'.format(train_x.shape, test_x.shape))
print(len(feature))

All features: train shape (2010000, 532), test shape (502500, 532)
532
CPU times: user 10.1 s, sys: 21.5 s, total: 31.5 s
Wall time: 31.5 s


In [20]:
def label_smoothing(inputs, epsilon=0.1, num_classes=6):
    """Apply the label-smoothing map ``(1 - epsilon) * inputs + epsilon / num_classes``.

    Parameters
    ----------
    inputs : number or array-like supporting arithmetic
        Values to transform; the operation is applied element-wise.
    epsilon : float, default 0.1
        Smoothing strength.
    num_classes : int, default 6
        Number of classes (previously the hard-coded constant ``K = 6``).

    Returns
    -------
    Same kind of object as ``inputs`` with the affine map applied.
    """
    return ((1 - epsilon) * inputs) + (epsilon / num_classes)

def label_smoothing_re(inputs, epsilon=0.1, num_classes=6):
    """Invert the label-smoothing map ``(1 - epsilon) * x + epsilon / num_classes``.

    Parameters
    ----------
    inputs : number or array-like supporting arithmetic
        Smoothed values; the operation is applied element-wise.
    epsilon : float, default 0.1
        Smoothing strength that was used; must differ from 1 (division by
        ``1 - epsilon``).
    num_classes : int, default 6
        Number of classes (previously the hard-coded constant ``K = 6``).

    Returns
    -------
    Same kind of object as ``inputs`` with the smoothing undone.
    """
    return (inputs - epsilon / num_classes) / (1 - epsilon)

In [21]:
# Apply the smoothing map (epsilon=0.1, K=6) to the training targets.
# NOTE(review): train_y holds the age_group column, i.e. presumably class ids rather
# than one-hot probabilities, so this acts as an affine relabelling of the classes.
# Predictions would need label_smoothing_re to recover the original ids - confirm intended.
train_y = label_smoothing(train_y)

In [None]:
%%time
from sklearn.model_selection import train_test_split 
# Hold out 2% of the labelled rows as a fixed validation set (seed 42).
X_train, X_validation, y_train, y_validation = train_test_split(train_x, train_y, test_size=0.02, random_state=42)
# Drop the unsplit frames to free memory. Any later cell that still refers to
# train_x / train_y has to re-create them first.
del train_x, train_y
gc.collect()

CPU times: user 8.38 s, sys: 3.55 s, total: 11.9 s
Wall time: 11.9 s


In [None]:
%%time
# Columns handed to CatBoost as categorical features.
cate_features = [
    'city', 'prodName', 'color', 'ct', 'rom_category',
    'color_short', 'carrier', 'gender', 'fontSize', 'os',
]

# Wrap the training and validation splits in CatBoost Pools.
train_pool = Pool(data=X_train, label=y_train, cat_features=cate_features)
eval_pool = Pool(data=X_validation, label=y_validation, cat_features=cate_features)

# The Pools now hold the data; release the raw splits. X_validation is kept
# because later cells predict on it directly.
del X_train
del y_train
del y_validation
gc.collect()

CPU times: user 2min 25s, sys: 4.28 s, total: 2min 30s
Wall time: 2min 29s


In [2]:
# Reload the full feature frame from the local HDF5 file (key 'data').
# NOTE(review): the execution count suggests this cell was run in a separate
# kernel session from the cells around it — confirm where it belongs in a
# top-to-bottom run.
data = pd.read_hdf('./data_all.hdf',key='data')

In [40]:
# Remove the 'romCapacity_x' column in place. The membership check makes the
# cell safe to re-run: a bare `del` raises KeyError once the column is gone.
if 'romCapacity_x' in data.columns:
    del data['romCapacity_x']

In [None]:
# Main CatBoost model: multi-class objective, monitored on validation accuracy.
model_I = CatBoostClassifier(
    # objective and monitored metric
    loss_function='MultiClass',
    eval_metric='Accuracy',
    # boosting schedule: stop after 10000 rounds without improvement and keep
    # the best iteration
    iterations=500000,
    learning_rate=0.01,
    depth=9,
    early_stopping_rounds=10000,
    use_best_model=True,
    random_seed=42,
    # hardware
    task_type='GPU',
    devices='0:1:2',
    # gpu_ram_part=0.5,  (left disabled)
    logging_level='Verbose',
)
# Log every 100 iterations. Figures noted by the author: 0.6509, 0.6498
# (presumably validation accuracy of earlier runs).
model_I.fit(train_pool, eval_set=eval_pool, verbose=100)

0:	learn: 0.5591918	test: 0.5564925	best: 0.5564925 (0)	total: 140ms	remaining: 19h 28m 27s
100:	learn: 0.6012722	test: 0.5979602	best: 0.5979602 (100)	total: 11.8s	remaining: 16h 14m 35s
200:	learn: 0.6120175	test: 0.6087562	best: 0.6087562 (200)	total: 23.6s	remaining: 16h 16m 30s
300:	learn: 0.6183059	test: 0.6136567	best: 0.6136567 (300)	total: 35.1s	remaining: 16h 11m 32s
400:	learn: 0.6229729	test: 0.6183085	best: 0.6183085 (396)	total: 46.7s	remaining: 16h 10m 35s
500:	learn: 0.6263534	test: 0.6217910	best: 0.6217910 (500)	total: 58.5s	remaining: 16h 11m 25s
600:	learn: 0.6292685	test: 0.6252736	best: 0.6252736 (600)	total: 1m 10s	remaining: 16h 19m 45s
700:	learn: 0.6316565	test: 0.6270896	best: 0.6270896 (700)	total: 1m 23s	remaining: 16h 25m 36s
800:	learn: 0.6338308	test: 0.6291045	best: 0.6291045 (800)	total: 1m 35s	remaining: 16h 29m 53s
900:	learn: 0.6358123	test: 0.6298507	best: 0.6300995 (875)	total: 1m 47s	remaining: 16h 33m 58s
1000:	learn: 0.6375678	test: 0.6310945	b

8400:	learn: 0.6777571	test: 0.6467164	best: 0.6468408 (6395)	total: 17m 4s	remaining: 16h 38m 44s
8500:	learn: 0.6780912	test: 0.6466667	best: 0.6468657 (8416)	total: 17m 15s	remaining: 16h 37m 52s
8600:	learn: 0.6783800	test: 0.6467910	best: 0.6468657 (8416)	total: 17m 27s	remaining: 16h 37m 4s
8700:	learn: 0.6787065	test: 0.6468408	best: 0.6469652 (8660)	total: 17m 38s	remaining: 16h 36m 29s
8800:	learn: 0.6789659	test: 0.6467413	best: 0.6469900 (8706)	total: 17m 50s	remaining: 16h 35m 47s
8900:	learn: 0.6792964	test: 0.6466418	best: 0.6469900 (8706)	total: 18m 1s	remaining: 16h 34m 47s
9000:	learn: 0.6795822	test: 0.6468408	best: 0.6469900 (8706)	total: 18m 13s	remaining: 16h 33m 48s
9100:	learn: 0.6799152	test: 0.6468905	best: 0.6469900 (8706)	total: 18m 24s	remaining: 16h 33m 10s
9200:	learn: 0.6801868	test: 0.6470149	best: 0.6470149 (9200)	total: 18m 36s	remaining: 16h 32m 20s
9300:	learn: 0.6804630	test: 0.6467662	best: 0.6470398 (9203)	total: 18m 47s	remaining: 16h 31m 44s
940

16600:	learn: 0.7004361	test: 0.6475373	best: 0.6478358 (16150)	total: 32m 33s	remaining: 15h 47m 56s
16700:	learn: 0.7006320	test: 0.6476368	best: 0.6478358 (16150)	total: 32m 44s	remaining: 15h 47m 28s
16800:	learn: 0.7009118	test: 0.6472637	best: 0.6478358 (16150)	total: 32m 55s	remaining: 15h 46m 55s
16900:	learn: 0.7012037	test: 0.6473881	best: 0.6478358 (16150)	total: 33m 7s	remaining: 15h 46m 38s
17000:	learn: 0.7014712	test: 0.6474876	best: 0.6478358 (16150)	total: 33m 18s	remaining: 15h 46m 15s
17100:	learn: 0.7017179	test: 0.6475622	best: 0.6478358 (16150)	total: 33m 29s	remaining: 15h 45m 49s
17200:	learn: 0.7019738	test: 0.6475622	best: 0.6478358 (16150)	total: 33m 41s	remaining: 15h 45m 27s
17300:	learn: 0.7022089	test: 0.6477861	best: 0.6478358 (16150)	total: 33m 52s	remaining: 15h 45m 1s
17400:	learn: 0.7025018	test: 0.6479104	best: 0.6480100 (17384)	total: 34m 3s	remaining: 15h 44m 40s
17500:	learn: 0.7027353	test: 0.6478109	best: 0.6480100 (17384)	total: 34m 15s	remain

24700:	learn: 0.7211849	test: 0.6488557	best: 0.6489055 (24544)	total: 47m 51s	remaining: 15h 21m 1s
24800:	learn: 0.7214078	test: 0.6489552	best: 0.6490547 (24781)	total: 48m 3s	remaining: 15h 20m 40s
24900:	learn: 0.7216687	test: 0.6490050	best: 0.6490547 (24781)	total: 48m 14s	remaining: 15h 20m 28s
25000:	learn: 0.7218966	test: 0.6490299	best: 0.6490796 (24997)	total: 48m 26s	remaining: 15h 20m 11s
25100:	learn: 0.7221845	test: 0.6489552	best: 0.6491791 (25019)	total: 48m 37s	remaining: 15h 19m 59s
25200:	learn: 0.7224348	test: 0.6490547	best: 0.6491791 (25019)	total: 48m 49s	remaining: 15h 19m 47s
25300:	learn: 0.7226876	test: 0.6487313	best: 0.6491791 (25019)	total: 49m	remaining: 15h 19m 35s
25400:	learn: 0.7229343	test: 0.6489801	best: 0.6491791 (25019)	total: 49m 12s	remaining: 15h 19m 28s
25500:	learn: 0.7231846	test: 0.6490299	best: 0.6491791 (25019)	total: 49m 24s	remaining: 15h 19m 14s
25600:	learn: 0.7234293	test: 0.6489801	best: 0.6491791 (25019)	total: 49m 35s	remaining

32800:	learn: 0.7413702	test: 0.6490547	best: 0.6495274 (30757)	total: 1h 3m 22s	remaining: 15h 2m 36s
32900:	learn: 0.7416164	test: 0.6491294	best: 0.6495274 (30757)	total: 1h 3m 33s	remaining: 15h 2m 24s
33000:	learn: 0.7418895	test: 0.6491542	best: 0.6495274 (30757)	total: 1h 3m 45s	remaining: 15h 2m 14s
33100:	learn: 0.7421266	test: 0.6491542	best: 0.6495274 (30757)	total: 1h 3m 57s	remaining: 15h 2m 7s
33200:	learn: 0.7423942	test: 0.6490547	best: 0.6495274 (30757)	total: 1h 4m 8s	remaining: 15h 1m 55s
33300:	learn: 0.7426490	test: 0.6491294	best: 0.6495274 (30757)	total: 1h 4m 20s	remaining: 15h 1m 44s
33400:	learn: 0.7428972	test: 0.6491542	best: 0.6495274 (30757)	total: 1h 4m 32s	remaining: 15h 1m 35s
33500:	learn: 0.7431359	test: 0.6491045	best: 0.6495274 (30757)	total: 1h 4m 44s	remaining: 15h 1m 26s
33600:	learn: 0.7433760	test: 0.6494279	best: 0.6495274 (30757)	total: 1h 4m 55s	remaining: 15h 1m 13s
33700:	learn: 0.7436369	test: 0.6490547	best: 0.6495274 (30757)	total: 1h 5

In [None]:
# Class predictions of model_I on the held-out validation rows.
# NOTE(review): `pred` is reassigned by the next cell (test-set predictions),
# so this value is only available until that cell runs.
pred = model_I.predict(X_validation)

In [109]:
# Predict the test rows, then map the predictions back through the inverse of
# the label transform that was applied to the training labels.
pred = label_smoothing_re(model_I.predict(test_x))

In [None]:
# Write the submission file: one row per test user with its predicted label.
# `pred` must already hold the test-set predictions from the preceding cell.
result = pd.DataFrame()
result['id'] = data.loc[test_index, 'uId']
# `pred` has been through a floating-point inverse transform, so a label can
# come back as e.g. 2.9999999 instead of 3. Round to the nearest integer;
# a bare astype(int) truncates and would turn that into 2.
# ravel() flattens the array in case predict returned a column vector.
result['label'] = np.rint(np.ravel(pred)).astype(int)
result.to_csv('./out/submission.csv', index=False)
print('Save Done.')

In [None]:
# Save model_I's class probabilities for the validation and test rows. The
# file names carry model_I's best validation accuracy (rounded to 5 places).
# The predictions previously came from `model`, which is only defined by a
# later cell and is not the model named in the file names.
best_acc = round(model_I.best_score_['validation']['Accuracy'], 5)
pred_val = model_I.predict_proba(X_validation)
pred_test = model_I.predict_proba(test_x)
np.save("./out/proba_val_{}.npy".format(best_acc), pred_val)
np.save("./out/proba_test_{}.npy".format(best_acc), pred_test)

In [None]:
# Random feature & row subsampling: train one model per (column fraction,
# row fraction) pair and save its validation / test probabilities.
import random
from sklearn.model_selection import train_test_split

frac_axis1 = [0.7,0.75,0.8,0.85,0.9]  # fraction of feature columns drawn per model
frac_axis0 = [0.9,0.85,0.8,0.75,0.7]  # fraction of training rows drawn per model

# Dedicated seeded generator: the per-model sampling seeds are now the same on
# every run, so the saved predictions can be reproduced. (The previous global,
# unseeded random.randint gave different subsamples each time.)
seed_source = random.Random(2019)

# The label mask does not change between iterations; compute it once.
test_index = np.isnan(data.age_group)
train_index = ~test_index

for col_frac, row_frac in zip(frac_axis1, frac_axis0):
    train_x = data.loc[train_index, feature]
    train_y = data.loc[train_index, 'age_group']
    test_x = data.loc[test_index, feature]

    # Same seed for both frames so train and test end up with the same columns.
    # NOTE(review): replace=True can draw a column more than once — confirm
    # that duplicated feature columns are intended.
    col_seed = seed_source.randint(0, 2019)
    train_x = train_x.sample(frac=col_frac, replace=True, random_state=col_seed, axis=1)
    test_x = test_x.sample(frac=col_frac, replace=True, random_state=col_seed, axis=1)

    X_train, X_validation, y_train, y_validation = train_test_split(train_x, train_y, test_size=0.02, random_state=42)
    del train_x
    gc.collect()

    # Bootstrap the training rows; the shared seed keeps features and labels
    # aligned row for row.
    row_seed = seed_source.randint(0, 2019)
    X_train = X_train.sample(frac=row_frac, replace=True, random_state=row_seed, axis=0)
    y_train = y_train.sample(frac=row_frac, replace=True, random_state=row_seed, axis=0)

    train_pool = Pool(X_train, y_train)
    eval_pool = Pool(X_validation, y_validation)
    # X_validation is kept: it is needed for predict_proba below.
    del X_train
    del y_train
    del y_validation
    gc.collect()

    model = CatBoostClassifier(iterations=300000,
                           learning_rate=0.01,
                           eval_metric='Accuracy',
                           use_best_model=True,
                           random_seed=2019,
                           logging_level='Verbose',
                           task_type='GPU',
                           devices='0',
                           early_stopping_rounds=5000,
                           loss_function='MultiClass',
                           depth=8,
                           #gpu_ram_part=0.3,
                           )
    model.fit(train_pool, eval_set=eval_pool, verbose=100) #0.6486

    pred_val = model.predict_proba(X_validation)
    pred_test = model.predict_proba(test_x)
    # NOTE(review): files are keyed only by the rounded accuracy, so two models
    # with the same score overwrite each other.
    best_acc = round(model.best_score_['validation']['Accuracy'], 5)
    np.save("./out/proba_val_{}.npy".format(best_acc), pred_val)
    np.save("./out/proba_test_{}.npy".format(best_acc), pred_test)

In [None]:
# 5-fold cross-validation over train_x / train_y.
# Requires train_x, train_y, test_x and X_validation to exist in the kernel.
# NOTE(review): X_validation is carved out of train_x by the split cells, so
# the folds trained here may already contain the X_validation rows — confirm
# whether cv_val is meant to be a true hold-out estimate.
from tqdm import tqdm_notebook
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5,shuffle=True, random_state=42)
n_folds = skf.get_n_splits()

# Out-of-fold predictions are indexed by positions in train_x (the frame being
# split), so the buffer must have one row per row of train_x.
cv_train = np.zeros((train_x.shape[0],6))
# Fold-averaged predictions for the test and validation rows.
cv_test = np.zeros((test_x.shape[0],6))
cv_val = np.zeros((X_validation.shape[0],6))

cate_features = ['city','prodName','color','ct','rom_category','color_short','carrier','gender','fontSize','os']

for index,(train_idx,valid_idx) in tqdm_notebook(enumerate(skf.split(train_x, train_y)), total=n_folds):
    print("Fold_{}_started".format(index))
    X_KFold_train, y_KFold_train, X_KFold_valid, y_KFold_valid = train_x.iloc[train_idx], train_y.iloc[train_idx], train_x.iloc[valid_idx],train_y.iloc[valid_idx]
    train_pool = Pool(X_KFold_train, y_KFold_train, cat_features=cate_features)
    valid_pool = Pool(X_KFold_valid, y_KFold_valid, cat_features=cate_features)

    # The Pools hold the fold data now; free the training slices.
    del X_KFold_train
    del y_KFold_train
    gc.collect()
    model = CatBoostClassifier(iterations=500000,
                           learning_rate=0.01,
                           eval_metric='Accuracy',
                           use_best_model=True,
                           random_seed=47,
                           logging_level='Verbose',
                           task_type='GPU',
                           devices='0:1',
                           early_stopping_rounds=10000,
                           loss_function='MultiClass',
                           depth=8
                           )
    model.fit(train_pool, eval_set=valid_pool, verbose=100) #0.648

    # Out-of-fold predictions land at this fold's held-out positions.
    cv_train[valid_idx] = model.predict_proba(X_KFold_valid)
    # Validation and test predictions cover the same full set of rows in every
    # fold, so they are averaged over all rows, not indexed by valid_idx.
    fold_val = model.predict_proba(X_validation)
    fold_test = model.predict_proba(test_x)
    cv_val += fold_val / n_folds
    cv_test += fold_test / n_folds

    # Save THIS fold's predictions (previously the stale pred_val / pred_test
    # left behind by an earlier cell were written instead).
    best_acc = round(model.best_score_['validation']['Accuracy'], 5)
    np.save("./out/proba_val_{}.npy".format(best_acc), fold_val)
    np.save("./out/proba_test_{}.npy".format(best_acc), fold_test)
    print("Fold_{}_result_saved".format(index))

In [None]:
# 5-fold cross-validation on X_train, with the fixed hold-out set
# (X_validation / y_validation) used as the early-stopping eval set in every fold.
# Requires X_train, y_train, X_validation, y_validation and test_x to exist in
# the kernel.
# NOTE(review): other cells delete X_train / y_train / y_validation after
# building their Pools, so this cell needs those splits re-created first.
# NOTE(review): StratifiedKFold needs discrete class labels — confirm y_train
# is the untransformed age_group here.
from tqdm import tqdm_notebook
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5,shuffle=True, random_state=42)
# Single source of truth for the fold count used in the averages below.
n_folds = skf.get_n_splits()

# Out-of-fold predictions for X_train, plus fold-averaged predictions for the
# test and validation rows.
cv_result = np.zeros((X_train.shape[0],6))
cv_test = np.zeros((test_x.shape[0],6))
cv_val = np.zeros((X_validation.shape[0],6))
cate_features = ['city','prodName','color','ct','rom_category','color_short','carrier','gender','fontSize','os']
eval_pool = Pool(X_validation, y_validation, cat_features=cate_features)

# total= lets the progress bar show completion; enumerate() alone has no length.
for index,(train_idx,valid_idx) in tqdm_notebook(enumerate(skf.split(X_train, y_train)), total=n_folds):
    print("Fold_{}_started".format(index))
    X_KFold_train, y_KFold_train, X_KFold_valid = X_train.iloc[train_idx], y_train.iloc[train_idx], X_train.iloc[valid_idx]
    train_pool = Pool(X_KFold_train, y_KFold_train, cat_features=cate_features)
    # The Pool holds the fold data now; free the training slices.
    del X_KFold_train
    del y_KFold_train
    gc.collect()
    model = CatBoostClassifier(iterations=500000,
                           learning_rate=0.01,
                           eval_metric='Accuracy',
                           use_best_model=True,
                           random_seed=47,
                           logging_level='Verbose',
                           task_type='GPU',
                           devices='0:1',
                           early_stopping_rounds=10000,
                           loss_function='MultiClass',
                           depth=8
                           )
    model.fit(train_pool, eval_set=eval_pool, verbose=1000) #0.648
    # Out-of-fold predictions for the rows this fold did not train on.
    cv_result[valid_idx] = model.predict_proba(X_KFold_valid)
    # Average the per-fold predictions on the fixed validation and test rows.
    cv_val += model.predict_proba(X_validation) / n_folds
    cv_test += model.predict_proba(test_x) / n_folds
    print("Fold_{}_result_saved".format(index))