# 用户人口属性预测
手机用户的人口属性（如性别、年龄、常驻地等）数据一方面可以被用于个性化推荐服务，提升用户体验，另一方面可以用于手机用户群画像分析，帮助厂商了解产品的人群定位，优化产品设计。

In [None]:
from __future__ import division

import numpy as np
import xgboost as xgb
import pickle
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib

from tqdm import tqdm

import gc

import warnings
import random

from sklearn.model_selection  import train_test_split

from scipy import sparse
from scipy.sparse import csr_matrix, hstack, vstack
warnings.filterwarnings('ignore')

In [None]:
# Load the age tables. header=None means no row is treated as a header, so
# columns are addressed by position (column 1 of age_train is used as the
# training label further down).
age_test = pd.read_csv("../data/age_test.csv", header = None)
age_train = pd.read_csv("../data/age_train.csv", header = None)

In [None]:
# Load the training feature matrix from its SciPy sparse .npz file, then run
# a garbage-collection pass before the memory-heavy training step.
trainData = sparse.load_npz('../trainTestData/trainData13100.npz')
gc.collect()

In [None]:
# Five contiguous folds over the training rows, each a [start, stop) pair of
# row offsets cut at 20% steps of the row count.
n_rows = trainData.shape[0]
fold_edges = [0] + [int(n_rows * frac) for frac in (0.2, 0.4, 0.6, 0.8)] + [int(n_rows)]
k_fold = [[lo, hi] for lo, hi in zip(fold_edges[:-1], fold_edges[1:])]
# Train a 7-class XGBoost model for each selected fold: the fold's rows are
# held out for validation and all remaining rows are used for training.
# NOTE(review): range(2, 3) selects only fold 2, while later cells load the
# models of folds 0-4 -- presumably the other folds were trained in separate
# runs by editing this range; confirm.
for k in range(2, 3):
    val_start, val_stop = k_fold[k]
    # Column 1 of age_train holds the label passed to XGBoost.
    labels = age_train.iloc[:, 1].values

    # Training set = every row outside [val_start, val_stop).
    xg_train = xgb.DMatrix(
        vstack((trainData[:val_start], trainData[val_stop:]), format='csr'),
        label=np.concatenate((labels[:val_start], labels[val_stop:])),
    )
    # Validation set = the held-out fold itself.
    xg_val = xgb.DMatrix(
        trainData[val_start:val_stop],
        label=labels[val_start:val_stop],
    )

    param = {
        # GPU histogram tree builder on device 0.
        'gpu_id': 0,
        'n_gpus': 1,
        'max_bin': 255,
        'grow_policy': 'depthwise',
        'tree_method': 'gpu_hist',  # CPU alternative: 'auto'
        # Learning rate and row subsampling.
        'eta': 0.01,
        'subsample': 0.8,
        # Tree complexity controls.
        'gamma': 0.2,
        'max_depth': 9,
        'silent': 1,
        'nthread': 48,
        # Multi-class setup: predict() returns one probability per class.
        'num_class': 7,
        'objective': 'multi:softprob',
    }
    # Left disabled (commented out) in the original notebook:
    # min_child_weight=0.8, lambda=0.6, top_k=1500, scale_pos_weight=9,
    # missing=-1.

    watchlist = [(xg_train, 'train'), (xg_val, 'val')]
    num_round = 50000

    # Up to num_round boosting rounds, with early stopping after 500 rounds
    # (per the XGBoost docs, the last watchlist entry drives early stopping).
    bst = xgb.train(
        param,
        xg_train,
        num_boost_round=num_round,
        evals=watchlist,
        early_stopping_rounds=500,
    )
    model_path = '../model/xgb_13100_' + str(k) + '.model'
    bst.save_model(model_path)

    # Release the booster and both DMatrix objects before any next fold.
    del bst, xg_train, xg_val
    gc.collect()

# stacking Train数据

In [None]:
# Five contiguous folds over the training rows, each a [start, stop) pair of
# row offsets cut at 20% steps of the row count (same split as in training).
n_rows = trainData.shape[0]
fold_edges = [0] + [int(n_rows * frac) for frac in (0.2, 0.4, 0.6, 0.8)] + [int(n_rows)]
k_fold = [[lo, hi] for lo, hi in zip(fold_edges[:-1], fold_edges[1:])]

In [None]:
# Build out-of-fold predictions for stacking: each fold's rows are scored by
# the model saved for that fold, filling one 7-column probability row per
# training row.
n_train_rows = trainData.shape[0]
train_stacking_predict = np.zeros((n_train_rows, 7))
for k in tqdm(range(5), desc='k-flod loop'):
    fold_start, fold_stop = k_fold[k]
    model_path = '../model/xgb_13100_' + str(k) + '.model'

    bst = xgb.Booster()
    bst.load_model(model_path)

    # Score only this fold's rows and write them back into the same row span.
    xg_val = xgb.DMatrix(trainData[fold_start:fold_stop])
    fold_pred = bst.predict(xg_val)
    train_stacking_predict[fold_start:fold_stop] = fold_pred

    # Free the fold's DMatrix and booster before loading the next model.
    del xg_val, bst
    gc.collect()

In [None]:
# Write the out-of-fold prediction matrix as comma-separated text, one line
# per row of train_stacking_predict.
np.savetxt('../processed/xgboost_val_13100.txt', train_stacking_predict, fmt='%s', delimiter=',', newline='\n')

In [None]:
# Display the shape of the out-of-fold prediction matrix as a sanity check.
train_stacking_predict.shape

# 测试

In [None]:
# Load the test feature matrix from its SciPy sparse .npz file.
testData = sparse.load_npz('../trainTestData/testData13100.npz')

In [None]:
# Accumulate the class-probability predictions of the five fold models on the
# test set. pred_prob holds the running sum (one 7-column row per test row);
# the division by the number of models is done in a separate step.
pred_prob = np.zeros((testData.shape[0],7))

# The test DMatrix is the same for every fold model, so build it once instead
# of reconstructing it from the sparse matrix on each iteration.
xg_test = xgb.DMatrix(testData)
for k in tqdm(range(5), desc='k-flod loop'):
    bst = xgb.Booster()
    bst.load_model('../model/xgb_13100_'+str(k)+'.model')

    pred_prob += bst.predict(xg_test)

# Release the DMatrix once all models have been scored.
del xg_test

In [None]:
# Turn the summed probabilities into the mean over the 5 fold models.
pred_prob = pred_prob / 5

In [None]:
# Write the averaged test-set probabilities as comma-separated text, one line
# per row of pred_prob.
np.savetxt('../processed/xgboost_test_13100.txt', pred_prob, fmt='%s', delimiter=',', newline='\n')