# 用户人口属性预测
手机用户的人口属性（如性别、年龄、常驻地等）数据一方面可以被用于个性化推荐服务，提升用户体验，另一方面可以用于手机用户群画像分析，帮助厂商了解产品的人群定位，优化产品设计。

In [1]:
###########################
#   -*- coding: utf-8 -*- #
# PHM_data_challenge_2019 #
#     Author: Huet Zhu    #
#       Date:2019.5       #
#    All Rights Reserved  #
###########################

# User demographic (age label) prediction: per-fold CatBoost training, stacking features and test inference
from __future__ import division

import numpy as np
from catboost import CatBoostClassifier, Pool
import pickle
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib

from sklearn.tree import DecisionTreeRegressor  
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection  import train_test_split
from sklearn.model_selection import StratifiedKFold
import numpy as np  

from tqdm import tqdm

import gc

from scipy import sparse
from scipy.sparse import csr_matrix, hstack, vstack

import warnings
import random
warnings.filterwarnings('ignore')

In [2]:
# Read the train/test CSVs with header=None, so pandas assigns integer column
# labels (0, 1, ...) instead of taking names from the first row.
age_train = pd.read_csv("../data/age_train.csv", header=None)
age_test = pd.read_csv("../data/age_test.csv", header=None)

In [4]:
# Training feature matrix: two sparse blocks stored as .npz files are loaded
# and joined column-wise into a single CSR matrix.
train_block_paths = ('../trainTestData/trainData13100.npz',
                     '../trainTestData/trainData15112.npz')
csr_trainData = hstack([sparse.load_npz(path) for path in train_block_paths],
                       format='csr')
# Last expression of the cell: displays (n_rows, n_columns).
csr_trainData.shape

(2010000, 28212)

In [5]:
# Display the shape of the stacked training matrix again.
csr_trainData.shape

(2010000, 28212)

In [None]:
# Labels: the column at position 1 of the training CSV, as a NumPy array.
label = age_train.iloc[:, 1].values

# Convert the sparse feature matrix to a dense NumPy array.
# (An earlier variant, left disabled, wrapped it in a float16 DataFrame:
#  pd.DataFrame(csr_trainData.toarray(), dtype=np.float16).)
trainData = csr_trainData.toarray()

# The sparse copy is no longer needed; drop it and run the garbage collector.
del csr_trainData
gc.collect()

In [None]:
# 5-fold stratified splitter; rows are shuffled with a fixed seed (812), so
# repeated calls to skf.split() on the same data yield the same folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=812)

In [None]:
# Train one CatBoost multi-class model per cross-validation fold and save each
# model to disk. The hyper-parameters are the same for every fold.
catboost_params = dict(
    iterations=500000,
    task_type="GPU",
    learning_rate=0.01,
    depth=7,
    verbose=20,
    early_stopping_rounds=500,
    loss_function='MultiClass',
    custom_metric=['MultiClass', 'Accuracy'],
)

for k, (train_index, test_index) in enumerate(skf.split(trainData, label)):
    # Fresh, unfitted classifier for this fold.
    model = CatBoostClassifier(**catboost_params)
    print(k, 'Fit model')

    # Fit on the fold's training rows; the held-out rows are passed as the
    # evaluation set.
    model.fit(
        trainData[train_index],
        y=label[train_index],
        eval_set=(trainData[test_index], label[test_index]),
        plot=True,
    )
    gc.collect()
    print(model.get_best_score())

    # One model file per fold, numbered by fold index.
    model_path = '../model/catboost_28212_' + str(k) + '.model'
    model.save_model(model_path,
                     format="cbm",
                     export_parameters=None,
                     pool=None)

    # Release the fitted model before the next fold starts.
    del model
    gc.collect()

# stacking数据处理准备

In [None]:
# Split the row range [0, n_rows) into five contiguous [start, stop) pairs at
# 20% steps.
# NOTE(review): these contiguous ranges are not the shuffled StratifiedKFold
# splits produced by `skf`; confirm before using them to pick held-out rows.
n_rows = trainData.shape[0]
fold_bounds = [0] + [int(n_rows * frac) for frac in (0.2, 0.4, 0.6, 0.8)] + [int(n_rows)]
k_fold = [[start, stop] for start, stop in zip(fold_bounds[:-1], fold_bounds[1:])]

# Positional index of every training row.
train_index = np.arange(n_rows)

In [None]:
# Out-of-fold class probabilities for stacking.
#
# Model k was trained on the `train_index` rows of the k-th split produced by
# `skf` (shuffled, stratified), so its out-of-fold rows are that split's
# `test_index` rows -- NOT a contiguous 20% slice of the matrix. Predicting on
# contiguous slices would mostly score rows the model was trained on and leak
# the labels into the stacking features.
#
# `skf` was created with a fixed integer random_state, so calling split()
# again reproduces exactly the folds used during training.
#
# 6 columns: must match the width of predict_proba's output (presumably six
# age classes -- TODO confirm).
val_prob = np.zeros((trainData.shape[0], 6))
for k, (fit_index, test_index) in tqdm(enumerate(skf.split(trainData, label)),
                                       total=skf.get_n_splits()):
    model = CatBoostClassifier()
    model.load_model('../model/catboost_28212_'+str(k)+'.model', format='cbm')
    # Fill only the rows this model never saw during training.
    val_prob[test_index] = model.predict_proba(trainData[test_index])
    # Release the model before loading the next one.
    del model
    gc.collect()

In [None]:
# Sanity check: display the total of all predicted probabilities next to the
# row count. Presumably each row of probabilities sums to 1, so the two values
# should be approximately equal -- TODO confirm.
np.sum(val_prob), trainData.shape[0]

In [None]:
# Bind the validation probabilities to the name used when saving
# (plain assignment: both names refer to the same array, no copy is made).
train_stacking_predict = val_prob

In [None]:
# Display the total of the stacking features through the new name.
np.sum(train_stacking_predict)

In [None]:
# Write the training-set stacking features as comma-separated text, one row
# per line; fmt='%s' writes each value using string formatting.
np.savetxt('../processed/stacking/catboost_val_28212.txt', train_stacking_predict, fmt='%s', delimiter=',', newline='\n')

# 测试

In [None]:
# Test feature matrix: two sparse blocks stored as .npz files are loaded and
# joined column-wise into a single CSR matrix.
test_block_paths = ('../trainTestData/testData13100.npz',
                    '../trainTestData/testData15112.npz')
csr_testData = hstack([sparse.load_npz(path) for path in test_block_paths],
                      format='csr')
# Last expression of the cell: displays (n_rows, n_columns).
csr_testData.shape

In [None]:
# Convert the sparse test matrix to a dense NumPy array.
testData = csr_testData.toarray()
# The sparse copy is no longer needed; drop it and run the garbage collector.
del csr_testData
gc.collect()
# Last expression of the cell: displays the dense array's shape.
testData.shape

In [None]:
# Accumulate test-set class probabilities over the saved fold models.
# 6 columns: must match the width of predict_proba's output.
# NOTE(review): only the models for folds 0 and 1 are loaded here although the
# training loop saves one model per fold of a 5-fold split -- confirm this is
# intentional.
n_test_rows = testData.shape[0]
test_prob = np.zeros((n_test_rows, 6))
for k in tqdm(range(2)):
    model_path = '../model/catboost_28212_' + str(k) + '.model'
    model = CatBoostClassifier()
    model.load_model(model_path, format='cbm')
    # Add this model's probabilities to the running sum.
    fold_prob = model.predict_proba(testData, thread_count=40, verbose=True)
    test_prob += fold_prob
    # Release the model before loading the next one.
    del model
    gc.collect()

In [None]:
# Sanity check: total of the summed probabilities divided by 2, the number of
# models accumulated by the range(2) loop that fills test_prob.
np.sum(test_prob)/2

In [None]:
# Average the accumulated probabilities; the divisor 2 is the number of models
# summed into test_prob by the range(2) loop.
test_stacking_predict = test_prob / 2

In [None]:
# Write the averaged test-set probabilities as comma-separated text, one row
# per line; fmt='%s' writes each value using string formatting.
np.savetxt('../processed/catboost_test_28212.txt', test_stacking_predict, fmt='%s', delimiter=',', newline='\n')