# In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.externals import joblib
import lightgbm as lgb
import scipy
from scipy import sparse
from pandas.core.common import SettingWithCopyWarning
import scipy.stats as sp
import pandas as pd
import numpy as np
from collections import Counter
import warnings
import time
import sys
import random
import os
import gc
import datetime



# Draw a random integer between 2000 and 3000 (both ends inclusive).
# NOTE(review): presumably consumed as an RNG seed by code outside this view;
# the value changes on every run and is not printed here, so anything that
# depends on it will not be reproducible — confirm that this is intended.
seed = random.randint(2000, 3000)


class Unbuffered(object):
    """File-like proxy that flushes the wrapped stream after every write.

    ``write`` and ``writelines`` are forwarded to the wrapped stream and
    followed by an immediate ``flush``. Every other attribute is delegated
    unchanged to the wrapped stream.

    Args:
        stream: Any object exposing ``write``, ``writelines`` and ``flush``.
    """

    def __init__(self, stream):
        self.stream = stream

    def write(self, data):
        """Write *data*, flush, and return whatever the stream's write returned."""
        written = self.stream.write(data)
        self.stream.flush()
        return written

    def writelines(self, lines):
        """Write every item of *lines*, then flush once."""
        # Without this override the call would fall through __getattr__ to the
        # raw stream and skip the flush.
        self.stream.writelines(lines)
        self.stream.flush()

    def __getattr__(self, attr):
        # __getattr__ only runs when normal lookup fails. If 'stream' itself is
        # missing (instance created without __init__, e.g. while copying),
        # delegating would look up self.stream again and recurse without end.
        if attr == 'stream':
            raise AttributeError(attr)
        return getattr(self.stream, attr)


# Rebind the process-wide stdout to the flushing proxy, so everything written
# through sys.stdout from here on is flushed to the original stream right away.
sys.stdout = Unbuffered(sys.stdout)



# Directory prefixes, relative to the working directory. `path` is where the
# input CSV files are read from.
path = '../data/'
data_path = '../trainTestData/'
middle_path = '../processed/'

# Column names are supplied explicitly for both id files, so the first row of
# each CSV is read as data rather than as a header.
train = pd.read_csv(path + 'age_train.csv', names=['uid', 'label'])
test = pd.read_csv(path + 'age_test.csv', names=['uid'])

# Per-user attribute table.
col_1 = [
    'uid', 'gender', 'city', 'prodName',
    'ramCapacity', 'ramLeftRation',
    'romCapacity', 'romLeftRation',
    'color', 'fontSize', 'ct',
    'carrier', 'os',
]
user_basic_info = pd.read_csv(path + 'user_basic_info.csv', names=col_1)
# Every missing cell, in every column, is replaced by the sentinel -1.
user_basic_info = user_basic_info.fillna(-1)



# Columns of user_basic_info that are handled as categorical features.
user_basic_info_category = ['gender', 'city', 'prodName',
                            'ramCapacity', 'romCapacity',
                            'color', 'ct', 'carrier', 'os']

# Capacity x left-ratio interactions. They have to be built from the raw
# capacity values, i.e. BEFORE the loop below overwrites ramCapacity and
# romCapacity with arbitrary category codes; a code multiplied by a ratio
# carries no physical meaning.
# NOTE(review): assumes the raw capacity columns are numeric — confirm. Cells
# that were missing hold the -1 sentinel written by fillna(-1) at load time, so
# products involving them are not meaningful either.
user_basic_info['ram_interaction'] = user_basic_info['ramCapacity'] * user_basic_info['ramLeftRation']
user_basic_info['rom_interaction'] = user_basic_info['romCapacity'] * user_basic_info['romLeftRation']

# Replace every categorical value by an integer code. Codes run 0..n-1 in order
# of first appearance within the column.
for col in user_basic_info_category:
    user_basic_info[col] = pd.factorize(user_basic_info[col])[0]

# Attach the attribute columns to both splits; users without a row get NaN.
train = train.merge(user_basic_info, on='uid', how='left')
test = test.merge(user_basic_info, on='uid', how='left')
print('merged user_basic_info finished!')



# Per-user behaviour table: one boot counter plus seven "<X>FuncTimes" counters.
col_2 = [
    'uid', 'bootTimes',
    'AFuncTimes', 'BFuncTimes', 'CFuncTimes', 'DFuncTimes',
    'EFuncTimes', 'FFuncTimes', 'GFuncTimes',
]
user_behavior_info = pd.read_csv(path + 'user_behavior_info.csv', names=col_2)

# Left joins keep every train/test row even when a user has no behaviour row.
train = pd.merge(train, user_behavior_info, how='left', on='uid')
test = pd.merge(test, user_behavior_info, how='left', on='uid')

# The table is not needed after the joins; release it.
del user_behavior_info
gc.collect()
print('merge user_behavior_info finished!')


# Per-user list of app ids, stored in the file as one '#'-separated string.
col_3 = ['uid', 'appid']
user_app_actived = pd.read_csv(path + 'user_app_actived.csv', names=col_3)

# Switch the separator from '#' to ','.
user_app_actived['appid'] = user_app_actived['appid'].apply(
    lambda app_ids: app_ids.replace('#', ','))

# Carry the comma-separated string into both splits as the 'appid' column.
train = pd.merge(train, user_app_actived, how='left', on='uid')
test = pd.merge(test, user_app_actived, how='left', on='uid')

print('merged user_app_actived finished!')


# App id -> category lookup table.
app_info = pd.read_csv(path + 'app_info.csv', names=['appid', 'app_category'])

# Per user: the individual app ids, how many there are, and the ids as an
# ndarray so the rows can be concatenated below.
user_app_actived['appid_array'] = user_app_actived['appid'].str.split(',')
user_app_actived['app_count'] = user_app_actived['appid_array'].apply(len)
user_app_actived['appid_array'] = user_app_actived['appid_array'].apply(
    lambda ids: np.array(ids))

# Per user: the uid repeated once per app, so it lines up element-for-element
# with appid_array.
user_app_actived['uid_array'] = (
    user_app_actived['uid'].apply(lambda uid: [uid]) * user_app_actived['app_count']
).apply(lambda repeated: np.array(repeated))

# Flatten both array columns into one long (uid, appid) table.
flat_uids = np.concatenate(user_app_actived['uid_array'].values)
flat_appids = np.concatenate(user_app_actived['appid_array'].values)
print(flat_uids.shape)
print(flat_appids.shape)
user_app_actived_agg = pd.DataFrame(
    {'uid': flat_uids, 'appid': flat_appids}, columns=['uid', 'appid'])
del flat_uids, flat_appids

#### count ####
# Number of apps per user becomes a feature of both splits.
train = pd.merge(train, user_app_actived[['uid', 'app_count']], how='left', on='uid')
test = pd.merge(test, user_app_actived[['uid', 'app_count']], how='left', on='uid')
gc.collect()

# Attach the category of every (uid, appid) pair.
user_app_actived_agg = pd.merge(user_app_actived_agg, app_info, how='left', on='appid')
print('merged app_info finished!')



# App usage log: one row per (uid, appid, use_date) with duration and times.
col_4 = ['uid', 'appid', 'duration', 'times', 'use_date']
user_app_usage = pd.read_csv(path + 'user_app_usage.csv', names=col_4)
# Average duration per launch; the +1 keeps the denominator non-zero when
# times is 0.
user_app_usage['duration_per_time'] = (
    user_app_usage['duration'] / (user_app_usage['times'] + 1))

# Per-user summary statistics of each usage measure, merged one measure at a
# time so the resulting column order is sum, mean, min, max, std per measure.
stat_names = ['sum', 'mean', 'min', 'max', 'std']
for col in ['duration', 'times', 'duration_per_time']:
    new_dict = {stat: col + '_of_user_' + stat for stat in stat_names}

    per_user = user_app_usage.groupby(['uid'])[col].agg(stat_names)
    per_user = per_user.rename(columns=new_dict).reset_index()
    train = pd.merge(train, per_user, how='left', on='uid')
    test = pd.merge(test, per_user, how='left', on='uid')
    del per_user
    gc.collect()

# Keep only the 5000 apps with the most usage rows.
appid_list = user_app_usage['appid'].value_counts(sort=True).head(5000).index.tolist()
user_app_usage = user_app_usage[user_app_usage['appid'].isin(appid_list)]

# Wide table: one row per user, one column per kept app, holding the total
# duration (0 where the user never used the app).
wide = user_app_usage.groupby(['uid', 'appid'])['duration'].sum()
wide = wide.unstack('appid').fillna(0)
usage_sum_col = ['sum_duration_of_' + app for app in wide.columns]
wide.columns = usage_sum_col
wide = wide.reset_index()

train = pd.merge(train, wide, how='left', on='uid')
test = pd.merge(test, wide, how='left', on='uid')
del wide

gc.collect()

del user_app_usage
gc.collect()

print('merged user_app_usage finished!')


# Every column except the id and the target is a feature.
feats = [name for name in train.columns if name not in ('uid', 'label')]
print('total_feats number:{}'.format(len(feats)))

# Categorical features are exactly the encoded user_basic_info columns.
cat_feats = user_basic_info_category
print('cat_feats number:{}'.format(len(cat_feats)))

# Numeric features: whatever remains after removing the categorical columns
# and the raw 'appid' text column.
not_numeric = cat_feats + ['appid']
num_feats = [name for name in feats if name not in not_numeric]
print('num_feats number:{}'.format(len(num_feats)))


# Persist the labels exactly as read (before the shift applied further down).
target = train[['label']].copy()
target.to_csv(path + 'target.csv', index=False)

# Frame holding only the training uids.
stacking = train[['uid']].copy()

# Persist the test uids.
test_id = test[['uid']].copy()
test_id.to_csv(path + 'test_id.csv', index=False)

# Shift labels down by one and force an integer dtype.
train['label'] = train['label'].sub(1).astype(int)

# Model inputs: feature columns with remaining missing values set to 0.
train_y = train['label'].values
train_x = train[feats].fillna(0)
test_x = test[feats].fillna(0)

# Single-column frame of test uids, with the column named 'id'.
sub = test[['uid']].rename(columns={'uid': 'id'})


# Remember the row counts before the merged frames are dropped.
train_num, test_num = len(train), len(test)

# Zero-column sparse matrices with the right number of rows; encoded feature
# blocks are stacked onto these horizontally further down.
train_csr_1 = sparse.csr_matrix((train_x.shape[0], 0))
test_csr_1 = sparse.csr_matrix((test_x.shape[0], 0))

# train_x / test_x hold the features from here on; free the merged frames.
del train, test
gc.collect()

# One-hot encode every categorical column. The encoder is re-fitted per column
# on the full user_basic_info table and then applied to the train/test values.
oht = OneHotEncoder()
train_onehot, test_onehot = [], []
for col in cat_feats:
    if col not in user_basic_info.columns:
        continue
    oht.fit(user_basic_info[col].values.reshape(-1, 1))
    train_onehot.append(oht.transform(train_x[col].values.reshape(-1, 1)))
    test_onehot.append(oht.transform(test_x[col].values.reshape(-1, 1)))

# Append all encoded blocks, in column order, to the running sparse matrices
# as boolean CSR.
if train_onehot:
    train_csr_1 = sparse.hstack([train_csr_1] + train_onehot, format='csr', dtype='bool')
    test_csr_1 = sparse.hstack([test_csr_1] + test_onehot, format='csr', dtype='bool')
del train_onehot, test_onehot
print('one-hot finished !')

# Bag-of-apps features: one column per app id that occurs in at least 20 users'
# app lists, stored as a boolean "has this app" indicator.
cv = CountVectorizer(min_df=20)
# The vocabulary depends only on user_app_actived, so it is learned once,
# outside the loop.
cv.fit(user_app_actived['appid'])
for col in ['appid']:
    # A user with no row in user_app_actived gets NaN from the left merge and
    # then the integer 0 from fillna(0); CountVectorizer lower-cases its input
    # and would fail on a non-string. Coercing to str makes such a row the text
    # '0', which the default token pattern (two or more characters) ignores, so
    # it becomes an all-False row.
    train_docs = train_x[col].astype(str)
    test_docs = test_x[col].astype(str)
    train_csr_1 = sparse.hstack((train_csr_1, cv.transform(train_docs)), 'csr', 'bool')
    test_csr_1 = sparse.hstack((test_csr_1, cv.transform(test_docs)), 'csr', 'bool')
    del train_docs, test_docs
print('cv finished !')

# Final design matrices: numeric columns first, then the boolean one-hot and
# bag-of-apps block, all stored as float32 CSR.
train_csr = sparse.hstack(
    [sparse.csr_matrix(train_x[num_feats]), train_csr_1],
    format='csr',
).astype(np.float32)
test_csr = sparse.hstack(
    [sparse.csr_matrix(test_x[num_feats]), test_csr_1],
    format='csr',
).astype(np.float32)

gc.collect()

# Write both matrices in scipy's .npz sparse format under data_path.
sparse.save_npz(data_path + 'trainData15112.npz', train_csr)
sparse.save_npz(data_path + 'testData15112.npz', test_csr)