In [17]:
import threading
import warnings
import time
import gc

import lightgbm as lgb
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# Silence every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# Folder (relative to the notebook) that holds all input and cached CSV files.
dataPath = '../dataSet/'

In [18]:
def _read_table(file_name, columns):
    """Read one CSV from `dataPath`, labelling its columns with `columns`.

    `names=` is passed to `read_csv`, so every row of the file is read as data.
    """
    return pd.read_csv(dataPath + file_name, names=columns)


# Labelled training users and unlabelled test users.
train = _read_table('age_train.csv', ['uid', 'age'])
test = _read_table('age_test.csv', ['uid'])

# Per-user behaviour record: `times` plus the columns A..G.
behavior_info = _read_table(
    'user_behavior_info.csv',
    ['uid', 'times', 'A', 'B', 'C', 'D', 'E', 'F', 'G'],
)

# Per-user profile / device attributes.
basic_info = _read_table(
    'user_basic_info.csv',
    ['uid', 'gender', 'city', 'phone_type', 'ram', 'ram_left', 'rom', 'rom_left',
     'color', 'fontSize', 'ct', 'carrier', 'os'],
)

# Activated apps per user ('#'-separated ids in `appid`) and the app -> category table.
app_package = _read_table('user_app_actived.csv', ['uid', 'appid'])
app_info = _read_table('app_info.csv', ['appid', 'category'])

In [19]:
# For behavior's feature values, do pre-scaling
chapters = ['A', 'B', 'C', 'D', 'E', 'F', 'G']

# Scale all behavior features into (0, 1).
# MinMaxScaler only accepts 2-D input, so the columns are handed over as a
# DataFrame rather than one 1-D Series at a time (which raises a ValueError).
# Each column is still scaled independently against its own min and max.
scaler = MinMaxScaler(feature_range=(0, 1))
behavior_info[chapters] = scaler.fit_transform(behavior_info[chapters])

In [20]:
# Integer-encode the categorical profile columns of basic_info.
chapters = ['city', 'phone_type', 'color', 'ct', 'carrier', 'os']

for column in chapters:
    present = basic_info[column].notnull()

    # Build the encoded column on its own and assign it back in one step.
    # The previous `basic_info[column][mask] = ...` form is chained assignment:
    # pandas may apply it to a temporary copy, leaving basic_info unchanged.
    encoded = pd.Series(0, index=basic_info.index, dtype=int)

    # NOTE(review): missing values keep the code 0 (as before), which is also
    # the code LabelEncoder gives to its first class, so "missing" and that
    # class are indistinguishable downstream. Confirm whether that is intended.
    labeler = LabelEncoder()
    encoded.loc[present] = labeler.fit_transform(basic_info.loc[present, column])

    basic_info[column] = encoded.astype(int)

In [21]:
# Do tfidf-features stacking or not
do_stacking = False

# app_infos translation
def get_str(df):
    """Convert a '#'-separated id string into a space-separated one.

    Every token (including the last) is followed by a single space, so the
    result always ends with a trailing space.
    """
    return ''.join(token + ' ' for token in df.split("#"))

if do_stacking:
    # Series.apply has no axis argument, so the function is passed on its own.
    app_package['app_str'] = app_package['appid'].apply(get_str)

    # Bag-of-apps counts. The variable keeps its historical name `tfidf`,
    # but the vectorizer is a plain CountVectorizer.
    tfidf = CountVectorizer()
    train_str_app = pd.merge(train[['uid']], app_package[['uid', 'app_str']], on='uid', how='left')
    test_str_app = pd.merge(test[['uid']], app_package[['uid', 'app_str']], on='uid', how='left')

    # Only learn the vocabulary here. The sparse matrix returned by
    # fit_transform must not be written into the `app_str` column: that
    # column has to stay text.
    tfidf.fit(app_package['app_str'])

    # Users without an app record come out of the left merge as NaN, which
    # the vectorizer cannot tokenise; treat them as empty documents.
    train_app = tfidf.transform(train_str_app['app_str'].fillna('')).tocsr()
    test_app = tfidf.transform(test_str_app['app_str'].fillna('')).tocsr()

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

def _oof_stack_feature(make_model, fold_tag, seed, features_train, features_test, targets, n_folds):
    """Build one stacked feature column from out-of-fold class predictions.

    Parameters
    ----------
    make_model : callable
        Zero-argument factory returning a fresh, unfitted classifier.
    fold_tag : str
        Label used in the per-fold progress message.
    seed : int
        random_state for the shuffled StratifiedKFold split.
    features_train, features_test : row-indexable matrices
        Feature rows for the training and test users.
    targets : numpy.ndarray
        Class label of every training row.
    n_folds : int
        Number of folds.

    Returns
    -------
    numpy.ndarray
        1-D array of length n_train + n_test: the out-of-fold predicted class
        for every training row, followed by the fold-averaged predicted class
        for every test row.
    """
    oof_train = np.zeros(features_train.shape[0])
    avg_test = np.zeros(features_test.shape[0])

    kfold = StratifiedKFold(n_splits=n_folds, random_state=seed, shuffle=True)
    # Only the labels matter for a stratified split; X is a placeholder.
    for i, (tr, va) in enumerate(kfold.split(np.zeros(len(targets)), targets)):
        print('%s Stacking: %d/%d' % (fold_tag, (i + 1), n_folds))
        model = make_model()
        model.fit(features_train[tr], targets[tr])

        # predict() returns a 1-D array of class labels. The previous
        # `predict(...)[:, 1]` indexed it as 2-D and raised IndexError, and
        # LinearSVC has no `_predict` method at all. predict() is the one
        # call every model used here supports, so it is the stacked value.
        pred_va = model.predict(features_train[va])
        print('Mean_Squared_Error: ' + str(mean_squared_error(targets[va], pred_va)))

        oof_train[va] = pred_va
        avg_test += model.predict(features_test)

    avg_test /= n_folds
    return np.concatenate([oof_train, avg_test])


# According to app_features, do stacking
if do_stacking:
    train_feature = train_app
    test_feature = test_app

    n_folds = 10
    # Plain ndarray so fold indices are always positional.
    labels = (train['age'] - 1).values

    # Train uids first, then test uids: the same order as the stacked columns.
    all_id = pd.concat([train[['uid']], test[['uid']]], ignore_index=True)
    df_stack = pd.DataFrame({'uid': all_id['uid']})

    # (header label, per-fold label, column key, split seed, model factory)
    # NOTE(review): `loss='log'` and `multi_class=` are spelled as in the
    # original; newer scikit-learn releases rename or deprecate them. Confirm
    # against the installed version.
    stack_specs = [
        ('LR', 'LR', 'lr', 0,
         lambda: LogisticRegression(solver='sag', n_jobs=-1, multi_class='multinomial')),
        ('SGD', 'SGD', 'sgd', 1024,
         lambda: SGDClassifier(loss='log', n_jobs=-1)),
        ('PAC', 'PAC', 'pac', 1024,
         lambda: PassiveAggressiveClassifier(n_jobs=-1)),
        ('RidgeClassify', 'Ridge', 'ridge', 1024,
         lambda: RidgeClassifier(solver='sag')),
        ('BernoulliNB', 'BernoulliNB', 'bnb', 1024, BernoulliNB),
        ('MultinomialNB', 'MultinomialNB', 'mnb', 1024, MultinomialNB),
        ('LinearSVC', 'LinearSVC', 'lsvc', 1024, LinearSVC),
    ]

    for title, fold_tag, key, seed, make_model in stack_specs:
        print('%s Stacking' % title)
        df_stack['pack_tfidf_{}_classify_{}'.format(key, 'age')] = _oof_stack_feature(
            make_model, fold_tag, seed, train_feature, test_feature, labels, n_folds)

    # The file name (including its spelling) is what the merge cell reads back.
    df_stack.to_csv(dataPath+'tfidf_classfiy.csv', index=None, encoding='utf8')
    print('Tfidf Features Stacking is Done~')

In [23]:
# Calculate user's activated apps number
do_get_length = False

def get_app_len(df):
    """Return how many '#'-separated entries the string `df` contains.

    An empty string counts as one (empty) entry.
    """
    separators = df.count('#')
    return separators + 1

if do_get_length:
    # Series.apply has no axis argument; the stray positional `1` that used
    # to follow the function was being bound to a different parameter.
    app_number_feat = pd.DataFrame({
        'uid': app_package['uid'],
        'app_number': app_package['appid'].apply(get_app_len),
    })

    # Cache the feature so later runs can take the cheap `else` branch.
    app_number_feat.to_csv(dataPath + 'app_activated_sum.csv', index=False)
else:
    app_number_feat = pd.read_csv(dataPath+'app_activated_sum.csv')

In [24]:
from sklearn.decomposition import LatentDirichletAllocation

# Recompute the LDA topic features (True) or load the cached CSV (False).
do_lda = False

if do_lda:
    # One "document" per app_package row: the user's app ids as space-separated tokens.
    apps = app_package['appid'].apply(get_str)
    vectorizer = CountVectorizer()
    cntTf = vectorizer.fit_transform(apps)

    # `n_topics` was renamed to `n_components` and later removed from
    # scikit-learn. random_state makes the topic features reproducible.
    lda = LatentDirichletAllocation(n_components=10, random_state=0)
    docres = lda.fit_transform(cntTf)
    lda_feat = pd.DataFrame(docres)
    # read_csv returns string column labels; use the same labels here so the
    # freshly computed frame and the cached one are interchangeable.
    lda_feat.columns = lda_feat.columns.astype(str)

    # No uid column is stored: rows are in app_package order.
    lda_feat.to_csv(dataPath + 'lda_feat.csv', index=False)
else:
    lda_feat = pd.read_csv(dataPath+'lda_feat.csv')

In [32]:
tfidf_feat = pd.read_csv(dataPath+'tfidf_classfiy.csv')

# Merge applist lda
# lda_feat carries no uid, so it is attached to app_package row by row.
# Fail loudly if the two tables do not line up.
assert len(lda_feat) == len(app_package), 'lda_feat rows must match app_package rows'
packages = pd.concat(
    [app_package.reset_index(drop=True), lda_feat.reset_index(drop=True)],
    axis=1,
)
# Drop the raw text columns ('app_str' only exists when stacking was run);
# only uid and the numeric topic columns should reach the model.
packages = packages.drop(columns=['appid', 'app_str'], errors='ignore')

# Combine all trainData features
trainData = pd.merge(train, basic_info, on='uid', how='left')
trainData = pd.merge(trainData, behavior_info, on='uid', how='left')
trainData = pd.merge(trainData, tfidf_feat, on='uid', how='left')
trainData = pd.merge(trainData, packages, on='uid', how='left')
#trainData = pd.merge(trainData, app_number_feat, on='uid', how='left')

# Shift the labels so they start at 0 instead of 1.
trainLabel = trainData['age'] - 1

# Replace NaN values with 0.
# fillna writes to the frame itself; the previous per-column chained
# assignment (`trainData[col][mask] = 0`) may only modify a temporary copy.
trainData = trainData.fillna(0)

feature_signs = [x for x in trainData.columns if x not in ['uid', 'age']]

In [None]:
# Establish lightGBM to check K-Fold score
params = {
    'objective': 'multiclass',
    'boosting_type': 'gbdt',
    'learning_rate': 0.08,
    'num_class': 6,
    'nthread': 8
}

kfold = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)
fold_accuracies = []
for index, (trainIndex, testIndex) in enumerate(kfold.split(trainData, trainLabel)):
    # The split yields positional row numbers, so select rows by position.
    tr_x = trainData[feature_signs].iloc[trainIndex]
    tr_y = trainLabel.iloc[trainIndex]
    te_x = trainData[feature_signs].iloc[testIndex]
    te_y = trainLabel.iloc[testIndex]

    trainDataSet = lgb.Dataset(tr_x, label=tr_y)
    testDataSet = lgb.Dataset(te_x, label=te_y)

    # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword
    # arguments of older LightGBM releases; newer releases expect callbacks.
    # Confirm against the installed version.
    model = lgb.train(params, trainDataSet, num_boost_round=5000,
                      valid_sets=testDataSet, verbose_eval=100, early_stopping_rounds=200)

    # One row of class probabilities per sample.
    prediction = model.predict(te_x, num_iteration=model.best_iteration)

    # Most probable class, shifted back to the original 1-based labels.
    pred = np.argmax(prediction, axis=1) + 1

    accuracy = accuracy_score(te_y+1, pred)
    loss = mean_squared_error(te_y+1, pred)
    fold_accuracies.append(accuracy)

    print('KFold Iteration: %d' % index)
    print('Accuracy: %.5f' % accuracy)
    print('Loss: %.5f' % loss)

print('Mean Accuracy: %.5f' % np.mean(fold_accuracies))

Training until validation scores don't improve for 200 rounds.


In [None]:
# Fit the final lightGBM model on all training rows and label the test users
params = {
    'objective': 'multiclass',
    'boosting_type': 'gbdt',
    'learning_rate': 0.08,
    'num_class': 6,
    'nthread': 8
}

# Combine all testData features, mirroring how trainData was assembled.
testData = pd.merge(test, basic_info, on='uid', how='left')
testData = pd.merge(testData, behavior_info, on='uid', how='left')
testData = pd.merge(testData, tfidf_feat, on='uid', how='left')
# Merge `packages` (uid + LDA topic columns), not the raw `app_package`:
# the raw table lacks the LDA columns listed in feature_signs, so selecting
# them below would raise KeyError, and it carries the unparsed `appid` text.
testData = pd.merge(testData, packages, on='uid', how='left')
# Not part of feature_signs while the matching training merge is commented
# out; kept so the column is available if that merge is re-enabled.
testData = pd.merge(testData, app_number_feat, on='uid', how='left')

# Replace NaN values with 0 (fillna writes to the frame itself; the previous
# chained assignment could modify only a temporary copy).
testData = testData.fillna(0)

tr_x = trainData[feature_signs]
tr_y = trainLabel
te_x = testData[feature_signs]

trainDataSet = lgb.Dataset(tr_x, label=tr_y)
# NOTE(review): the "validation" set is the training data itself, so early
# stopping is measured on rows the model has already seen. Confirm whether a
# held-out split or a fixed round count from the K-fold run should be used.
testDataSet = lgb.Dataset(tr_x, label=tr_y)

# NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword arguments
# of older LightGBM releases. Confirm against the installed version.
model = lgb.train(params, trainDataSet, num_boost_round=5000,
                      valid_sets=testDataSet, verbose_eval=100, early_stopping_rounds=200)

# One row of class probabilities per test user.
prediction = model.predict(te_x, num_iteration=model.best_iteration)

# Most probable class, shifted back to the original 1-based labels.
pred = np.argmax(prediction, axis=1) + 1

test['age'] = pred

In [None]:
# Write the submission file: the columns of `test` (uid and the predicted age)
# are written under the header names 'id' and 'label', without the index.
test.to_csv('../result/submission.csv', header=['id', 'label'], index=False)